mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-18 11:28:51 +08:00
3a13d80d24
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
1404 lines
50 KiB
C++
1404 lines
50 KiB
C++
/******************************************************************************
|
|
** Filename: stopper.c
|
|
** Purpose: Stopping criteria for word classifier.
|
|
** Author: Dan Johnson
|
|
** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
|
|
**
|
|
** (c) Copyright Hewlett-Packard Company, 1988.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
******************************************************************************/
|
|
/**----------------------------------------------------------------------------
|
|
Include Files and Type Defines
|
|
----------------------------------------------------------------------------**/
|
|
#include "stopper.h"
|
|
#include "emalloc.h"
|
|
#include "matchdefs.h"
|
|
#include "callcpp.h"
|
|
#include "permute.h"
|
|
#include "context.h"
|
|
#include "danerror.h"
|
|
#include "const.h"
|
|
#include "freelist.h"
|
|
#include "efio.h"
|
|
#include "scanutils.h"
|
|
#include "unichar.h"
|
|
#include "varable.h"
|
|
#include "dict.h"
|
|
#include "image.h"
|
|
#include "ccutil.h"
|
|
#include "ratngs.h"
|
|
#include "ambigs.h"
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <math.h>
|
|
#ifdef __UNIX__
|
|
#include <assert.h>
|
|
#endif
|
|
|
|
/* these are kludges - add appropriate .h file later */
|
|
/* from adaptmatch.cpp */
|
|
/* Stand-in definition: the surrounding note says this belongs to
   adaptmatch.cpp and should eventually come from a proper header.
   NOTE(review): no use of this macro is visible in this part of the file. */
#define MAX_WERD_SIZE 100
|
|
|
|
typedef struct
|
|
{
|
|
VIABLE_CHOICE Choice;
|
|
float ChunkCertainty[MAX_NUM_CHUNKS];
|
|
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
|
|
} EXPANDED_CHOICE;
|
|
|
|
/**----------------------------------------------------------------------------
|
|
Macros
|
|
----------------------------------------------------------------------------**/
|
|
/* Field accessors for the first (head) VIABLE_CHOICE stored in a LIST. */
#define BestCertainty(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
#define BestRating(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Rating)
#define BestFactor(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)

/* Threshold derived from two adjust factors: their difference (F2 - F1)
   scaled by stopper_ambiguity_threshold_gain, less
   stopper_ambiguity_threshold_offset. */
#define AmbigThreshold(F1,F2) \
  (((F2) - (F1)) * stopper_ambiguity_threshold_gain - \
   stopper_ambiguity_threshold_offset)
|
|
|
|
/*---------------------------------------------------------------------------
|
|
Private Function Prototypes
|
|
----------------------------------------------------------------------------*/
|
|
void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
|
|
|
|
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
|
|
void *arg2); //VIABLE_CHOICE Choice2);
|
|
|
|
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
|
|
|
|
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
|
|
void *item2); //EXPANDED_CHOICE *BestChoice);
|
|
|
|
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
|
|
const WERD_CHOICE &BestChoice);
|
|
|
|
/**----------------------------------------------------------------------
|
|
V a r i a b l e s
|
|
----------------------------------------------------------------------**/
|
|
/* Scaling between certainties and match ratings. */
double_VAR(certainty_scale, 20.0, "Certainty scaling factor");

/* Certainty thresholds used by the accept/reject decisions. */
double_VAR(stopper_nondict_certainty_base, -2.50,
           "Certainty threshold for non-dict words");
double_VAR(stopper_phase2_certainty_rejection_offset, 1.0,
           "Reject certainty offset");
INT_VAR(stopper_smallword_size, 2,
        "Size of dict word to be treated as non-dict word");
double_VAR(stopper_certainty_per_char, -0.50,
           "Certainty to add for each dict char above small word size.");
/* NOTE(review): the help text below carries the typo "certaintly"; it is a
   runtime string and is left untouched here. */
double_VAR(stopper_allowable_character_badness, 3.0,
           "Max certaintly variation allowed in a word (in sigma)");

/* Debug verbosity for everything in this file. */
INT_VAR(stopper_debug_level, 0, "Stopper debug level");

/* Parameters of the AmbigThreshold() macro. */
double_VAR(stopper_ambiguity_threshold_gain, 8.0,
           "Gain factor for ambiguity threshold");
double_VAR(stopper_ambiguity_threshold_offset, 1.5,
           "Certainty offset for ambiguity threshold");

/* Switches controlling what gets accepted and what gets logged. */
BOOL_VAR(stopper_no_acceptable_choices, false,
         "Make AcceptableChoice() always return false. Useful"
         " when there is a need to explore all segmentations");
BOOL_VAR(save_raw_choices, false, "Save all explored raw choices");
INT_VAR(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");

/* Selects a single word for which debug output is forced. */
STRING_VAR(word_to_debug, "", "Word for which stopper debug information"
           " should be printed to stdout");
STRING_VAR(word_to_debug_lengths, "", "Lengths of unichars in word_to_debug");
|
|
|
|
/**----------------------------------------------------------------------------
|
|
Public Code
|
|
----------------------------------------------------------------------------**/
|
|
/*---------------------------------------------------------------------------*/
|
|
namespace tesseract {
|
|
int Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
                           WERD_CHOICE *BestChoice,
                           const WERD_CHOICE &RawChoice,
                           DANGERR *fixpt,
                           ACCEPTABLE_CHOICE_CALLER caller,
                           bool *modified_blobs) {
/*
 **	Parameters:
 **		Choices		choices for current segmentation
 **		BestChoice	best choice for current segmentation
 **		RawChoice	best raw choice for current segmentation
 **				(not read by this routine)
 **		fixpt		if non-NULL, its index is reset to -1 and it
 **				is handed on to NoDangerousAmbig()
 **		caller		who is asking; CHOPPER_CALLER triggers the
 **				fragment check
 **		modified_blobs	passed through to NoDangerousAmbig()
 **	Variables Used:
 **		stopper_nondict_certainty_base	certainty for a non-dict word
 **		stopper_smallword_size	size of word to be treated as non-word
 **		stopper_certainty_per_char	certainty to add for each dict char
 **		stopper_no_acceptable_choices	forces a FALSE result
 **	Operation: Return TRUE if the results from this segmentation are
 **		good enough to stop.  Otherwise return FALSE.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 **	History: Mon Apr 29 14:57:32 1991, DSJ, Created.
 */
  float CertaintyThreshold = stopper_nondict_certainty_base;
  int WordSize;

  if (stopper_no_acceptable_choices) return false;

  if (fixpt != NULL)
    fixpt->index = -1;
  if (BestChoice->length() == 0)
    return (FALSE);
  if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
    if (stopper_debug_level >= 1) {
      /* Terminate the line, as AcceptableResult() does for its equivalent
         message, so the next debug line does not run into this one. */
      cprintf("AcceptableChoice(): a choice with fragments beats BestChoice\n");
    }
    return false;
  }

  /* Must run before the dictionary checks below: it receives BestChoice
     through a non-const pointer. */
  bool no_dang_ambigs =
    NoDangerousAmbig(BestChoice, fixpt, true, Choices, modified_blobs);

  if (stopper_debug_level >= 1)
    tprintf("\nStopper: %s (word=%c, case=%c)\n",
            BestChoice->debug_string(getUnicharset()).string(),
            (valid_word(*BestChoice) ? 'y' : 'n'),
            (Context::case_ok(*BestChoice, getUnicharset()) ? 'y' : 'n'));

  /* For a valid, correctly cased word the threshold is shifted by
     stopper_certainty_per_char for every character beyond
     stopper_smallword_size. */
  if (valid_word(*BestChoice) &&
      Context::case_ok(*BestChoice, getUnicharset())) {
    WordSize = LengthOfShortestAlphaRun(*BestChoice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
            BestChoice->certainty(), CertaintyThreshold);

  if (no_dang_ambigs &&
      BestChoice->certainty() > CertaintyThreshold &&
      UniformCertainties(*Choices, *BestChoice)) {
    return (TRUE);
  } else {
    return (FALSE);
  }
}                                /* AcceptableChoice */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::AcceptableResult(const WERD_CHOICE &BestChoice,
|
|
const WERD_CHOICE &RawChoice) {
|
|
/*
|
|
** Parameters:
|
|
** BestChoice best choice for current word
|
|
** RawChoice best raw choice for current word
|
|
** Variables Used:
|
|
** stopper_nondict_certainty_base certainty for a non-dict word
|
|
** stopper_smallword_size size of word to be treated as non-word
|
|
** stopper_certainty_per_char certainty to add for each dict char
|
|
** best_choices_ list of all good choices found
|
|
** reject_offset_ allowed offset before a word is rejected
|
|
** Operation: Return FALSE if the best choice for the current word
|
|
** is questionable and should be tried again on the second
|
|
** pass or should be flagged to the user.
|
|
** Return: TRUE or FALSE.
|
|
** Exceptions: none
|
|
** History: Thu May 9 14:05:05 1991, DSJ, Created.
|
|
*/
|
|
float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
|
|
int WordSize;
|
|
|
|
if (stopper_debug_level >= 1) {
|
|
tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
|
|
BestChoice.debug_string(getUnicharset()).string(),
|
|
(valid_word(BestChoice) ? 'y' : 'n'),
|
|
(Context::case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
|
|
((rest (best_choices_) != NIL) ? 'n' : 'y'));
|
|
}
|
|
|
|
if (BestChoice.length() == 0 || CurrentWordAmbig())
|
|
return (FALSE);
|
|
if (BestChoice.fragment_mark()) {
|
|
if (stopper_debug_level >= 1) {
|
|
cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
|
|
}
|
|
return false;
|
|
}
|
|
if (valid_word(BestChoice) &&
|
|
Context::case_ok(BestChoice, getUnicharset())) {
|
|
WordSize = LengthOfShortestAlphaRun(BestChoice);
|
|
WordSize -= stopper_smallword_size;
|
|
if (WordSize < 0)
|
|
WordSize = 0;
|
|
CertaintyThreshold += WordSize * stopper_certainty_per_char;
|
|
}
|
|
|
|
if (stopper_debug_level >= 1)
|
|
cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
|
|
BestChoice.certainty(), CertaintyThreshold);
|
|
|
|
if (BestChoice.certainty() > CertaintyThreshold &&
|
|
!stopper_no_acceptable_choices) {
|
|
if (stopper_debug_level >= 1)
|
|
cprintf("ACCEPTED\n");
|
|
return (TRUE);
|
|
}
|
|
else {
|
|
if (stopper_debug_level >= 1)
|
|
cprintf("REJECTED\n");
|
|
return (FALSE);
|
|
}
|
|
} /* AcceptableResult */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) {
/*
 **	Parameters:
 **		Threshold	adjust factor every alternative must exceed
 **	Variables Used:
 **		best_choices_	choices for current word; entries after the
 **				first are the alternatives
 **	Operation: Scan the alternatives and report whether each one has an
 **		adjust factor strictly greater than Threshold.  An empty set
 **		of alternatives also yields TRUE.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 **	History: Mon Jun  3 09:36:31 1991, DSJ, Created.
 */
  LIST Rivals = rest (best_choices_);

  iterate(Rivals) {
    VIABLE_CHOICE Rival = (VIABLE_CHOICE) first_node (Rivals);
    if (Rival->AdjustFactor <= Threshold)
      return (FALSE);             /* one alternative is good enough */
  }
  return (TRUE);
}                                /* AlternativeChoicesWorseThan */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
|
|
/*
|
|
** Parameters:
|
|
** Word word that will be compared to the best choice
|
|
** Variables Used:
|
|
** best_choices_ set of best choices for current word
|
|
** Operation: Returns TRUE if Word is the same as the current best
|
|
** choice, FALSE otherwise.
|
|
** Return: TRUE or FALSE
|
|
** Exceptions: none
|
|
** History: Thu May 30 14:44:22 1991, DSJ, Created.
|
|
*/
|
|
return (best_choices_ != NIL &&
|
|
StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
|
|
} /* CurrentBestChoiceIs */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
/*
 **	Parameters: none
 **	Variables Used:
 **		best_choices_	set of best choices for current word
 **	Operation: Look up the adjust factor stored with the head of
 **		best_choices_.
 **	Return: That adjust factor, or MAX_FLOAT32 when best_choices_ is
 **		empty.
 **	Exceptions: none
 **	History: Thu May 30 14:48:24 1991, DSJ, Created.
 */
  if (best_choices_ != NIL) {
    return (((VIABLE_CHOICE) first_node (best_choices_))->AdjustFactor);
  }
  return (MAX_FLOAT32);
}                                /* CurrentBestChoiceAdjustFactor */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::CurrentWordAmbig() {
|
|
/*
|
|
** Parameters: none
|
|
** Variables Used:
|
|
** best_choices_ set of best choices for current word
|
|
** Operation: This routine returns TRUE if there are multiple good
|
|
** choices for the current word and FALSE otherwise.
|
|
** Return: TRUE or FALSE
|
|
** Exceptions: none
|
|
** History: Wed May 22 15:38:38 1991, DSJ, Created.
|
|
*/
|
|
return (rest (best_choices_) != NIL);
|
|
|
|
} /* CurrentWordAmbig */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::DebugWordChoices() {
|
|
/*
|
|
** Parameters: none
|
|
** Variables Used:
|
|
** best_raw_choice_
|
|
** best_choices_
|
|
** Operation: Print the current choices for this word to stdout.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Wed May 15 13:52:08 1991, DSJ, Created.
|
|
*/
|
|
LIST Choices;
|
|
int i;
|
|
char LabelString[80];
|
|
VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
|
|
bool force_debug =
|
|
fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
|
|
|
|
if (stopper_debug_level >= 1 || force_debug ||
|
|
(((STRING)word_to_debug).length() > 0 && best_choices_ &&
|
|
StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
|
|
(VIABLE_CHOICE)first_node(best_choices_)))) {
|
|
if (best_raw_choice_)
|
|
PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_);
|
|
|
|
i = 1;
|
|
Choices = best_choices_;
|
|
if (Choices)
|
|
cprintf("\nBest Cooked Choices:\n");
|
|
iterate(Choices) {
|
|
sprintf(LabelString, "Cooked Choice #%d: ", i);
|
|
PrintViableChoice(stderr, LabelString,
|
|
(VIABLE_CHOICE)first_node(Choices));
|
|
i++;
|
|
}
|
|
}
|
|
} /* DebugWordChoices */
|
|
|
|
// Print all the choices in raw_choices_ list for non 1-1 ambiguities.
|
|
void Dict::PrintAmbigAlternatives(FILE *file, const char *label,
|
|
int label_num_unichars) {
|
|
iterate(raw_choices_) {
|
|
VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
|
|
if (Choice->Length > 0 &&
|
|
(label_num_unichars > 1 || Choice->Length > 1)) {
|
|
for (int i = 0; i < Choice->Length; i++) {
|
|
fprintf(file, "%s",
|
|
getUnicharset().id_to_unichar(Choice->Blob[i].Class));
|
|
}
|
|
fflush(file);
|
|
fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
|
|
Choice->Rating, Choice->Certainty);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::FilterWordChoices() {
|
|
/*
|
|
** Parameters: none
|
|
** Variables Used:
|
|
** best_choices_ set of choices for current word
|
|
** Operation: This routine removes from best_choices_ all choices which
|
|
** are not within a reasonable range of the best choice.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Wed May 15 13:08:24 1991, DSJ, Created.
|
|
*/
|
|
EXPANDED_CHOICE BestChoice;
|
|
|
|
if (best_choices_ == NIL || second_node (best_choices_) == NIL)
|
|
return;
|
|
|
|
/* compute certainties and class for each chunk in best choice */
|
|
ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
|
|
|
|
set_rest (best_choices_, delete_d (rest (best_choices_),
|
|
&BestChoice, FreeBadChoice));
|
|
|
|
} /* FilterWordChoices */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::FindClassifierErrors(FLOAT32 MinRating,
                                FLOAT32 MaxRating,
                                FLOAT32 RatingMargin,
                                FLOAT32 Thresholds[]) {
/*
 **	Parameters:
 **		MinRating	lower clamp for every threshold
 **		MaxRating	upper clamp for every threshold
 **		RatingMargin	fraction taken off the computed rating
 **		Thresholds[]	output, one entry per character of the
 **				best choice
 **	Operation: Walk the best choice character by character and compare
 **		its class with the class the best raw choice assigned to
 **		each of the character's chunks.  When all chunks agree the
 **		threshold is MaxRating.  Otherwise the certainties of the
 **		disagreeing chunks are averaged, divided by -certainty_scale,
 **		and reduced by RatingMargin.  Every threshold is finally
 **		clamped to [MinRating, MaxRating].
 **	Return: none (results are placed in Thresholds)
 **	Exceptions: none
 **	History: Fri May 31 16:02:57 1991, DSJ, Created.
 */
  assert (best_choices_ != NIL);
  assert (best_raw_choice_ != NULL);

  EXPANDED_CHOICE RawByChunk;
  ExpandChoice(best_raw_choice_, &RawByChunk);
  VIABLE_CHOICE Best = (VIABLE_CHOICE) first_node (best_choices_);

  int Chunk = 0;                  /* running chunk index across characters */
  for (int Char = 0; Char < Best->Length; Char++) {
    FLOAT32 ErrorCertainty = 0.0;
    int Mismatches = 0;

    for (int k = 0; k < Best->Blob[Char].NumChunks; k++, Chunk++) {
      if (Best->Blob[Char].Class != RawByChunk.ChunkClass[Chunk]) {
        ErrorCertainty += RawByChunk.ChunkCertainty[Chunk];
        Mismatches++;
      }
    }

    FLOAT32 Limit;
    if (Mismatches > 0) {
      ErrorCertainty /= Mismatches;
      Limit = (ErrorCertainty / -certainty_scale) * (1.0 - RatingMargin);
    } else {
      Limit = MaxRating;
    }

    if (Limit > MaxRating)
      Limit = MaxRating;
    if (Limit < MinRating)
      Limit = MinRating;
    Thresholds[Char] = Limit;
  }
}                                /* FindClassifierErrors */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::InitChoiceAccum() {
|
|
/*
|
|
** Parameters: none
|
|
** Operation: This routine initializes the data structures used to
|
|
** keep track the good word choices found for a word.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri May 17 07:59:00 1991, DSJ, Created.
|
|
*/
|
|
BLOB_WIDTH *BlobWidth, *End;
|
|
|
|
if (best_raw_choice_)
|
|
memfree(best_raw_choice_);
|
|
best_raw_choice_ = NULL;
|
|
|
|
if (best_choices_)
|
|
destroy_nodes(best_choices_, memfree);
|
|
best_choices_ = NIL;
|
|
|
|
if (raw_choices_)
|
|
destroy_nodes(raw_choices_, memfree);
|
|
raw_choices_ = NIL;
|
|
|
|
EnableChoiceAccum();
|
|
|
|
for (BlobWidth = current_segmentation_,
|
|
End = current_segmentation_ + MAX_NUM_CHUNKS;
|
|
BlobWidth < End; *BlobWidth++ = 1);
|
|
|
|
} /* InitChoiceAccum */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) {
/*
 **	Parameters:
 **		BlobWidth[]	zero-terminated widths, one per blob
 **	Variables Used:
 **		current_segmentation_	receives the copy
 **	Operation: Copy the entries of BlobWidth up to its terminating zero
 **		into current_segmentation_ and write a zero after them.
 **	Return: none
 **	Exceptions: none
 **	History: Mon May 20 11:52:26 1991, DSJ, Created.
 */
  /* NOTE(review): the copy is bounded only by the zero terminator of
     BlobWidth - confirm callers never pass more entries than
     current_segmentation_ can hold. */
  int Pos = 0;

  while (BlobWidth[Pos] != 0) {
    current_segmentation_[Pos] = BlobWidth[Pos];
    ++Pos;
  }
  current_segmentation_[Pos] = 0;
}                                /* LogNewSegmentation */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::LogNewSplit(int Blob) {
|
|
/*
|
|
** Parameters:
|
|
** Blob index of blob that was split
|
|
** Variables Used:
|
|
** best_raw_choice_ current best raw choice
|
|
** best_choices_ list of best choices found so far
|
|
** Operation: This routine adds 1 chunk to the specified blob for each
|
|
** choice in best_choices_ and for the best_raw_choice_.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Mon May 20 11:38:56 1991, DSJ, Created.
|
|
*/
|
|
LIST Choices;
|
|
|
|
if (best_raw_choice_) {
|
|
AddNewChunk(best_raw_choice_, Blob);
|
|
}
|
|
|
|
Choices = best_choices_;
|
|
iterate(Choices) {
|
|
AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
|
|
}
|
|
Choices = raw_choices_;
|
|
iterate(Choices) {
|
|
AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
|
|
}
|
|
} /* LogNewSplit */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::LogNewChoice(const WERD_CHOICE &WordChoice,
|
|
FLOAT32 AdjustFactor,
|
|
const float Certainties[],
|
|
bool raw_choice) {
|
|
/*
|
|
** Parameters:
|
|
** Choice new choice for current word
|
|
** AdjustFactor adjustment factor which was applied to choice
|
|
** Certainties certainties for each char in new choice
|
|
** ChoicesList list with choices seen so far
|
|
** Variables Used:
|
|
** best_raw_choice_ best raw choice so far for current word
|
|
** Operation: This routine adds Choice to ChoicesList if the
|
|
** adjusted certainty for Choice is within a reasonable range
|
|
** of the best choice in ChoicesList. The ChoicesList
|
|
** list is kept in sorted order by rating. Duplicates are
|
|
** removed.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Wed May 15 09:57:19 1991, DSJ, Created.
|
|
*/
|
|
VIABLE_CHOICE NewChoice;
|
|
LIST ChoicesList;
|
|
LIST Choices;
|
|
FLOAT32 Threshold;
|
|
|
|
if (!keep_word_choices_)
|
|
return;
|
|
|
|
if (raw_choice) {
|
|
if (!best_raw_choice_)
|
|
best_raw_choice_ = NewViableChoice(WordChoice, AdjustFactor, Certainties);
|
|
else if (WordChoice.rating() < best_raw_choice_->Rating) {
|
|
if (ChoiceSameAs(WordChoice, best_raw_choice_))
|
|
FillViableChoice(WordChoice, AdjustFactor, Certainties, true,
|
|
best_raw_choice_);
|
|
else {
|
|
memfree(best_raw_choice_);
|
|
best_raw_choice_ =
|
|
NewViableChoice(WordChoice, AdjustFactor, Certainties);
|
|
}
|
|
}
|
|
if (!save_raw_choices) return;
|
|
ChoicesList = raw_choices_;
|
|
} else {
|
|
ChoicesList = best_choices_;
|
|
}
|
|
|
|
/* throw out obviously bad choices to save some work */
|
|
if (ChoicesList != NIL) {
|
|
Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
|
|
if (Threshold > -stopper_ambiguity_threshold_offset)
|
|
Threshold = -stopper_ambiguity_threshold_offset;
|
|
if (WordChoice.certainty() - BestCertainty (ChoicesList) < Threshold)
|
|
return;
|
|
}
|
|
|
|
/* see if a choice with the same text string has already been found */
|
|
NewChoice = NULL;
|
|
Choices = ChoicesList;
|
|
|
|
iterate(Choices) {
|
|
if (ChoiceSameAs (WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
|
|
if (WordChoice.rating() < BestRating (Choices)) {
|
|
NewChoice = (VIABLE_CHOICE) first_node (Choices);
|
|
} else {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (NewChoice) {
|
|
FillViableChoice(WordChoice, AdjustFactor, Certainties, true, NewChoice);
|
|
ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
|
|
}
|
|
else {
|
|
NewChoice = NewViableChoice (WordChoice, AdjustFactor, Certainties);
|
|
}
|
|
|
|
ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
|
|
if (stopper_debug_level >= 2)
|
|
raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) :
|
|
PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
|
|
if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
|
|
Choices =
|
|
(LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
|
|
destroy_nodes (rest (Choices), Efree);
|
|
set_rest(Choices, NIL);
|
|
}
|
|
|
|
// Update raw_choices_/best_choices_ pointer.
|
|
if (raw_choice) {
|
|
raw_choices_ = ChoicesList;
|
|
} else {
|
|
best_choices_ = ChoicesList;
|
|
}
|
|
} /* LogNewChoice */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                           DANGERR *fix_pt,
                           bool fix_replaceable,
                           BLOB_CHOICE_LIST_VECTOR *blob_choices,
                           bool *modified_blobs) {
  // Runs two passes over best_choice. Pass 0 applies every matching entry of
  // the replace-ambigs table through ReplaceAmbig(). Pass 1 collects every
  // matching entry of the dangerous-ambigs table and then asks
  // dawg_permute_and_select() whether the collected alternatives form a word.
  // Returns true when no such alternative word was found.
  // fix_pt and fix_replaceable are not read in this function.
  const bool verbose = (stopper_debug_level > 2);

  if (verbose) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
            best_choice->debug_string(getUnicharset()).string());
  }

  // One BLOB_CHOICE_LIST per position of best_choice, filled in pass 1 with
  // the original unichar plus a fake choice for every ambiguity found.
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  bool choice_rewritten = false;
  bool ambigs_found = false;

  // Note that replacements made in pass 0 can change the length of
  // best_choice while the loop over its positions is running.
  for (int pass = 0; pass < 2; ++pass) {
    const bool replace = (pass == 0);
    const UnicharAmbigsVector &table = replace ?
      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();

    if (!replace) {
      // Seed ambig_blob_choices with a single-entry list per position holding
      // the corresponding unichar id of best_choice. A word built only from
      // these original letters will have a rating of 0.0.
      for (int pos = 0; pos < best_choice->length(); ++pos) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(pos),
                                          0.0, 0.0, 0, -1));
        ambig_blob_choices.push_back(lst);
      }
    }

    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    for (int i = 0; i < best_choice->length(); ++i) {
      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
      if (verbose) {
        tprintf("Looking for %s ngrams starting with %s:\n",
                replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).string());
      }

      // The candidate ngram starts out as the single unichar at position i.
      int wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;

      if (curr_unichar_id == INVALID_UNICHAR_ID ||
          curr_unichar_id >= table.size() ||
          table[curr_unichar_id] == NULL) {
        continue;  // there is no ambig spec for this unichar id
      }

      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
        const int compare = UnicharIdArrayUtils::compare(
            wrong_ngram, ambig_spec->wrong_ngram);
        if (verbose) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }

        if (compare == 0) {
          // The candidate equals the spec's wrong ngram.
          if (replace) {
            if (verbose) {
              tprintf("replace ambiguity with: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
                         ambig_spec->correct_ngram_id,
                         best_choice, blob_choices, modified_blobs);
            choice_rewritten = true;
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // We found dang ambig - update ambig_blob_choices.
            if (verbose) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            // Add a fake blob choice for each fragment of the ambiguity.
            // Their rating is negative (not possible for real blob choices),
            // so that dawg_permute_and_select() considers any word not
            // consisting of only the original letters a better choice and
            // stops searching for alternatives once such a choice is found.
            for (int offset = 0; offset <= wrong_ngram_index; ++offset) {
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + offset]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[offset], -1.0, 0.0, 0, -1));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          // Try to lengthen the candidate with the next unichar of
          // best_choice and compare against the same spec again.
          const int next_index = wrong_ngram_index + 1 + i;
          const bool can_extend =
            wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
            next_index < best_choice->length();
          if (!can_extend) {
            break;  // no more matching ambigs in this AMBIG_SPEC_LIST
          }
          wrong_ngram[++wrong_ngram_index] =
            best_choice->unichar_id(next_index);
        } else {
          spec_it.forward();
        }
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs

  if (choice_rewritten) best_choice->populate_unichars(getUnicharset());

  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
    if (verbose) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (int pos = 0; pos < ambig_blob_choices.length(); ++pos) {
        print_ratings_list("", ambig_blob_choices.get(pos), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    // Only a negative rating means a fake (ambiguous) choice was used.
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found && stopper_debug_level >= 1) {
      tprintf ("Stopper: Possible ambiguous word = %s\n",
               alt_word->debug_string(getUnicharset()).string());
    }
    delete alt_word;
  }
  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
}
|
|
|
|
// Intentionally empty: this routine currently performs no work.
void Dict::EndDangerousAmbigs() {}
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::SettupStopperPass1() {
/*
 **	Parameters: none
 **	Variables Used:
 **		reject_offset_	offset allowed before word is rejected
 **	Operation: Prepare the stopper for the first pass by clearing the
 **		rejection offset.
 **	Return: none
 **	Exceptions: none
 **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
 */
  reject_offset_ = 0.0;           /* no extra allowance on pass 1 */
}                                /* SettupStopperPass1 */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::SettupStopperPass2() {
/*
 **	Parameters: none
 **	Variables Used:
 **		reject_offset_	offset allowed before word is rejected
 **		stopper_phase2_certainty_rejection_offset	value copied
 **	Operation: Prepare the stopper for the second pass by loading the
 **		rejection offset from its tunable variable.
 **	Return: none
 **	Exceptions: none
 **	History: Mon Jun  3 12:32:00 1991, DSJ, Created.
 */
  reject_offset_ = stopper_phase2_certainty_rejection_offset;
}                                /* SettupStopperPass2 */
|
|
} // namespace tesseract
|
|
|
|
|
|
/**----------------------------------------------------------------------------
|
|
Private Code
|
|
----------------------------------------------------------------------------**/
|
|
/*---------------------------------------------------------------------------*/
|
|
void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
  /*
   ** Parameters:
   **   Choice  choice to add a new chunk to
   **   Blob    index of blob being split
   ** Operation: Walks the characters of Choice, accumulating their
   **   chunk counts, and increments the chunk count of the first
   **   character whose accumulated chunk total exceeds Blob.
   **   If no character qualifies, a diagnostic is printed and an
   **   assertion is raised.
   ** Return: none
   ** Exceptions: none
   ** History: Mon May 20 11:43:27 1991, DSJ, Created.
   */
  int ChunksSoFar = 0;

  for (int CharIndex = 0; CharIndex < Choice->Length; ++CharIndex) {
    ChunksSoFar += Choice->Blob[CharIndex].NumChunks;
    if (ChunksSoFar > Blob) {
      /* Blob falls inside this character: it gains one chunk. */
      ++(Choice->Blob[CharIndex].NumChunks);
      return;
    }
  }

  /* Blob lies beyond every chunk in Choice. */
  mem_tidy(1);
  cprintf("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
          Choice->Length, ChunksSoFar, Blob);
  assert(FALSE);  /* this should never get executed */
}  /* AddNewChunk */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
namespace tesseract {
|
|
// Replaces the corresponding wrong ngram in werd_choice with the correct one.
|
|
// We indicate that this newly inserted ngram unichar is composed from several
|
|
// fragments and modify the corresponding entries in blob_choices to contain
|
|
// fragments of the correct ngram unichar instead of the original unichars.
|
|
// Ratings and certainties of entries in blob_choices and werd_choice are
|
|
// unichaged. E.g. for werd_choice mystring'' and ambiguity ''->":
|
|
// werd_choice becomes mystring", first ' in blob_choices becomes |"|0|2,
|
|
// second one is set to |"|1|2.
|
|
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
|
UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
|
|
BLOB_CHOICE_LIST_VECTOR *blob_choices,
|
|
bool *modified_blobs) {
|
|
int num_blobs_to_replace = 0;
|
|
int begin_blob_index = 0;
|
|
int i;
|
|
for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
|
|
if (i >= wrong_ngram_begin_index) {
|
|
num_blobs_to_replace += werd_choice->fragment_length(i);
|
|
} else {
|
|
begin_blob_index += werd_choice->fragment_length(i);
|
|
}
|
|
}
|
|
BLOB_CHOICE_IT bit;
|
|
int temp_blob_index = begin_blob_index;
|
|
const char *temp_uch = NULL;
|
|
const char *correct_ngram_str =
|
|
getUnicharset().id_to_unichar(correct_ngram_id);
|
|
for (int replaced_count = 0; replaced_count < wrong_ngram_size;
|
|
++replaced_count) {
|
|
if (blob_choices != NULL) {
|
|
UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
|
|
int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
|
|
if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
|
|
for (i = 0; i < fraglen; ++i) {
|
|
if (fraglen > 1) {
|
|
STRING frag_str =
|
|
CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
|
|
getUnicharset().unichar_insert(frag_str.string());
|
|
uch_id = getUnicharset().unichar_to_id(frag_str.string());
|
|
}
|
|
bit.set_to_list(blob_choices->get(temp_blob_index));
|
|
STRING correct_frag_uch =
|
|
CHAR_FRAGMENT::to_string(correct_ngram_str,
|
|
temp_blob_index - begin_blob_index,
|
|
num_blobs_to_replace);
|
|
getUnicharset().unichar_insert(correct_frag_uch.string());
|
|
UNICHAR_ID correct_frag_uch_id =
|
|
getUnicharset().unichar_to_id(correct_frag_uch.string());
|
|
// Find the WERD_CHOICE corresponding to the original unichar in
|
|
// the list of blob choices, add the derived character fragment
|
|
// before it with the same rating and certainty.
|
|
for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
|
|
if (bit.data()->unichar_id() == correct_frag_uch_id) {
|
|
break; // the unichar we want to insert is already there
|
|
}
|
|
if (bit.data()->unichar_id() == uch_id) {
|
|
bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
|
|
bit.data()->set_unichar_id(correct_frag_uch_id);
|
|
if (modified_blobs != NULL) *modified_blobs = true;
|
|
break;
|
|
}
|
|
}
|
|
temp_blob_index++;
|
|
}
|
|
}
|
|
// Remove current unichar from werd_choice. On the last iteration
|
|
// set the correct replacement unichar instead of removing a unichar.
|
|
if (replaced_count + 1 == wrong_ngram_size) {
|
|
werd_choice->set_unichar_id(correct_ngram_id,
|
|
num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
|
|
} else {
|
|
werd_choice->remove_unichar_id(wrong_ngram_begin_index);
|
|
}
|
|
}
|
|
if (stopper_debug_level >= 1) {
|
|
tprintf("ReplaceAmbigs() modified werd_choice: %s\n",
|
|
werd_choice->debug_string(getUnicharset()).string());
|
|
werd_choice->print();
|
|
if (modified_blobs != NULL && *modified_blobs && blob_choices != NULL) {
|
|
tprintf("Modified blob_choices: ");
|
|
for (int i = 0; i < blob_choices->size(); ++i) {
|
|
print_ratings_list("\n", blob_choices->get(i), getUnicharset());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
|
|
VIABLE_CHOICE ViableChoice) {
|
|
/*
|
|
** Parameters:
|
|
** Choice choice to compare to ViableChoice
|
|
** ViableChoice viable choice to compare to Choice
|
|
** Operation: This routine compares the corresponding strings of
|
|
** Choice and ViableChoice and returns TRUE if they are the
|
|
** same, FALSE otherwise.
|
|
** Return: TRUE or FALSE.
|
|
** Exceptions: none
|
|
** History: Fri May 17 08:48:04 1991, DSJ, Created.
|
|
*/
|
|
return (StringSameAs(WordChoice, ViableChoice));
|
|
|
|
} /* ChoiceSameAs */
|
|
} // namespace tesseract
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
|
|
void *arg2) { //VIABLE_CHOICE Choice2)
|
|
/*
|
|
** Parameters:
|
|
** Choice1, Choice2 choices to compare ratings for
|
|
** Operation: Return -1 if the rating for Choice1 is less than the
|
|
** rating for Choice2, otherwise return (1).
|
|
** Return: -1 or 1
|
|
** Exceptions: none
|
|
** History: Wed May 15 13:02:37 1991, DSJ, Created.
|
|
*/
|
|
float R1, R2;
|
|
VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
|
|
VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
|
|
|
|
R1 = Choice1->Rating;
|
|
R2 = Choice2->Rating;
|
|
|
|
if (R1 < R2)
|
|
return (-1);
|
|
else
|
|
return (1);
|
|
|
|
} /* CmpChoiceRatings */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
  /*
   ** Parameters:
   **   Choice          choice to be expanded
   **   ExpandedChoice  place to put resulting expanded choice
   ** Operation: Stores Choice in ExpandedChoice and fills in two
   **   per-chunk arrays: ChunkCertainty and ChunkClass. Every chunk
   **   of a character receives that character's certainty and class.
   ** Return: none (results are placed in ExpandedChoice)
   ** Exceptions: none
   ** History: Fri May 31 15:21:57 1991, DSJ, Created.
   */
  ExpandedChoice->Choice = Choice;

  int NextChunk = 0;
  for (int CharIndex = 0; CharIndex < Choice->Length; ++CharIndex) {
    const CHAR_CHOICE *Char = &(Choice->Blob[CharIndex]);
    for (int Copy = 0; Copy < Char->NumChunks; ++Copy) {
      ExpandedChoice->ChunkCertainty[NextChunk] = Char->Certainty;
      ExpandedChoice->ChunkClass[NextChunk] = Char->Class;
      ++NextChunk;
    }
  }
}  /* ExpandChoice */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
|
|
void *item2) { //EXPANDED_CHOICE *BestChoice)
|
|
/*
|
|
** Parameters:
|
|
** Choice choice to be tested
|
|
** BestChoice best choice found
|
|
** Variables Used:
|
|
** stopper_ambiguity_threshold_gain
|
|
** stopper_ambiguity_threshold_offset
|
|
** Operation: If the certainty of any chunk in Choice is not ambiguous
|
|
** with the corresponding chunk in the best choice, free
|
|
** Choice and return TRUE. Otherwise, return FALSE.
|
|
** Return: TRUE or FALSE.
|
|
** Exceptions: none
|
|
** History: Wed May 15 13:20:26 1991, DSJ, Created.
|
|
*/
|
|
int i, j, Chunk;
|
|
FLOAT32 Threshold;
|
|
VIABLE_CHOICE Choice;
|
|
EXPANDED_CHOICE *BestChoice;
|
|
|
|
Choice = (VIABLE_CHOICE) item1;
|
|
BestChoice = (EXPANDED_CHOICE *) item2;
|
|
|
|
Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
|
|
Choice->AdjustFactor);
|
|
|
|
for (i = 0, Chunk = 0; i < Choice->Length; i++)
|
|
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
|
|
if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
|
|
Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
|
|
Threshold) {
|
|
memfree(Choice);
|
|
return (TRUE);
|
|
}
|
|
|
|
return (FALSE);
|
|
|
|
} /* FreeBadChoice */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
namespace tesseract {
|
|
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
|
|
/*
|
|
** Parameters:
|
|
** Word word to be tested
|
|
** Operation: Return the length of the shortest alpha run in Word.
|
|
** Return: Return the length of the shortest alpha run in Word.
|
|
** Exceptions: none
|
|
** History: Tue May 14 07:50:45 1991, DSJ, Created.
|
|
*/
|
|
register int Shortest = MAX_INT32;
|
|
register int Length;
|
|
int x;
|
|
int y;
|
|
|
|
for (x = 0; x < WordChoice.length(); ++x) {
|
|
if (getUnicharset().get_isalpha(WordChoice.unichar_id(x))) {
|
|
for (y = x + 1, Length = 1;
|
|
y < WordChoice.length() &&
|
|
getUnicharset().get_isalpha(WordChoice.unichar_id(y));
|
|
++y, ++Length);
|
|
if (Length < Shortest) {
|
|
Shortest = Length;
|
|
}
|
|
if (y == WordChoice.length()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (Shortest == MAX_INT32)
|
|
Shortest = 0;
|
|
|
|
return (Shortest);
|
|
|
|
} /* LengthOfShortestAlphaRun */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
                                    FLOAT32 AdjustFactor,
                                    const float Certainties[]) {
  /*
   ** Parameters:
   **   WordChoice    choice to be converted to a viable choice
   **   AdjustFactor  factor used to adjust ratings for WordChoice
   **   Certainties   certainty for each character in WordChoice
   ** Operation: Allocates a viable choice with Emalloc(), sized for
   **   one VIABLE_CHOICE_STRUCT plus (length - 1) additional
   **   CHAR_CHOICE entries, fills it via FillViableChoice() and
   **   returns it. The word length must be in 1..MAX_NUM_CHUNKS.
   ** Return: Ptr to new viable choice.
   ** Exceptions: none
   ** History: Thu May 16 15:28:29 1991, DSJ, Created.
   */
  const int Length = WordChoice.length();
  assert (Length <= MAX_NUM_CHUNKS && Length > 0);

  /* VIABLE_CHOICE_STRUCT is counted once; each character beyond the
     first adds one CHAR_CHOICE to the allocation. */
  const size_t ExtraChars = Length - 1;
  const size_t NumBytes =
    sizeof (VIABLE_CHOICE_STRUCT) + ExtraChars * sizeof (CHAR_CHOICE);

  VIABLE_CHOICE Result = static_cast<VIABLE_CHOICE>(Emalloc (NumBytes));
  FillViableChoice(WordChoice, AdjustFactor, Certainties, false, Result);
  return Result;
}  /* NewViableChoice */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
|
|
/*
|
|
** Parameters:
|
|
** File open text file to print Choice to
|
|
** Label text label to be printed with Choice
|
|
** Choice choice to be printed
|
|
** Operation: This routine dumps a text representation of the
|
|
** specified Choice to File.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Mon May 20 11:16:44 1991, DSJ, Created.
|
|
*/
|
|
int i, j;
|
|
|
|
fprintf (File, "%s", Label);
|
|
|
|
fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
|
|
Choice->Rating, Choice->Certainty,
|
|
Choice->AdjustFactor, Choice->ComposedFromCharFragments);
|
|
|
|
for (i = 0; i < Choice->Length; i++)
|
|
fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
|
|
fprintf(File, "\n");
|
|
|
|
for (i = 0; i < Choice->Length; i++) {
|
|
fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
|
|
for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
|
|
fprintf(File, " ");
|
|
}
|
|
fprintf(File, "\n");
|
|
|
|
for (i = 0; i < Choice->Length; i++) {
|
|
for (j = 0; j < Choice->Blob[i].NumChunks; j++)
|
|
fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
|
|
}
|
|
fprintf(File, "\n");
|
|
|
|
for (i = 0; i < Choice->Length; i++) {
|
|
for (j = 0; j < Choice->Blob[i].NumChunks; j++)
|
|
fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
|
|
}
|
|
fprintf(File, "\n");
|
|
} /* PrintViableChoice */
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
|
|
FLOAT32 AdjustFactor, const float Certainties[],
|
|
bool SameString, VIABLE_CHOICE ViableChoice) {
|
|
/*
|
|
** Parameters:
|
|
** WordChoice a choice with info that will be copied
|
|
** AdjustFactor factor used to adjust ratings for AChoice
|
|
** Certainties certainty for each character in AChoice
|
|
** SameString if true the string in the viable choice
|
|
** will not be changed
|
|
** ViableChoice existing viable choice to fill in
|
|
** Variables Used:
|
|
** current_segmentation_ segmentation for NewChoice
|
|
** Operation:
|
|
** Fill ViableChoice with information from AChoice,
|
|
** AdjustFactor, and Certainties.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri May 17 13:35:58 1991, DSJ, Created.
|
|
*/
|
|
CHAR_CHOICE *NewChar;
|
|
BLOB_WIDTH *BlobWidth;
|
|
int x;
|
|
|
|
ViableChoice->Rating = WordChoice.rating();
|
|
ViableChoice->Certainty = WordChoice.certainty();
|
|
ViableChoice->AdjustFactor = AdjustFactor;
|
|
ViableChoice->ComposedFromCharFragments = false;
|
|
if (!SameString) {
|
|
ViableChoice->Length = WordChoice.length();
|
|
}
|
|
for (x = 0,
|
|
NewChar = &(ViableChoice->Blob[0]),
|
|
BlobWidth = current_segmentation_;
|
|
x < WordChoice.length();
|
|
x++, NewChar++, Certainties++, BlobWidth++) {
|
|
if (!SameString) {
|
|
NewChar->Class = WordChoice.unichar_id(x);
|
|
}
|
|
NewChar->NumChunks = *BlobWidth;
|
|
NewChar->Certainty = *Certainties;
|
|
for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
|
|
BlobWidth++;
|
|
assert(*BlobWidth > 0);
|
|
NewChar->NumChunks += *BlobWidth;
|
|
ViableChoice->ComposedFromCharFragments = true;
|
|
}
|
|
}
|
|
} /* FillViableChoice */
|
|
|
|
|
|
// Compares unichar ids in word_choice to those in viable_choice,
|
|
// returns true if they are the same, false otherwise.
|
|
bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
|
|
VIABLE_CHOICE ViableChoice) {
|
|
if (WordChoice.length() != ViableChoice->Length) {
|
|
return false;
|
|
}
|
|
int i;
|
|
CHAR_CHOICE *CharChoice;
|
|
for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
|
|
i < ViableChoice->Length; CharChoice++, i++) {
|
|
if (CharChoice->Class != WordChoice.unichar_id(i)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int Dict::StringSameAs(const char *String,
|
|
const char *String_lengths,
|
|
VIABLE_CHOICE ViableChoice) {
|
|
/*
|
|
** Parameters:
|
|
** String string to compare to ViableChoice
|
|
** String_lengths lengths of unichars in String
|
|
** ViableChoice viable choice to compare to String
|
|
** Operation: This routine compares String to ViableChoice and
|
|
** returns TRUE if they are the same, FALSE otherwise.
|
|
** Return: TRUE or FALSE.
|
|
** Exceptions: none
|
|
** History: Fri May 17 08:48:04 1991, DSJ, Created.
|
|
*/
|
|
CHAR_CHOICE *Char;
|
|
int i;
|
|
int current_unichar_length;
|
|
|
|
for (Char = &(ViableChoice->Blob[0]), i = 0;
|
|
i < ViableChoice->Length;
|
|
String += *(String_lengths++), Char++, i++) {
|
|
current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
|
|
if (current_unichar_length != *String_lengths ||
|
|
strncmp(String, getUnicharset().id_to_unichar(Char->Class),
|
|
current_unichar_length) != 0)
|
|
return (FALSE);
|
|
}
|
|
|
|
if (*String == 0)
|
|
return (TRUE);
|
|
else
|
|
return (FALSE);
|
|
|
|
} /* StringSameAs */
|
|
} // namespace tesseract
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
|
|
const WERD_CHOICE &BestChoice) {
|
|
/*
|
|
** Parameters:
|
|
** Choices choices for current segmentation
|
|
** BestChoice best choice for current segmentation
|
|
** Variables Used:
|
|
** stopper_allowable_character_badness
|
|
** max allowed certainty variation
|
|
** Operation: This routine returns TRUE if the certainty of the
|
|
** BestChoice word is within a reasonable range of the average
|
|
** certainties for the best choices for each character in
|
|
** the segmentation. This test is used to catch words in which
|
|
** one character is much worse than the other characters in
|
|
** the word (i.e. FALSE will be returned in that case).
|
|
** The algorithm computes the mean and std deviation of the
|
|
** certainties in the word with the worst certainty thrown out.
|
|
** Return: TRUE or FALSE.
|
|
** Exceptions: none
|
|
** History: Tue May 14 08:23:21 1991, DSJ, Created.
|
|
*/
|
|
float Certainty;
|
|
float WorstCertainty = MAX_FLOAT32;
|
|
float CertaintyThreshold;
|
|
FLOAT64 TotalCertainty;
|
|
FLOAT64 TotalCertaintySquared;
|
|
FLOAT64 Variance;
|
|
FLOAT32 Mean, StdDev;
|
|
int WordLength;
|
|
|
|
WordLength = Choices.length();
|
|
if (WordLength < 3)
|
|
return (TRUE);
|
|
|
|
TotalCertainty = TotalCertaintySquared = 0.0;
|
|
BLOB_CHOICE_IT BlobChoiceIt;
|
|
for (int i = 0; i < Choices.length(); ++i) {
|
|
BlobChoiceIt.set_to_list(Choices.get(i));
|
|
Certainty = BlobChoiceIt.data()->certainty();
|
|
TotalCertainty += Certainty;
|
|
TotalCertaintySquared += Certainty * Certainty;
|
|
if (Certainty < WorstCertainty)
|
|
WorstCertainty = Certainty;
|
|
}
|
|
|
|
/* subtract off worst certainty from statistics */
|
|
WordLength--;
|
|
TotalCertainty -= WorstCertainty;
|
|
TotalCertaintySquared -= WorstCertainty * WorstCertainty;
|
|
|
|
Mean = TotalCertainty / WordLength;
|
|
Variance = ((WordLength * TotalCertaintySquared -
|
|
TotalCertainty * TotalCertainty) /
|
|
(WordLength * (WordLength - 1)));
|
|
if (Variance < 0.0)
|
|
Variance = 0.0;
|
|
StdDev = sqrt (Variance);
|
|
|
|
CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
|
|
if (CertaintyThreshold > stopper_nondict_certainty_base)
|
|
CertaintyThreshold = stopper_nondict_certainty_base;
|
|
|
|
if (BestChoice.certainty() < CertaintyThreshold) {
|
|
if (stopper_debug_level >= 1)
|
|
cprintf("Stopper: Non-uniform certainty = %4.1f"
|
|
" (m=%4.1f, s=%4.1f, t=%4.1f)\n",
|
|
BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
|
|
return (FALSE);
|
|
} else {
|
|
return (TRUE);
|
|
}
|
|
} /* UniformCertainties */
|