tesseract/dict/permute.h
2012-02-02 02:56:18 +00:00

151 lines
6.6 KiB
C

/* -*-C-*-
********************************************************************************
*
* File: permute.h (Formerly permute.h)
* Description: Permute choices together
* Author: Mark Seaman, OCR Technology
* Created: Fri Sep 22 14:05:51 1989
* Modified: Mon May 20 16:32:04 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
********************************************************************************/
#ifndef PERMUTE_H
#define PERMUTE_H
/*----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------*/
#include "ratngs.h"
#include "params.h"
#include "unicharset.h"
#define MAX_PERM_LENGTH 128
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// Tunable parameters consulted by the permuters. Each *_VAR_H line names a
// parameter, its default value and its help text.
// NOTE(review): the *_VAR_H macros are presumably provided by params.h and
// the matching definitions presumably live in the implementation file --
// confirm before changing a default or a help string here.
// NOTE(review): several help strings below spell "multiplier" as
// "multipler"; any fix should be applied consistently with the definitions.

// Debugging switches.
extern INT_VAR_H(fragments_debug, 0, "Debug character fragments");
extern INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
extern BOOL_VAR_H(permute_debug, 0, "char permutation debug");
// Switches that enable individual permuters.
extern BOOL_VAR_H(permute_script_word, 0,
"Turn on word script consistency permuter");
extern BOOL_VAR_H(permute_fixed_length_dawg, 0,
"Turn on fixed-length phrasebook search permuter");
extern BOOL_VAR_H(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?");
// Reward factors: per the help text, a factor <= 1 where a smaller value
// means a bigger reward.
extern double_VAR_H(segment_reward_script, 0.95,
"Score multipler for script consistency within a word. "
"Being a 'reward' factor, it should be <= 1. "
"Smaller value implies bigger reward.");
extern BOOL_VAR_H(permute_chartype_word, 0,
"Turn on character type (property) consistency permuter");
extern double_VAR_H(segment_reward_chartype, 0.97,
"Score multipler for char type consistency within a word. ");
extern double_VAR_H(segment_reward_ngram_best_choice, 0.99,
"Score multipler for ngram permuter's best choice"
" (only used in the Han script path).");
// Upper bound on the permutation search effort.
// NOTE(review): the last two string fragments concatenate to
// "...overly large numberof options." -- a separating space is missing.
extern INT_VAR_H(max_permuter_attempts, 100000,
"Maximum number of different character choices to consider"
" during permutation. This limit is especially useful when"
" user patterns are specified, since overly generic patterns"
" can result in dawg search exploring an overly large number"
"of options.");
// Plain global int, declared directly rather than through a *_VAR_H macro.
// NOTE(review): its meaning is not visible in this header; presumably it
// restricts permutation to the top choice -- confirm at the definition.
extern int permute_only_top;
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
// Adjusts the rating of a word that is treated as a non-word.
// NOTE(review): only the prototype is visible here. Presumably `word` and
// `word_lengths` describe the word text and the per-character lengths within
// it, `rating` is the input rating, and `new_rating` / `adjust_factor` are
// output parameters written by the function -- confirm against the
// implementation.
void adjust_non_word(const char *word, const char *word_lengths,
float rating, float *new_rating, float *adjust_factor);
// Selects one character string given up to three candidate choices for a
// position and the neighbouring characters of the word.
// NOTE(review): the name suggests this disambiguates the look-alike
// characters 'i', 'l' and '1' -- confirm against the implementation. The
// ownership of the returned pointer is not visible from this header.
const char* choose_il1(const char *first_char, //first choice
const char *second_char, //second choice
const char *third_char, //third choice
const char *prev_char, //prev in word
const char *next_char, //next in word
const char *next_next_char); //presumably the char after next -- confirm
namespace tesseract {
// This is an awkward solution to allow "compounding" of permuter effects.
// Right now, each permuter generates a WERD_CHOICE with some modified
// rating which is compared to the current best choice, and the winner
// is saved. Therefore, independent permuter improvements, eg. from script
// consistency, dictionary check, and punctuation promoting, override each
// other and can not be combined.
// We need a trellis and some way to modify the path cost. Instead, we
// approximate by saving a permutation string, which records the preferred
// char choice [0-9] at each position [0..#chunks], and a cumulative reward
// factor. Non-conflicting changes can be accumulated and the combined
// result will be returned.
// Default_bias is the initial value for the base multiplier. In other words,
// it is the multiplier for raw choice rating if nothing is modified.
// This would be 1.0 when used with reward-based permuters in CJK-path,
// but it could be > 1 (eg. segment_penalty_garbage) to be compatible with
// penalty-based permuters in the Latin path.
// Note this class does not handle fragmented characters. It does so by
// setting the preferred position of fragmented characters to '1' at Init,
// which effectively skips the fragment choice. However, it can still be
// overridden if collision is allowed. It is the responsibility of the
// permuters to avoid permuting fragmented characters.
// Accumulates per-position character-choice preferences together with a
// cumulative rating multiplier, so that non-conflicting adjustments made by
// different permuters can be combined into a single permuted word.
class PermuterState {
 public:
  PermuterState();

  // Prepares the state for a word described by char_choices.
  // default_bias is the initial value of the base rating multiplier.
  // NOTE(review): the implementation is not visible here; presumably the
  // char_choices and unicharset pointers are retained (not copied) and
  // `debug` seeds the debug flag -- confirm.
  void Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
            const UNICHARSET &unicharset,
            float default_bias,
            bool debug);

  // Records preferred choices beginning at position start_pos.
  // NOTE(review): pos_str presumably carries one preference character
  // ('0'..'9' or '.') per position, and weight presumably feeds the
  // cumulative multiplier -- confirm against the implementation.
  void AddPreference(int start_pos, char* pos_str, float weight);

  // Records a preference for blob_choice at the single position char_pos.
  void AddPreference(int char_pos, BLOB_CHOICE* blob_choice, float weight);

  // Produces the word implied by the accumulated preferences.
  // NOTE(review): certainties and adjust_factor look like output
  // parameters, and the caller presumably owns the returned WERD_CHOICE --
  // confirm against the implementation.
  WERD_CHOICE* GetPermutedWord(float *certainties, float *adjust_factor);

  // Controls whether an already-recorded preference may be overwritten.
  void set_allow_collision(bool flag) { allow_collision_ = flag; }

  // Overrides the multiplying factor used for rating adjustment.
  void set_adjust_factor(float factor) { adjust_factor_ = factor; }

  // Enables or disables debug output.
  void set_debug(bool debug) { debug_ = debug; }

  // Returns true when the slot at pos no longer holds kPosFree, i.e. a
  // preference has been recorded there. Read-only, hence const-qualified so
  // it can be used through a const reference or pointer.
  // pos is not range-checked: callers must pass 0 <= pos < MAX_PERM_LENGTH.
  bool position_marked(int pos) const { return perm_state_[pos] != kPosFree; }

 private:
  // Marker stored in perm_state_ for a position with no preference yet.
  static const char kPosFree = '.';

  const UNICHARSET *unicharset_;
  // Reference pointer only; does not need to be allocated or freed.
  const BLOB_CHOICE_LIST_VECTOR *char_choices_;
  // Preferred char choice per position: '0'..'9', or '.' when free.
  // Handles up to MAX_PERM_LENGTH-1 states.
  char perm_state_[MAX_PERM_LENGTH];
  int word_length_;        // the number of char positions in the word
  bool allow_collision_;   // can a previously set preference be overwritten?
  float adjust_factor_;    // multiplying factor for rating adjustment
  bool debug_;             // whether debug statements should be printed
};
} // namespace tesseract
#endif