mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-08 02:47:49 +08:00
187 lines
7.3 KiB
C
187 lines
7.3 KiB
C
|
///////////////////////////////////////////////////////////////////////
|
||
|
// File: ambigs.h
|
||
|
// Description: Constants, flags, functions for dealing with
|
||
|
// ambiguities (training and recognition).
|
||
|
// Author: Daria Antonova
|
||
|
// Created: Mon Aug 23 11:26:43 PDT 2008
|
||
|
//
|
||
|
// (C) Copyright 2008, Google Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
#ifndef TESSERACT_CCUTIL_AMBIGS_H_
|
||
|
#define TESSERACT_CCUTIL_AMBIGS_H_
|
||
|
|
||
|
#include "elst.h"
|
||
|
#include "tprintf.h"
|
||
|
#include "unichar.h"
|
||
|
#include "unicharset.h"
|
||
|
#include "genericvector.h"
|
||
|
|
||
|
// Size bound for ambiguity ngrams. AmbigSpec declares its wrong_ngram and
// correct_fragments arrays with MAX_AMBIG_SIZE + 1 elements (presumably the
// extra slot holds the INVALID_UNICHAR_ID terminator -- TODO confirm).
#define MAX_AMBIG_SIZE 10
|
||
|
|
||
|
extern INT_VAR_H(global_ambigs_debug_level, 0,
|
||
|
"Debug level for unichar ambiguities");
|
||
|
extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
|
||
|
"Use definite ambiguities when running character classifier");
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
// Buffer size constant for unigram ambiguities. NOTE(review): the consumers
// are outside this header -- confirm the exact usage there.
static const int kUnigramAmbigsBufferSize = 1000;

// A single space followed by the terminating NUL; presumably the separator
// written between the unichars of an ngram.
static const char kAmbigNgramSeparator[] = " ";

// Tab and space; presumably the delimiter set used when tokenizing a line of
// the ambigs file.
static const char kAmbigDelimiters[] = "\t ";

// printf-style format strings for diagnostics: kIllegalMsg expects an int
// (%d, a line number per its text) and kIllegalUnicharMsg expects a string
// (%s, the offending unichar per its text).
static const char kIllegalMsg[] =
    "Illegal ambiguity specification on line %d\n";
static const char kIllegalUnicharMsg[] =
    "Illegal unichar %s in ambiguity specification\n";
|
||
|
|
||
|
// Kinds of ambiguity that can be recorded for an (ocred ngram, correct ngram)
// pair. Enumerators are numbered implicitly starting at 0.
enum AmbigType {
  // The ngram pair is not ambiguous.
  NOT_AMBIG,
  // The ocred ngram should always be substituted with the correct one.
  REPLACE_AMBIG,
  // Add the correct ngram to the classifier results (1-1).
  DEFINITE_AMBIG,
  // Use the pairwise classifier for the ocred/correct pair (1-1).
  SIMILAR_AMBIG,
  // This is a case ambiguity (1-1).
  CASE_AMBIG,

  // Number of enum entries; must stay last so that it keeps counting them.
  AMBIG_TYPE_COUNT
};
|
||
|
|
||
|
// A collection of utility functions for arrays of UNICHAR_IDs that are
|
||
|
// terminated by INVALID_UNICHAR_ID.
|
||
|
class UnicharIdArrayUtils {
|
||
|
public:
|
||
|
// Compares two arrays of unichar ids. Returns -1 if the length of array1 is
|
||
|
// less than length of array2, if any array1[i] is less than array2[i].
|
||
|
// Returns 0 if the arrays are equal, 1 otherwise.
|
||
|
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
|
||
|
static inline int compare(const UNICHAR_ID array1[],
|
||
|
const UNICHAR_ID array2[]) {
|
||
|
const UNICHAR_ID *ptr1 = array1;
|
||
|
const UNICHAR_ID *ptr2 = array2;
|
||
|
while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
|
||
|
if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
|
||
|
++ptr1;
|
||
|
++ptr2;
|
||
|
}
|
||
|
if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
|
||
|
return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
|
||
|
}
|
||
|
|
||
|
// Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
|
||
|
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
|
||
|
// and that dst has enough space for all the elements from src.
|
||
|
static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
|
||
|
int i = 0;
|
||
|
do {
|
||
|
dst[i] = src[i];
|
||
|
} while (dst[i++] != INVALID_UNICHAR_ID);
|
||
|
return i - 1;
|
||
|
}
|
||
|
|
||
|
// Prints unichars corresponding to the unichar_ids in the given array.
|
||
|
// The function assumes that array is terminated by INVALID_UNICHAR_ID.
|
||
|
static inline void print(const UNICHAR_ID array[],
|
||
|
const UNICHARSET &unicharset) {
|
||
|
const UNICHAR_ID *ptr = array;
|
||
|
if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
|
||
|
while (*ptr != INVALID_UNICHAR_ID) {
|
||
|
tprintf("%s ", unicharset.id_to_unichar(*ptr++));
|
||
|
}
|
||
|
tprintf("( ");
|
||
|
ptr = array;
|
||
|
while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
|
||
|
tprintf(")\n");
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
|
||
|
// start with the same unichar (e.g. r->t rn->m rr1->m).
|
||
|
class AmbigSpec : public ELIST_LINK {
|
||
|
public:
|
||
|
AmbigSpec();
|
||
|
~AmbigSpec() {}
|
||
|
|
||
|
// Comparator function for sorting AmbigSpec_LISTs. The lists will
|
||
|
// be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
|
||
|
// in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
|
||
|
static int compare_ambig_specs(const void *spec1, const void *spec2) {
|
||
|
const AmbigSpec *s1 =
|
||
|
*reinterpret_cast<const AmbigSpec * const *>(spec1);
|
||
|
const AmbigSpec *s2 =
|
||
|
*reinterpret_cast<const AmbigSpec * const *>(spec2);
|
||
|
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||
|
}
|
||
|
|
||
|
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
||
|
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
|
||
|
UNICHAR_ID correct_ngram_id;
|
||
|
AmbigType type;
|
||
|
int wrong_ngram_size;
|
||
|
};
|
||
|
// List boilerplate for AmbigSpec. ELISTIZEH is a macro defined outside this
// file; presumably it declares the AmbigSpec_LIST type that the rest of this
// header refers to -- confirm in elst.h.
ELISTIZEH(AmbigSpec);
|
||
|
|
||
|
// AMBIG_TABLE[i] stores a set of ambiguities whose
|
||
|
// wrong ngram starts with unichar id i.
|
||
|
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
|
||
|
typedef GenericVector<UNICHAR_ID> UnicharIdVector;
|
||
|
|
||
|
class UnicharAmbigs {
|
||
|
public:
|
||
|
UnicharAmbigs() {}
|
||
|
~UnicharAmbigs() {
|
||
|
replace_ambigs_.delete_data_pointers();
|
||
|
dang_ambigs_.delete_data_pointers();
|
||
|
one_to_one_definite_ambigs_.delete_data_pointers();
|
||
|
}
|
||
|
|
||
|
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
|
||
|
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
|
||
|
|
||
|
// Fills in two ambiguity tables (replaceable and dangerous) with information
|
||
|
// read from the ambigs file. An ambiguity table is an array of lists.
|
||
|
// The array is indexed by a class id. Each entry in the table provides
|
||
|
// a list of potential ambiguities which can start with the corresponding
|
||
|
// character. For example the ambiguity "rn -> m", would be located in the
|
||
|
// table at index of unicharset.unichar_to_id('r').
|
||
|
// In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
|
||
|
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
|
||
|
// of the wrong part of the ambiguity and each entry contains a vector of
|
||
|
// unichar ids that are ambiguous to it.
|
||
|
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
|
||
|
UNICHARSET *unicharset);
|
||
|
|
||
|
// Return definite 1-1 ambigs.
|
||
|
const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
|
||
|
if (one_to_one_definite_ambigs_.empty()) return NULL;
|
||
|
return one_to_one_definite_ambigs_[unichar_id];
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
|
||
|
bool ParseAmbiguityLine(int line_num, int version,
|
||
|
const UNICHARSET &unicharset, char *buffer,
|
||
|
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||
|
int *ReplacementAmbigPartSize,
|
||
|
char *ReplacementString, int *type);
|
||
|
void InsertIntoTable(UnicharAmbigsVector &table,
|
||
|
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||
|
int ReplacementAmbigPartSize,
|
||
|
const char *ReplacementString, int type,
|
||
|
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
|
||
|
UnicharAmbigsVector dang_ambigs_;
|
||
|
UnicharAmbigsVector replace_ambigs_;
|
||
|
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
|
||
|
};
|
||
|
|
||
|
} // namespace tesseract
|
||
|
|
||
|
#endif // TESSERACT_CCUTIL_AMBIGS_H_
|