tesseract/cube/char_set.h

175 lines
6.1 KiB
C++

/**********************************************************************
* File: char_samp_enum.h
* Description: Declaration of a Character Set Class
* Author: Ahmad Abdulkader
* Created: 2007
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The CharSet class encapsulates the list of 32-bit strings/characters that
// Cube supports for a specific language. The char set is loaded from the
// .unicharset file corresponding to a specific language
// Each string has a corresponding int class-id that gets used throughout Cube
// The class provides pass back and forth conversion between the class-id
// and its corresponding 32-bit string. This is done using a hash table that
// maps the string to the class id.
#ifndef CHAR_SET_H
#define CHAR_SET_H
#include <string.h>
#include <string>
#include <algorithm>
#include "string_32.h"
#include "tessdatamanager.h"
#include "unicharset.h"
#include "cube_const.h"
namespace tesseract {
class CharSet {
public:
CharSet();
~CharSet();
// Returns true if Cube is sharing Tesseract's unicharset.
inline bool SharedUnicharset() { return (unicharset_map_ == NULL); }
// Returns the class id corresponding to a 32-bit string. Returns -1
// if the string is not supported. This is done by hashing the
// string and then looking up the string in the hash-bin if there
// are collisions.
inline int ClassID(const char_32 *str) const {
int hash_val = Hash(str);
if (hash_bin_size_[hash_val] == 0)
return -1;
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0)
return hash_bins_[hash_val][bin];
}
return -1;
}
// Same as above but using a 32-bit char instead of a string
inline int ClassID(char_32 ch) const {
int hash_val = Hash(ch);
if (hash_bin_size_[hash_val] == 0)
return -1;
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch &&
class_strings_[hash_bins_[hash_val][bin]]->length() == 1) {
return hash_bins_[hash_val][bin];
}
}
return -1;
}
// Retrieve the unicharid in Tesseract's unicharset corresponding
// to a 32-bit string. When Tesseract and Cube share the same
// unicharset, this will just be the class id.
inline int UnicharID(const char_32 *str) const {
int class_id = ClassID(str);
if (class_id == INVALID_UNICHAR_ID)
return INVALID_UNICHAR_ID;
int unichar_id;
if (unicharset_map_)
unichar_id = unicharset_map_[class_id];
else
unichar_id = class_id;
return unichar_id;
}
// Same as above but using a 32-bit char instead of a string
inline int UnicharID(char_32 ch) const {
int class_id = ClassID(ch);
if (class_id == INVALID_UNICHAR_ID)
return INVALID_UNICHAR_ID;
int unichar_id;
if (unicharset_map_)
unichar_id = unicharset_map_[class_id];
else
unichar_id = class_id;
return unichar_id;
}
// Returns the 32-bit string corresponding to a class id
inline const char_32 * ClassString(int class_id) const {
if (class_id < 0 || class_id >= class_cnt_) {
return NULL;
}
return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str());
}
// Returns the count of supported strings
inline int ClassCount() const { return class_cnt_; }
// Creates CharSet object by reading the unicharset from the
// TessDatamanager, and mapping Cube's unicharset to Tesseract's if
// they differ.
static CharSet *Create(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset);
// Return the UNICHARSET cube is using for recognition internally --
// ClassId() returns unichar_id's in this unicharset.
UNICHARSET *InternalUnicharset() { return unicharset_; }
private:
// Hash table configuration params. Determined emperically on
// the supported languages so far (Eng, Ara, Hin). Might need to be
// tuned for speed when more languages are supported
static const int kHashBins = 3001;
static const int kMaxHashSize = 16;
// Using djb2 hashing function to hash a 32-bit string
// introduced in http://www.cse.yorku.ca/~oz/hash.html
static inline int Hash(const char_32 *str) {
unsigned long hash = 5381;
int c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c;
return (hash%kHashBins);
}
// Same as above but for a single char
static inline int Hash(char_32 ch) {
char_32 b[2];
b[0] = ch;
b[1] = 0;
return Hash(b);
}
// Load the list of supported chars from the given data file
// pointer. If tess_unicharset is non-NULL, mapping each Cube class
// id to a tesseract unicharid.
bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset);
// class count
int class_cnt_;
// hash-bin sizes array
int hash_bin_size_[kHashBins];
// hash bins
int hash_bins_[kHashBins][kMaxHashSize];
// supported strings array
string_32 **class_strings_;
// map from class id to secondary (tesseract's) unicharset's ids
int *unicharset_map_;
// A unicharset which is filled in with a Tesseract-style UNICHARSET for
// cube's data if our unicharset is different from tesseract's.
UNICHARSET cube_unicharset_;
// This points to either the tess_unicharset we're passed or cube_unicharset_,
// depending upon whether we just have one unicharset or one for each
// tesseract and cube, respectively.
UNICHARSET *unicharset_;
// has the char set been initialized flag
bool init_;
};
}
#endif // CHAR_SET_H