mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 10:56:19 +08:00
163 lines
5.5 KiB
C
163 lines
5.5 KiB
C
|
/**********************************************************************
|
||
|
* File: char_set.h
|
||
|
* Description: Declaration of a Character Set Class
|
||
|
* Author: Ahmad Abdulkader
|
||
|
* Created: 2007
|
||
|
*
|
||
|
* (C) Copyright 2008, Google Inc.
|
||
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
** you may not use this file except in compliance with the License.
|
||
|
** You may obtain a copy of the License at
|
||
|
** http://www.apache.org/licenses/LICENSE-2.0
|
||
|
** Unless required by applicable law or agreed to in writing, software
|
||
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
** See the License for the specific language governing permissions and
|
||
|
** limitations under the License.
|
||
|
*
|
||
|
**********************************************************************/
|
||
|
|
||
|
// The CharSet class encapsulates the list of 32-bit strings/characters that
|
||
|
// Cube supports for a specific language. The char set is loaded from the
|
||
|
// .unicharset file corresponding to a specific language.
|
||
|
// Each string has a corresponding int class-id that gets used throughout Cube.
|
||
|
// The class provides conversion back and forth between the class-id
|
||
|
// and its corresponding 32-bit string. This is done using a hash table that
|
||
|
// maps the string to the class id.
|
||
|
|
||
|
#ifndef CHAR_SET_H
|
||
|
#define CHAR_SET_H
|
||
|
|
||
|
#include <string.h>
|
||
|
#include <string>
|
||
|
#include <algorithm>
|
||
|
|
||
|
#include "string_32.h"
|
||
|
#include "tessdatamanager.h"
|
||
|
#include "unicharset.h"
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
class CharSet {
|
||
|
public:
|
||
|
CharSet();
|
||
|
~CharSet();
|
||
|
|
||
|
// Returns true if Cube is sharing Tesseract's unicharset.
|
||
|
inline bool SharedUnicharset() { return (unicharset_map_ == NULL); }
|
||
|
|
||
|
// Returns the class id corresponding to a 32-bit string. Returns -1
|
||
|
// if the string is not supported. This is done by hashing the
|
||
|
// string and then looking up the string in the hash-bin if there
|
||
|
// are collisions.
|
||
|
inline int ClassID(const char_32 *str) const {
|
||
|
int hash_val = Hash(str);
|
||
|
if (hash_bin_size_[hash_val] == 0)
|
||
|
return -1;
|
||
|
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
|
||
|
if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0)
|
||
|
return hash_bins_[hash_val][bin];
|
||
|
}
|
||
|
return -1;
|
||
|
}
|
||
|
// Same as above but using a 32-bit char instead of a string
|
||
|
inline int ClassID(char_32 ch) const {
|
||
|
int hash_val = Hash(ch);
|
||
|
if (hash_bin_size_[hash_val] == 0)
|
||
|
return -1;
|
||
|
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
|
||
|
if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch &&
|
||
|
class_strings_[hash_bins_[hash_val][bin]]->length() == 1) {
|
||
|
return hash_bins_[hash_val][bin];
|
||
|
}
|
||
|
}
|
||
|
return -1;
|
||
|
}
|
||
|
// Retrieve the unicharid in Tesseract's unicharset corresponding
|
||
|
// to a 32-bit string. When Tesseract and Cube share the same
|
||
|
// unicharset, this will just be the class id.
|
||
|
inline int UnicharID(const char_32 *str) const {
|
||
|
int class_id = ClassID(str);
|
||
|
if (class_id == INVALID_UNICHAR_ID)
|
||
|
return INVALID_UNICHAR_ID;
|
||
|
int unichar_id;
|
||
|
if (unicharset_map_)
|
||
|
unichar_id = unicharset_map_[class_id];
|
||
|
else
|
||
|
unichar_id = class_id;
|
||
|
return unichar_id;
|
||
|
}
|
||
|
// Same as above but using a 32-bit char instead of a string
|
||
|
inline int UnicharID(char_32 ch) const {
|
||
|
int class_id = ClassID(ch);
|
||
|
if (class_id == INVALID_UNICHAR_ID)
|
||
|
return INVALID_UNICHAR_ID;
|
||
|
int unichar_id;
|
||
|
if (unicharset_map_)
|
||
|
unichar_id = unicharset_map_[class_id];
|
||
|
else
|
||
|
unichar_id = class_id;
|
||
|
return unichar_id;
|
||
|
}
|
||
|
// Returns the 32-bit string corresponding to a class id
|
||
|
inline const char_32 * ClassString(int class_id) const {
|
||
|
if (class_id < 0 || class_id >= class_cnt_) {
|
||
|
return NULL;
|
||
|
}
|
||
|
return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str());
|
||
|
}
|
||
|
// Returns the count of supported strings
|
||
|
inline int ClassCount() const { return class_cnt_; }
|
||
|
|
||
|
// Creates CharSet object by reading the unicharset from the
|
||
|
// TessDatamanager, and mapping Cube's unicharset to Tesseract's if
|
||
|
// they differ.
|
||
|
static CharSet *Create(TessdataManager *tessdata_manager,
|
||
|
UNICHARSET *tess_unicharset);
|
||
|
|
||
|
private:
|
||
|
// Hash table configuration params. Determined emperically on
|
||
|
// the supported languages so far (Eng, Ara, Hin). Might need to be
|
||
|
// tuned for speed when more languages are supported
|
||
|
static const int kHashBins = 3001;
|
||
|
static const int kMaxHashSize = 16;
|
||
|
|
||
|
// Using djb2 hashing function to hash a 32-bit string
|
||
|
// introduced in http://www.cse.yorku.ca/~oz/hash.html
|
||
|
static inline int Hash(const char_32 *str) {
|
||
|
unsigned long hash = 5381;
|
||
|
int c;
|
||
|
while ((c = *str++))
|
||
|
hash = ((hash << 5) + hash) + c;
|
||
|
return (hash%kHashBins);
|
||
|
}
|
||
|
// Same as above but for a single char
|
||
|
static inline int Hash(char_32 ch) {
|
||
|
char_32 b[2];
|
||
|
b[0] = ch;
|
||
|
b[1] = 0;
|
||
|
return Hash(b);
|
||
|
}
|
||
|
|
||
|
// Load the list of supported chars from the given data file
|
||
|
// pointer. If tess_unicharset is non-NULL, mapping each Cube class
|
||
|
// id to a tesseract unicharid.
|
||
|
bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset);
|
||
|
|
||
|
// class count
|
||
|
int class_cnt_;
|
||
|
// hash-bin sizes array
|
||
|
int hash_bin_size_[kHashBins];
|
||
|
// hash bins
|
||
|
int hash_bins_[kHashBins][kMaxHashSize];
|
||
|
// supported strings array
|
||
|
string_32 **class_strings_;
|
||
|
// map from class id to secondary (tesseract's) unicharset's ids
|
||
|
int *unicharset_map_;
|
||
|
// has the char set been initialized flag
|
||
|
bool init_;
|
||
|
};
|
||
|
}
|
||
|
|
||
|
#endif // CHAR_SET_H
|