/********************************************************************** * File: char_samp_enum.h * Description: Declaration of a Character Set Class * Author: Ahmad Abdulkader * Created: 2007 * * (C) Copyright 2008, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ // The CharSet class encapsulates the list of 32-bit strings/characters that // Cube supports for a specific language. The char set is loaded from the // .unicharset file corresponding to a specific language // Each string has a corresponding int class-id that gets used throughout Cube // The class provides pass back and forth conversion between the class-id // and its corresponding 32-bit string. This is done using a hash table that // maps the string to the class id. #ifndef CHAR_SET_H #define CHAR_SET_H #include #include #include #include "string_32.h" #include "tessdatamanager.h" #include "unicharset.h" #include "cube_const.h" namespace tesseract { class CharSet { public: CharSet(); ~CharSet(); // Returns true if Cube is sharing Tesseract's unicharset. inline bool SharedUnicharset() { return (unicharset_map_ == NULL); } // Returns the class id corresponding to a 32-bit string. Returns -1 // if the string is not supported. This is done by hashing the // string and then looking up the string in the hash-bin if there // are collisions. inline int ClassID(const char_32 *str) const { int hash_val = Hash(str); if (hash_bin_size_[hash_val] == 0) return -1; for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0) return hash_bins_[hash_val][bin]; } return -1; } // Same as above but using a 32-bit char instead of a string inline int ClassID(char_32 ch) const { int hash_val = Hash(ch); if (hash_bin_size_[hash_val] == 0) return -1; for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch && class_strings_[hash_bins_[hash_val][bin]]->length() == 1) { return hash_bins_[hash_val][bin]; } } return -1; } // Retrieve the unicharid in Tesseract's unicharset corresponding // to a 32-bit string. When Tesseract and Cube share the same // unicharset, this will just be the class id. inline int UnicharID(const char_32 *str) const { int class_id = ClassID(str); if (class_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID; int unichar_id; if (unicharset_map_) unichar_id = unicharset_map_[class_id]; else unichar_id = class_id; return unichar_id; } // Same as above but using a 32-bit char instead of a string inline int UnicharID(char_32 ch) const { int class_id = ClassID(ch); if (class_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID; int unichar_id; if (unicharset_map_) unichar_id = unicharset_map_[class_id]; else unichar_id = class_id; return unichar_id; } // Returns the 32-bit string corresponding to a class id inline const char_32 * ClassString(int class_id) const { if (class_id < 0 || class_id >= class_cnt_) { return NULL; } return reinterpret_cast(class_strings_[class_id]->c_str()); } // Returns the count of supported strings inline int ClassCount() const { return class_cnt_; } // Creates CharSet object by reading the unicharset from the // TessDatamanager, and mapping Cube's unicharset to Tesseract's if // they differ. static CharSet *Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset); // Return the UNICHARSET cube is using for recognition internally -- // ClassId() returns unichar_id's in this unicharset. UNICHARSET *InternalUnicharset() { return unicharset_; } private: // Hash table configuration params. Determined emperically on // the supported languages so far (Eng, Ara, Hin). Might need to be // tuned for speed when more languages are supported static const int kHashBins = 3001; static const int kMaxHashSize = 16; // Using djb2 hashing function to hash a 32-bit string // introduced in http://www.cse.yorku.ca/~oz/hash.html static inline int Hash(const char_32 *str) { unsigned long hash = 5381; int c; while ((c = *str++)) hash = ((hash << 5) + hash) + c; return (hash%kHashBins); } // Same as above but for a single char static inline int Hash(char_32 ch) { char_32 b[2]; b[0] = ch; b[1] = 0; return Hash(b); } // Load the list of supported chars from the given data file // pointer. If tess_unicharset is non-NULL, mapping each Cube class // id to a tesseract unicharid. bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset); // class count int class_cnt_; // hash-bin sizes array int hash_bin_size_[kHashBins]; // hash bins int hash_bins_[kHashBins][kMaxHashSize]; // supported strings array string_32 **class_strings_; // map from class id to secondary (tesseract's) unicharset's ids int *unicharset_map_; // A unicharset which is filled in with a Tesseract-style UNICHARSET for // cube's data if our unicharset is different from tesseract's. UNICHARSET cube_unicharset_; // This points to either the tess_unicharset we're passed or cube_unicharset_, // depending upon whether we just have one unicharset or one for each // tesseract and cube, respectively. UNICHARSET *unicharset_; // has the char set been initialized flag bool init_; }; } #endif // CHAR_SET_H