2007-05-16 09:25:41 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: unicharset.h
|
|
|
|
// Description: Unicode character/ligature set class.
|
|
|
|
// Author: Thomas Kielbus
|
|
|
|
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
|
|
//
|
|
|
|
// (C) Copyright 2006, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|
|
|
|
#define THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
#include "strngs.h"
|
2007-05-16 09:25:41 +08:00
|
|
|
#include "unichar.h"
|
|
|
|
#include "unicharmap.h"
|
|
|
|
|
|
|
|
// The UNICHARSET class is a utility class for Tesseract that holds the
|
|
|
|
// set of characters that are used by the engine. Each character is identified
|
|
|
|
// by a unique number, from 0 to (size - 1).
|
|
|
|
class UNICHARSET {
|
|
|
|
public:
|
|
|
|
|
|
|
|
// Create an empty UNICHARSET
|
|
|
|
UNICHARSET();
|
|
|
|
|
|
|
|
~UNICHARSET();
|
|
|
|
|
|
|
|
// Return the UNICHAR_ID of a given unichar representation within the
|
|
|
|
// UNICHARSET.
|
|
|
|
const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
|
|
|
|
|
|
|
|
// Return the UNICHAR_ID of a given unichar representation within the
|
|
|
|
// UNICHARSET. Only the first length characters from unichar_repr are used.
|
|
|
|
const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
|
|
|
|
int length) const;
|
|
|
|
|
2008-02-01 08:21:49 +08:00
|
|
|
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
|
|
|
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
|
|
|
|
// is both a short and a long match to the string, return the length that
|
|
|
|
// ensures there is a legal match after it.
|
|
|
|
int step(const char* str) const;
|
|
|
|
|
2007-05-16 09:25:41 +08:00
|
|
|
// Return the unichar representation corresponding to the given UNICHAR_ID
|
|
|
|
// within the UNICHARSET.
|
|
|
|
const char* const id_to_unichar(UNICHAR_ID id) const;
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Return a STRING containing debug information on the unichar, including
|
|
|
|
// the id_to_unichar, its hex unicodes and the properties.
|
|
|
|
STRING debug_str(UNICHAR_ID id) const;
|
|
|
|
|
2007-05-16 09:25:41 +08:00
|
|
|
// Add a unichar representation to the set.
|
|
|
|
void unichar_insert(const char* const unichar_repr);
|
|
|
|
|
|
|
|
// Return true if the given unichar representation exists within the set.
|
|
|
|
bool contains_unichar(const char* const unichar_repr);
|
2008-02-01 08:21:49 +08:00
|
|
|
bool contains_unichar(const char* const unichar_repr, int length);
|
2007-05-16 09:25:41 +08:00
|
|
|
|
|
|
|
// Return true if the given unichar representation corresponds to the given
|
|
|
|
// UNICHAR_ID within the set.
|
|
|
|
bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);
|
|
|
|
|
|
|
|
// Clear the UNICHARSET (all the previous data is lost).
|
|
|
|
void clear() {
|
|
|
|
if (size_reserved > 0) {
|
2008-04-22 08:23:41 +08:00
|
|
|
for (int i = 0; i < script_table_size_used; ++i)
|
|
|
|
delete[] script_table[i];
|
|
|
|
delete[] script_table;
|
|
|
|
script_table = 0;
|
|
|
|
script_table_size_reserved = 0;
|
|
|
|
script_table_size_used = 0;
|
2007-05-16 09:25:41 +08:00
|
|
|
delete[] unichars;
|
|
|
|
unichars = 0;
|
|
|
|
size_reserved = 0;
|
|
|
|
size_used = 0;
|
|
|
|
}
|
|
|
|
ids.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the size of the set (the number of different UNICHAR it holds).
|
|
|
|
int size() const {
|
|
|
|
return size_used;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reserve enough memory space for the given number of UNICHARS
|
|
|
|
void reserve(int unichars_number);
|
|
|
|
|
|
|
|
// Save the content of the UNICHARSET to the given file. Return true if the
|
|
|
|
// operation is successful.
|
|
|
|
bool save_to_file(const char* const filename) const;
|
|
|
|
|
|
|
|
// Load the UNICHARSET from the given file. The previous data is lost. Return
|
|
|
|
// true if the operation is successful.
|
|
|
|
bool load_from_file(const char* const filename);
|
|
|
|
|
2008-02-01 08:21:49 +08:00
|
|
|
// Set a whitelist and/or blacklist of characters to recognize.
|
|
|
|
// An empty or NULL whitelist enables everything (minus any blacklist).
|
|
|
|
// An empty or NULL blacklist disables nothing.
|
|
|
|
// The blacklist overrides the whitelist.
|
|
|
|
// Each list is a string of utf8 character strings. Boundaries between
|
|
|
|
// unicharset units are worked out automatically, and characters not in
|
|
|
|
// the unicharset are silently ignored.
|
|
|
|
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
// Set the isalpha property of the given unichar to the given value.
|
|
|
|
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
|
|
|
|
unichars[unichar_id].properties.isalpha = value;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the islower property of the given unichar to the given value.
|
|
|
|
void set_islower(UNICHAR_ID unichar_id, bool value) {
|
|
|
|
unichars[unichar_id].properties.islower = value;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the isupper property of the given unichar to the given value.
|
|
|
|
void set_isupper(UNICHAR_ID unichar_id, bool value) {
|
|
|
|
unichars[unichar_id].properties.isupper = value;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the isdigit property of the given unichar to the given value.
|
|
|
|
void set_isdigit(UNICHAR_ID unichar_id, bool value) {
|
|
|
|
unichars[unichar_id].properties.isdigit = value;
|
|
|
|
}
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Set the script name of the given unichar to the given value.
|
|
|
|
// Value is copied and thus can be a temporary;
|
|
|
|
void set_script(UNICHAR_ID unichar_id, const char* value) {
|
|
|
|
unichars[unichar_id].properties.script = add_script(value);
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
// Return the isalpha property of the given unichar.
|
|
|
|
bool get_isalpha(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.isalpha;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the islower property of the given unichar.
|
|
|
|
bool get_islower(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.islower;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isupper property of the given unichar.
|
|
|
|
bool get_isupper(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.isupper;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isdigit property of the given unichar.
|
|
|
|
bool get_isdigit(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.isdigit;
|
|
|
|
}
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Return the script name of the given unichar.
|
|
|
|
// The returned pointer will always be the same for the same script, it's
|
|
|
|
// managed by unicharset and thus MUST NOT be deleted
|
|
|
|
const char* get_script(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.script;
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
// Return the isalpha property of the given unichar representation.
|
|
|
|
bool get_isalpha(const char* const unichar_repr) const {
|
|
|
|
return get_isalpha(unichar_to_id(unichar_repr));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the islower property of the given unichar representation.
|
|
|
|
bool get_islower(const char* const unichar_repr) const {
|
|
|
|
return get_islower(unichar_to_id(unichar_repr));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isupper property of the given unichar representation.
|
|
|
|
bool get_isupper(const char* const unichar_repr) const {
|
|
|
|
return get_isupper(unichar_to_id(unichar_repr));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isdigit property of the given unichar representation.
|
|
|
|
bool get_isdigit(const char* const unichar_repr) const {
|
|
|
|
return get_isdigit(unichar_to_id(unichar_repr));
|
|
|
|
}
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Return the script name of the given unichar representation.
|
|
|
|
// The returned pointer will always be the same for the same script, it's
|
|
|
|
// managed by unicharset and thus MUST NOT be deleted
|
|
|
|
const char* get_script(const char* const unichar_repr) const {
|
|
|
|
return get_script(unichar_to_id(unichar_repr));
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
// Return the isalpha property of the given unichar representation.
|
|
|
|
// Only the first length characters from unichar_repr are used.
|
|
|
|
bool get_isalpha(const char* const unichar_repr,
|
|
|
|
int length) const {
|
|
|
|
return get_isalpha(unichar_to_id(unichar_repr, length));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the islower property of the given unichar representation.
|
|
|
|
// Only the first length characters from unichar_repr are used.
|
|
|
|
bool get_islower(const char* const unichar_repr,
|
|
|
|
int length) const {
|
|
|
|
return get_islower(unichar_to_id(unichar_repr, length));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isupper property of the given unichar representation.
|
|
|
|
// Only the first length characters from unichar_repr are used.
|
|
|
|
bool get_isupper(const char* const unichar_repr,
|
|
|
|
int length) const {
|
|
|
|
return get_isupper(unichar_to_id(unichar_repr, length));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the isdigit property of the given unichar representation.
|
|
|
|
// Only the first length characters from unichar_repr are used.
|
|
|
|
bool get_isdigit(const char* const unichar_repr,
|
|
|
|
int length) const {
|
|
|
|
return get_isdigit(unichar_to_id(unichar_repr, length));
|
|
|
|
}
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Return the script name of the given unichar representation.
|
|
|
|
// Only the first length characters from unichar_repr are used.
|
|
|
|
// The returned pointer will always be the same for the same script, it's
|
|
|
|
// managed by unicharset and thus MUST NOT be deleted
|
|
|
|
const char* get_script(const char* const unichar_repr,
|
|
|
|
int length) const {
|
|
|
|
return get_script(unichar_to_id(unichar_repr, length));
|
|
|
|
}
|
|
|
|
|
2008-02-01 08:21:49 +08:00
|
|
|
// Return the enabled property of the given unichar.
|
|
|
|
bool get_enabled(UNICHAR_ID unichar_id) const {
|
|
|
|
return unichars[unichar_id].properties.enabled;
|
|
|
|
}
|
|
|
|
|
2007-05-16 09:25:41 +08:00
|
|
|
private:
|
|
|
|
|
2008-04-22 08:23:41 +08:00
|
|
|
// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
|
|
|
|
// then the returned pointer will be the same.
|
|
|
|
// The script parameter is copied and thus can be a temporary.
|
|
|
|
char* add_script(const char* script);
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
struct UNICHAR_PROPERTIES {
|
2008-04-22 08:23:41 +08:00
|
|
|
bool isalpha;
|
|
|
|
bool islower;
|
|
|
|
bool isupper;
|
|
|
|
bool isdigit;
|
|
|
|
bool enabled;
|
|
|
|
char* script;
|
2007-07-18 09:15:07 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct UNICHAR_SLOT {
|
|
|
|
char representation[UNICHAR_LEN + 1];
|
|
|
|
UNICHAR_PROPERTIES properties;
|
|
|
|
};
|
2007-05-16 09:25:41 +08:00
|
|
|
|
|
|
|
UNICHAR_SLOT* unichars;
|
|
|
|
UNICHARMAP ids;
|
|
|
|
int size_used;
|
|
|
|
int size_reserved;
|
2008-04-22 08:23:41 +08:00
|
|
|
char** script_table;
|
|
|
|
int script_table_size_used;
|
|
|
|
int script_table_size_reserved;
|
|
|
|
const char* null_script;
|
2007-05-16 09:25:41 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif // THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|