mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-22 01:30:49 +08:00
Inherit STRING from std::string.
This commit is contained in:
parent
b4495a6bb5
commit
c3e04abe1e
@ -25,47 +25,40 @@
|
||||
#include <cstdint> // for uint32_t
|
||||
#include <cstdio> // for FILE
|
||||
#include <cstring> // for strncpy
|
||||
#include <string>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class TFile;
|
||||
|
||||
// STRING_IS_PROTECTED means that string[index] = X is invalid
|
||||
// because you have to go through the string's interface to modify it.
|
||||
// This allows the string to ensure internal integrity and maintain
|
||||
// its own string length. Unfortunately this is not possible because
|
||||
// STRINGS are used as direct-manipulation data buffers for things
|
||||
// like length arrays and many places cast away the const on c_str()
|
||||
// to mutate the string. Turning this off means that internally we
|
||||
// cannot assume we know the strlen.
|
||||
#define STRING_IS_PROTECTED 0
|
||||
|
||||
template <typename T>
|
||||
class GenericVector;
|
||||
|
||||
class TESS_API STRING {
|
||||
class STRING : public std::string {
|
||||
public:
|
||||
STRING();
|
||||
STRING(const STRING& string);
|
||||
STRING(const char* string);
|
||||
STRING(const char* data, int length);
|
||||
~STRING();
|
||||
using std::string::string;
|
||||
STRING(const std::string &s) : std::string(s) {}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
TESS_API
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
TESS_API
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
TESS_API
|
||||
bool Serialize(tesseract::TFile* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// Takes no swap flag, unlike the FILE* overload (any big/little-endian swap is presumably left to the TFile; confirm).
|
||||
TESS_API
|
||||
bool DeSerialize(tesseract::TFile* fp);
|
||||
// As DeSerialize, but only seeks past the data - hence a static method.
|
||||
TESS_API
|
||||
static bool SkipDeSerialize(tesseract::TFile* fp);
|
||||
|
||||
TESS_API
|
||||
bool contains(char c) const;
|
||||
int32_t length() const;
|
||||
int32_t size() const {
|
||||
return length();
|
||||
}
|
||||
@ -75,114 +68,26 @@ class TESS_API STRING {
|
||||
assert(0 <= len);
|
||||
return static_cast<uint32_t>(len);
|
||||
}
|
||||
const char* c_str() const;
|
||||
|
||||
inline char* strdup() const {
|
||||
int32_t len = length() + 1;
|
||||
return strncpy(new char[len], GetCStr(), len);
|
||||
return strncpy(new char[len], c_str(), len);
|
||||
}
|
||||
|
||||
#if STRING_IS_PROTECTED
|
||||
const char& operator[](int32_t index) const;
|
||||
// len is number of chars in s to insert starting at index in this string
|
||||
void insert_range(int32_t index, const char* s, int len);
|
||||
void erase_range(int32_t index, int len);
|
||||
#else
|
||||
char& operator[](int32_t index) const;
|
||||
#endif
|
||||
TESS_API
|
||||
void split(char c, GenericVector<STRING>* splited);
|
||||
TESS_API
|
||||
void truncate_at(int32_t index);
|
||||
|
||||
bool operator==(const STRING& string) const;
|
||||
bool operator!=(const STRING& string) const;
|
||||
bool operator!=(const char* string) const;
|
||||
|
||||
STRING& operator=(const char* string);
|
||||
STRING& operator=(const STRING& string);
|
||||
|
||||
STRING operator+(const STRING& string) const;
|
||||
STRING operator+(char ch) const;
|
||||
|
||||
STRING& operator+=(const char* string);
|
||||
STRING& operator+=(const STRING& string);
|
||||
STRING& operator+=(char ch);
|
||||
|
||||
// Assignment for strings which are not null-terminated.
|
||||
void assign(const char* cstr, int len);
|
||||
|
||||
// Appends the given string and int (as a %d) to this.
|
||||
// += cannot be used for ints as there is a char += operator that would
|
||||
// be ambiguous, and ints usually need a string before or between them
|
||||
// anyway.
|
||||
TESS_API
|
||||
void add_str_int(const char* str, int number);
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
TESS_API
|
||||
void add_str_double(const char* str, double number);
|
||||
|
||||
// ensure capacity but keep pointer encapsulated
|
||||
inline void ensure(int32_t min_capacity) {
|
||||
ensure_cstr(min_capacity);
|
||||
}
|
||||
|
||||
private:
|
||||
typedef struct STRING_HEADER {
|
||||
// How much space was allocated in the string buffer for char data.
|
||||
int capacity_;
|
||||
|
||||
// used_ is how much of the capacity is currently being used,
|
||||
// including a '\0' terminator.
|
||||
//
|
||||
// If used_ is 0 then string is nullptr (not even the '\0')
|
||||
// else if used_ > 0 then it is strlen() + 1 (because it includes '\0')
|
||||
// else strlen is >= 0 (not nullptr) but needs to be computed.
|
||||
// this condition is set when encapsulation is violated because
|
||||
// an API returned a mutable string.
|
||||
//
|
||||
// capacity_ - used_ = excess capacity that the string can grow
|
||||
// without reallocating
|
||||
mutable int used_;
|
||||
} STRING_HEADER;
|
||||
|
||||
// To preserve the behavior of the old serialization, we only have space
|
||||
// for one pointer in this structure. So we are embedding a data structure
|
||||
// at the start of the storage that will hold additional state variables,
|
||||
// then storing the actual string contents immediately after.
|
||||
STRING_HEADER* data_;
|
||||
|
||||
// returns the header part of the storage
|
||||
inline STRING_HEADER* GetHeader() {
|
||||
return data_;
|
||||
}
|
||||
inline const STRING_HEADER* GetHeader() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
// returns the string data part of storage
|
||||
inline char* GetCStr() {
|
||||
return (reinterpret_cast<char*>(data_)) + sizeof(STRING_HEADER);
|
||||
}
|
||||
|
||||
inline const char* GetCStr() const {
|
||||
return (reinterpret_cast<const char*>(data_)) + sizeof(STRING_HEADER);
|
||||
}
|
||||
inline bool InvariantOk() const {
|
||||
#if STRING_IS_PROTECTED
|
||||
return (GetHeader()->used_ == 0)
|
||||
? (c_str() == nullptr)
|
||||
: (GetHeader()->used_ == (strlen(c_str()) + 1));
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Ensure string has requested capacity as optimization
|
||||
// to avoid unnecessary reallocations.
|
||||
// The return value is a cstr buffer with at least requested capacity
|
||||
char* ensure_cstr(int32_t min_capacity);
|
||||
|
||||
void FixHeader() const; // make used_ non-negative, even if const
|
||||
|
||||
char* AllocData(int used, int capacity);
|
||||
void DiscardData();
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -529,6 +529,13 @@ class WERD_CHOICE : public ELIST_LINK {
|
||||
return unichars_in_script_order_;
|
||||
}
|
||||
|
||||
// Returns a UTF-8 string equivalent to the current choice
|
||||
// of UNICHAR IDs.
|
||||
STRING &unichar_string() {
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
return unichar_string_;
|
||||
}
|
||||
|
||||
// Returns a UTF-8 string equivalent to the current choice
|
||||
// of UNICHAR IDs.
|
||||
const STRING &unichar_string() const {
|
||||
|
@ -53,7 +53,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
|
||||
/* Use tessdata prefix from the environment. */
|
||||
datadir = tessdata_prefix;
|
||||
#if defined(_WIN32)
|
||||
} else if (datadir == nullptr || _access(datadir.c_str(), 0) != 0) {
|
||||
} else if (datadir.empty() || _access(datadir.c_str(), 0) != 0) {
|
||||
/* Look for tessdata in directory of executable. */
|
||||
char path[_MAX_PATH];
|
||||
DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
|
||||
|
@ -35,128 +35,21 @@ namespace tesseract {
|
||||
// possible length of an int (in 64 bits), being -<20 digits>.
|
||||
const int kMaxIntSize = 22;
|
||||
|
||||
/**********************************************************************
|
||||
* STRING_HEADER provides metadata about the allocated buffer,
|
||||
* including total capacity and how much used (strlen with '\0').
|
||||
*
|
||||
* The implementation hides this header at the start of the data
|
||||
* buffer and appends the string on the end to keep sizeof(STRING)
|
||||
* unchanged from earlier versions so serialization is not affected.
|
||||
*
|
||||
* The collection of MACROS provide different implementations depending
|
||||
* on whether the string keeps track of its strlen or not so that this
|
||||
* feature can be added in later when consumers don't modify the string
|
||||
**********************************************************************/
|
||||
|
||||
// Smallest string to allocate by default
|
||||
const int kMinCapacity = 16;
|
||||
|
||||
char* STRING::AllocData(int used, int capacity) {
|
||||
data_ = static_cast<STRING_HEADER *>(malloc(capacity + sizeof(STRING_HEADER)));
|
||||
|
||||
// header is the metadata for this memory block
|
||||
STRING_HEADER* header = GetHeader();
|
||||
header->capacity_ = capacity;
|
||||
header->used_ = used;
|
||||
return GetCStr();
|
||||
}
|
||||
|
||||
void STRING::DiscardData() {
|
||||
free(data_);
|
||||
data_ = nullptr;
|
||||
}
|
||||
|
||||
// This is a private method; ensure FixHeader is called (or used_ is well defined)
|
||||
// beforehand
|
||||
char* STRING::ensure_cstr(int32_t min_capacity) {
|
||||
STRING_HEADER* orig_header = GetHeader();
|
||||
if (min_capacity <= orig_header->capacity_)
|
||||
return (reinterpret_cast<char *>(this->data_)) + sizeof(STRING_HEADER);
|
||||
|
||||
// if we are going to grow bigger, then double our existing
|
||||
// size, but if that still is not big enough then keep the
|
||||
// requested capacity
|
||||
if (min_capacity < 2 * orig_header->capacity_)
|
||||
min_capacity = 2 * orig_header->capacity_;
|
||||
|
||||
int alloc = sizeof(STRING_HEADER) + min_capacity;
|
||||
auto* new_header = static_cast<STRING_HEADER*>(malloc(alloc));
|
||||
|
||||
memcpy(&new_header[1], GetCStr(), orig_header->used_);
|
||||
new_header->capacity_ = min_capacity;
|
||||
new_header->used_ = orig_header->used_;
|
||||
|
||||
// free old memory, then rebind to new memory
|
||||
DiscardData();
|
||||
data_ = new_header;
|
||||
|
||||
assert(InvariantOk());
|
||||
return (reinterpret_cast<char *>(data_)) + sizeof(STRING_HEADER);
|
||||
}
|
||||
|
||||
// This is const, but is modifying a mutable field
|
||||
// this way it can be used on const or non-const instances.
|
||||
void STRING::FixHeader() const {
|
||||
const STRING_HEADER* header = GetHeader();
|
||||
if (header->used_ < 0)
|
||||
header->used_ = strlen(GetCStr()) + 1;
|
||||
}
|
||||
|
||||
|
||||
STRING::STRING() {
|
||||
// Empty STRINGs contain just the "\0".
|
||||
memcpy(AllocData(1, kMinCapacity), "", 1);
|
||||
}
|
||||
|
||||
STRING::STRING(const STRING& str) {
|
||||
str.FixHeader();
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
const int str_used = str_header->used_;
|
||||
char *this_cstr = AllocData(str_used, str_used);
|
||||
memcpy(this_cstr, str.GetCStr(), str_used);
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
STRING::STRING(const char* cstr) {
|
||||
if (cstr == nullptr) {
|
||||
// Empty STRINGs contain just the "\0".
|
||||
memcpy(AllocData(1, kMinCapacity), "", 1);
|
||||
} else {
|
||||
const int len = strlen(cstr) + 1;
|
||||
char* this_cstr = AllocData(len, len);
|
||||
memcpy(this_cstr, cstr, len);
|
||||
}
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
STRING::STRING(const char *data, int length) {
|
||||
if (data == nullptr) {
|
||||
// Empty STRINGs contain just the "\0".
|
||||
memcpy(AllocData(1, kMinCapacity), "", 1);
|
||||
} else {
|
||||
char* this_cstr = AllocData(length + 1, length + 1);
|
||||
memcpy(this_cstr, data, length);
|
||||
this_cstr[length] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
STRING::~STRING() {
|
||||
DiscardData();
|
||||
}
|
||||
|
||||
// TODO(rays) Change all callers to use TFile and remove the old functions.
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool STRING::Serialize(FILE* fp) const {
|
||||
uint32_t len = length();
|
||||
return tesseract::Serialize(fp, &len) &&
|
||||
tesseract::Serialize(fp, GetCStr(), len);
|
||||
tesseract::Serialize(fp, c_str(), len);
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool STRING::Serialize(TFile* fp) const {
|
||||
uint32_t len = length();
|
||||
return fp->Serialize(&len) &&
|
||||
fp->Serialize(GetCStr(), len);
|
||||
fp->Serialize(c_str(), len);
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool STRING::DeSerialize(bool swap, FILE* fp) {
|
||||
@ -167,15 +60,16 @@ bool STRING::DeSerialize(bool swap, FILE* fp) {
|
||||
// Arbitrarily limit the number of characters to protect against bad data.
|
||||
if (len > UINT16_MAX) return false;
|
||||
truncate_at(len);
|
||||
return tesseract::DeSerialize(fp, GetCStr(), len);
|
||||
return tesseract::DeSerialize(fp, data(), len);
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// Takes no swap flag, unlike the FILE* overload (any big/little-endian swap is presumably left to the TFile; confirm).
|
||||
bool STRING::DeSerialize(TFile* fp) {
|
||||
uint32_t len;
|
||||
if (!fp->DeSerialize(&len)) return false;
|
||||
truncate_at(len);
|
||||
return fp->DeSerialize(GetCStr(), len);
|
||||
return fp->DeSerialize(data(), len);
|
||||
}
|
||||
|
||||
// As DeSerialize, but only seeks past the data - hence a static method.
|
||||
@ -186,98 +80,13 @@ bool STRING::SkipDeSerialize(TFile* fp) {
|
||||
}
|
||||
|
||||
bool STRING::contains(const char c) const {
|
||||
return (c != '\0') && (strchr (GetCStr(), c) != nullptr);
|
||||
return (c != '\0') && (strchr (c_str(), c) != nullptr);
|
||||
}
|
||||
|
||||
int32_t STRING::length() const {
|
||||
FixHeader();
|
||||
return GetHeader()->used_ - 1;
|
||||
}
|
||||
|
||||
const char* STRING::c_str() const {
|
||||
const STRING_HEADER* header = GetHeader();
|
||||
if (!header || header->used_ == 0)
|
||||
return nullptr;
|
||||
|
||||
// mark header length unreliable because tesseract might
|
||||
// cast away the const and mutate the string directly.
|
||||
header->used_ = -1;
|
||||
return GetCStr();
|
||||
}
|
||||
|
||||
/******
|
||||
* The STRING_IS_PROTECTED interface adds additional support to migrate
|
||||
* code that needs to modify the STRING in ways not otherwise supported
|
||||
* without violating encapsulation.
|
||||
*
|
||||
* Also makes the [] operator return a const so it is immutable
|
||||
*/
|
||||
#if STRING_IS_PROTECTED
|
||||
const char& STRING::operator[](int32_t index) const {
|
||||
return GetCStr()[index];
|
||||
}
|
||||
|
||||
void STRING::insert_range(int32_t index, const char* str, int len) {
|
||||
// if index is outside current range, then also grow size of string
|
||||
// to accommodate the requested range.
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
int used = this_header->used_;
|
||||
if (index > used)
|
||||
used = index;
|
||||
|
||||
char* this_cstr = ensure_cstr(used + len + 1);
|
||||
if (index < used) {
|
||||
// move existing string from index to '\0' inclusive.
|
||||
memmove(this_cstr + index + len,
|
||||
this_cstr + index,
|
||||
this_header->used_ - index);
|
||||
} else if (len > 0) {
|
||||
// We are going to overwrite previous null terminator, so write the new one.
|
||||
this_cstr[this_header->used_ + len - 1] = '\0';
|
||||
|
||||
// If the old header did not have the terminator,
|
||||
// then we need to account for it now that we've added it.
|
||||
// Otherwise it was already accounted for; we just moved it.
|
||||
if (this_header->used_ == 0)
|
||||
++this_header->used_;
|
||||
}
|
||||
|
||||
// Write new string to index.
|
||||
// The string is already terminated from the conditions above.
|
||||
memcpy(this_cstr + index, str, len);
|
||||
this_header->used_ += len;
|
||||
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
void STRING::erase_range(int32_t index, int len) {
|
||||
char* this_cstr = GetCStr();
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
||||
memcpy(this_cstr+index, this_cstr+index+len,
|
||||
this_header->used_ - index - len);
|
||||
this_header->used_ -= len;
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
#else
|
||||
void STRING::truncate_at(int32_t index) {
|
||||
ASSERT_HOST(index >= 0);
|
||||
FixHeader();
|
||||
char* this_cstr = ensure_cstr(index + 1);
|
||||
this_cstr[index] = '\0';
|
||||
GetHeader()->used_ = index + 1;
|
||||
assert(InvariantOk());
|
||||
resize(index);
|
||||
}
|
||||
|
||||
char& STRING::operator[](int32_t index) const {
|
||||
// Code is casting away this const and mutating the string,
|
||||
// so mark used_ as -1 to flag it unreliable.
|
||||
GetHeader()->used_ = -1;
|
||||
return (const_cast<char *>(GetCStr()))[index];
|
||||
}
|
||||
#endif
|
||||
|
||||
void STRING::split(const char c, GenericVector<STRING> *splited) {
|
||||
int start_index = 0;
|
||||
const int len = length();
|
||||
@ -285,7 +94,7 @@ void STRING::split(const char c, GenericVector<STRING> *splited) {
|
||||
if ((*this)[i] == c) {
|
||||
if (i != start_index) {
|
||||
(*this)[i] = '\0';
|
||||
splited->push_back(STRING(GetCStr() + start_index, i - start_index));
|
||||
splited->push_back(STRING(c_str() + start_index, i - start_index));
|
||||
(*this)[i] = c;
|
||||
}
|
||||
start_index = i + 1;
|
||||
@ -293,86 +102,10 @@ void STRING::split(const char c, GenericVector<STRING> *splited) {
|
||||
}
|
||||
|
||||
if (len != start_index) {
|
||||
splited->push_back(STRING(GetCStr() + start_index, len - start_index));
|
||||
splited->push_back(STRING(c_str() + start_index, len - start_index));
|
||||
}
|
||||
}
|
||||
|
||||
bool STRING::operator==(const STRING& str) const {
|
||||
FixHeader();
|
||||
str.FixHeader();
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
const STRING_HEADER* this_header = GetHeader();
|
||||
const int this_used = this_header->used_;
|
||||
const int str_used = str_header->used_;
|
||||
|
||||
return (this_used == str_used)
|
||||
&& (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
|
||||
}
|
||||
|
||||
bool STRING::operator!=(const STRING& str) const {
|
||||
FixHeader();
|
||||
str.FixHeader();
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
const STRING_HEADER* this_header = GetHeader();
|
||||
const int this_used = this_header->used_;
|
||||
const int str_used = str_header->used_;
|
||||
|
||||
return (this_used != str_used)
|
||||
|| (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
|
||||
}
|
||||
|
||||
bool STRING::operator!=(const char* cstr) const {
|
||||
FixHeader();
|
||||
const STRING_HEADER* this_header = GetHeader();
|
||||
|
||||
if (cstr == nullptr)
|
||||
return this_header->used_ > 1; // either '\0' or nullptr
|
||||
else {
|
||||
const int32_t length = strlen(cstr) + 1;
|
||||
return (this_header->used_ != length)
|
||||
|| (memcmp(GetCStr(), cstr, length) != 0);
|
||||
}
|
||||
}
|
||||
|
||||
STRING& STRING::operator=(const STRING& str) {
|
||||
str.FixHeader();
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
const int str_used = str_header->used_;
|
||||
|
||||
GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
|
||||
char* this_cstr = ensure_cstr(str_used);
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
||||
memcpy(this_cstr, str.GetCStr(), str_used);
|
||||
this_header->used_ = str_used;
|
||||
|
||||
assert(InvariantOk());
|
||||
return *this;
|
||||
}
|
||||
|
||||
STRING & STRING::operator+=(const STRING& str) {
|
||||
FixHeader();
|
||||
str.FixHeader();
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
const char* str_cstr = str.GetCStr();
|
||||
const int str_used = str_header->used_;
|
||||
const int this_used = GetHeader()->used_;
|
||||
char* this_cstr = ensure_cstr(this_used + str_used);
|
||||
|
||||
STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
|
||||
|
||||
if (this_used > 1) {
|
||||
memcpy(this_cstr + this_used - 1, str_cstr, str_used);
|
||||
this_header->used_ += str_used - 1; // overwrite '\0'
|
||||
} else {
|
||||
memcpy(this_cstr, str_cstr, str_used);
|
||||
this_header->used_ = str_used;
|
||||
}
|
||||
|
||||
assert(InvariantOk());
|
||||
return *this;
|
||||
}
|
||||
|
||||
void STRING::add_str_int(const char* str, int number) {
|
||||
if (str != nullptr)
|
||||
*this += str;
|
||||
@ -382,6 +115,7 @@ void STRING::add_str_int(const char* str, int number) {
|
||||
num_buffer[kMaxIntSize - 1] = '\0';
|
||||
*this += num_buffer;
|
||||
}
|
||||
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
void STRING::add_str_double(const char* str, double number) {
|
||||
if (str != nullptr)
|
||||
@ -395,112 +129,4 @@ void STRING::add_str_double(const char* str, double number) {
|
||||
*this += stream.str().c_str();
|
||||
}
|
||||
|
||||
STRING & STRING::operator=(const char* cstr) {
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
if (cstr) {
|
||||
const int len = strlen(cstr) + 1;
|
||||
|
||||
this_header->used_ = 0; // don't bother copying data if need to realloc
|
||||
char* this_cstr = ensure_cstr(len);
|
||||
this_header = GetHeader(); // for realloc
|
||||
memcpy(this_cstr, cstr, len);
|
||||
this_header->used_ = len;
|
||||
} else {
|
||||
// Reallocate to same state as default constructor.
|
||||
DiscardData();
|
||||
// Empty STRINGs contain just the "\0".
|
||||
memcpy(AllocData(1, kMinCapacity), "", 1);
|
||||
}
|
||||
|
||||
assert(InvariantOk());
|
||||
return *this;
|
||||
}
|
||||
|
||||
void STRING::assign(const char *cstr, int len) {
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
this_header->used_ = 0; // don't bother copying data if need to realloc
|
||||
char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
|
||||
|
||||
this_header = GetHeader(); // for realloc
|
||||
memcpy(this_cstr, cstr, len);
|
||||
this_cstr[len] = '\0';
|
||||
this_header->used_ = len + 1;
|
||||
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
STRING STRING::operator+(const STRING& str) const {
|
||||
STRING result(*this);
|
||||
result += str;
|
||||
|
||||
assert(InvariantOk());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
STRING STRING::operator+(const char ch) const {
|
||||
STRING result;
|
||||
FixHeader();
|
||||
const STRING_HEADER* this_header = GetHeader();
|
||||
const int this_used = this_header->used_;
|
||||
char* result_cstr = result.ensure_cstr(this_used + 1);
|
||||
STRING_HEADER* result_header = result.GetHeader();
|
||||
const int result_used = result_header->used_;
|
||||
|
||||
// copies '\0' but we'll overwrite that
|
||||
memcpy(result_cstr, GetCStr(), this_used);
|
||||
result_cstr[result_used] = ch; // overwrite old '\0'
|
||||
result_cstr[result_used + 1] = '\0'; // append on '\0'
|
||||
++result_header->used_;
|
||||
|
||||
assert(InvariantOk());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
STRING& STRING::operator+=(const char *str) {
|
||||
if (!str || !*str) // empty string has no effect
|
||||
return *this;
|
||||
|
||||
FixHeader();
|
||||
const int len = strlen(str) + 1;
|
||||
const int this_used = GetHeader()->used_;
|
||||
char* this_cstr = ensure_cstr(this_used + len);
|
||||
STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
|
||||
|
||||
// if we had non-empty string then append overwriting old '\0'
|
||||
// otherwise replace
|
||||
if (this_used > 0) {
|
||||
memcpy(this_cstr + this_used - 1, str, len);
|
||||
this_header->used_ += len - 1;
|
||||
} else {
|
||||
memcpy(this_cstr, str, len);
|
||||
this_header->used_ = len;
|
||||
}
|
||||
|
||||
assert(InvariantOk());
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
STRING& STRING::operator+=(const char ch) {
|
||||
if (ch == '\0')
|
||||
return *this;
|
||||
|
||||
FixHeader();
|
||||
int this_used = GetHeader()->used_;
|
||||
char* this_cstr = ensure_cstr(this_used + 1);
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
||||
if (this_used > 0)
|
||||
--this_used; // undo old empty null if there was one
|
||||
|
||||
this_cstr[this_used++] = ch; // append ch to end
|
||||
this_cstr[this_used++] = '\0'; // append '\0' after ch
|
||||
this_header->used_ = this_used;
|
||||
|
||||
assert(InvariantOk());
|
||||
return *this;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -151,7 +151,7 @@ void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
|
||||
// Fill prev_word_str_ with the last language_model_ngram_order
|
||||
// unichars from prev_word.
|
||||
if (language_model_ngram_on) {
|
||||
if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
|
||||
if (prev_word != nullptr && !prev_word->unichar_string().empty()) {
|
||||
prev_word_str_ = prev_word->unichar_string();
|
||||
if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
|
||||
} else {
|
||||
|
Loading…
Reference in New Issue
Block a user