diff --git a/include/tesseract/strngs.h b/include/tesseract/strngs.h index 2e29cf92f..18aa1ef48 100644 --- a/include/tesseract/strngs.h +++ b/include/tesseract/strngs.h @@ -25,47 +25,40 @@ #include // for uint32_t #include // for FILE #include // for strncpy +#include namespace tesseract { class TFile; -// STRING_IS_PROTECTED means that string[index] = X is invalid -// because you have to go through strings interface to modify it. -// This allows the string to ensure internal integrity and maintain -// its own string length. Unfortunately this is not possible because -// STRINGS are used as direct-manipulation data buffers for things -// like length arrays and many places cast away the const on c_str() -// to mutate the string. Turning this off means that internally we -// cannot assume we know the strlen. -#define STRING_IS_PROTECTED 0 - template class GenericVector; -class TESS_API STRING { +class STRING : public std::string { public: - STRING(); - STRING(const STRING& string); - STRING(const char* string); - STRING(const char* data, int length); - ~STRING(); + using std::string::string; + STRING(const std::string &s) : std::string(s) {} // Writes to the given file. Returns false in case of error. + TESS_API bool Serialize(FILE* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. + TESS_API bool DeSerialize(bool swap, FILE* fp); // Writes to the given file. Returns false in case of error. + TESS_API bool Serialize(tesseract::TFile* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. + TESS_API bool DeSerialize(tesseract::TFile* fp); // As DeSerialize, but only seeks past the data - hence a static method. + TESS_API static bool SkipDeSerialize(tesseract::TFile* fp); + TESS_API bool contains(char c) const; - int32_t length() const; int32_t size() const { return length(); } @@ -75,114 +68,26 @@ class TESS_API STRING { assert(0 <= len); return static_cast(len); } - const char* c_str() const; inline char* strdup() const { int32_t len = length() + 1; - return strncpy(new char[len], GetCStr(), len); + return strncpy(new char[len], c_str(), len); } -#if STRING_IS_PROTECTED - const char& operator[](int32_t index) const; - // len is number of chars in s to insert starting at index in this string - void insert_range(int32_t index, const char* s, int len); - void erase_range(int32_t index, int len); -#else - char& operator[](int32_t index) const; -#endif + TESS_API void split(char c, GenericVector* splited); + TESS_API void truncate_at(int32_t index); - bool operator==(const STRING& string) const; - bool operator!=(const STRING& string) const; - bool operator!=(const char* string) const; - - STRING& operator=(const char* string); - STRING& operator=(const STRING& string); - - STRING operator+(const STRING& string) const; - STRING operator+(char ch) const; - - STRING& operator+=(const char* string); - STRING& operator+=(const STRING& string); - STRING& operator+=(char ch); - - // Assignment for strings which are not null-terminated. - void assign(const char* cstr, int len); - // Appends the given string and int (as a %d) to this. // += cannot be used for ints as there as a char += operator that would // be ambiguous, and ints usually need a string before or between them // anyway. + TESS_API void add_str_int(const char* str, int number); // Appends the given string and double (as a %.8g) to this. + TESS_API void add_str_double(const char* str, double number); - - // ensure capacity but keep pointer encapsulated - inline void ensure(int32_t min_capacity) { - ensure_cstr(min_capacity); - } - - private: - typedef struct STRING_HEADER { - // How much space was allocated in the string buffer for char data. - int capacity_; - - // used_ is how much of the capacity is currently being used, - // including a '\0' terminator. - // - // If used_ is 0 then string is nullptr (not even the '\0') - // else if used_ > 0 then it is strlen() + 1 (because it includes '\0') - // else strlen is >= 0 (not nullptr) but needs to be computed. - // this condition is set when encapsulation is violated because - // an API returned a mutable string. - // - // capacity_ - used_ = excess capacity that the string can grow - // without reallocating - mutable int used_; - } STRING_HEADER; - - // To preserve the behavior of the old serialization, we only have space - // for one pointer in this structure. So we are embedding a data structure - // at the start of the storage that will hold additional state variables, - // then storing the actual string contents immediately after. - STRING_HEADER* data_; - - // returns the header part of the storage - inline STRING_HEADER* GetHeader() { - return data_; - } - inline const STRING_HEADER* GetHeader() const { - return data_; - } - - // returns the string data part of storage - inline char* GetCStr() { - return (reinterpret_cast(data_)) + sizeof(STRING_HEADER); - } - - inline const char* GetCStr() const { - return (reinterpret_cast(data_)) + sizeof(STRING_HEADER); - } - inline bool InvariantOk() const { -#if STRING_IS_PROTECTED - return (GetHeader()->used_ == 0) - ? (c_str() == nullptr) - : (GetHeader()->used_ == (strlen(c_str()) + 1)); -#else - return true; -#endif - } - - // Ensure string has requested capacity as optimization - // to avoid unnecessary reallocations. - // The return value is a cstr buffer with at least requested capacity - char* ensure_cstr(int32_t min_capacity); - - void FixHeader() const; // make used_ non-negative, even if const - - char* AllocData(int used, int capacity); - void DiscardData(); }; } // namespace tesseract. diff --git a/src/ccstruct/ratngs.h b/src/ccstruct/ratngs.h index fc8a91b51..647176725 100644 --- a/src/ccstruct/ratngs.h +++ b/src/ccstruct/ratngs.h @@ -529,6 +529,13 @@ class WERD_CHOICE : public ELIST_LINK { return unichars_in_script_order_; } + // Returns a UTF-8 string equivalent to the current choice + // of UNICHAR IDs. + STRING &unichar_string() { + this->string_and_lengths(&unichar_string_, &unichar_lengths_); + return unichar_string_; + } + // Returns a UTF-8 string equivalent to the current choice // of UNICHAR IDs. const STRING &unichar_string() const { diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp index 4b8e69095..c0ec7bd34 100644 --- a/src/ccutil/mainblk.cpp +++ b/src/ccutil/mainblk.cpp @@ -53,7 +53,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { /* Use tessdata prefix from the environment. */ datadir = tessdata_prefix; #if defined(_WIN32) - } else if (datadir == nullptr || _access(datadir.c_str(), 0) != 0) { + } else if (datadir.empty() || _access(datadir.c_str(), 0) != 0) { /* Look for tessdata in directory of executable. */ char path[_MAX_PATH]; DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); diff --git a/src/ccutil/strngs.cpp b/src/ccutil/strngs.cpp index a18de6f7a..8f93fe891 100644 --- a/src/ccutil/strngs.cpp +++ b/src/ccutil/strngs.cpp @@ -35,128 +35,21 @@ namespace tesseract { // possible length of an int (in 64 bits), being -<20 digits>. const int kMaxIntSize = 22; -/********************************************************************** - * STRING_HEADER provides metadata about the allocated buffer, - * including total capacity and how much used (strlen with '\0'). - * - * The implementation hides this header at the start of the data - * buffer and appends the string on the end to keep sizeof(STRING) - * unchanged from earlier versions so serialization is not affected. - * - * The collection of MACROS provide different implementations depending - * on whether the string keeps track of its strlen or not so that this - * feature can be added in later when consumers don't modify the string - **********************************************************************/ - -// Smallest string to allocate by default -const int kMinCapacity = 16; - -char* STRING::AllocData(int used, int capacity) { - data_ = static_cast(malloc(capacity + sizeof(STRING_HEADER))); - - // header is the metadata for this memory block - STRING_HEADER* header = GetHeader(); - header->capacity_ = capacity; - header->used_ = used; - return GetCStr(); -} - -void STRING::DiscardData() { - free(data_); - data_ = nullptr; -} - -// This is a private method; ensure FixHeader is called (or used_ is well defined) -// beforehand -char* STRING::ensure_cstr(int32_t min_capacity) { - STRING_HEADER* orig_header = GetHeader(); - if (min_capacity <= orig_header->capacity_) - return (reinterpret_cast(this->data_)) + sizeof(STRING_HEADER); - - // if we are going to grow bigger, than double our existing - // size, but if that still is not big enough then keep the - // requested capacity - if (min_capacity < 2 * orig_header->capacity_) - min_capacity = 2 * orig_header->capacity_; - - int alloc = sizeof(STRING_HEADER) + min_capacity; - auto* new_header = static_cast(malloc(alloc)); - - memcpy(&new_header[1], GetCStr(), orig_header->used_); - new_header->capacity_ = min_capacity; - new_header->used_ = orig_header->used_; - - // free old memory, then rebind to new memory - DiscardData(); - data_ = new_header; - - assert(InvariantOk()); - return (reinterpret_cast(data_)) + sizeof(STRING_HEADER); -} - -// This is const, but is modifying a mutable field -// this way it can be used on const or non-const instances. -void STRING::FixHeader() const { - const STRING_HEADER* header = GetHeader(); - if (header->used_ < 0) - header->used_ = strlen(GetCStr()) + 1; -} - - -STRING::STRING() { - // Empty STRINGs contain just the "\0". - memcpy(AllocData(1, kMinCapacity), "", 1); -} - -STRING::STRING(const STRING& str) { - str.FixHeader(); - const STRING_HEADER* str_header = str.GetHeader(); - const int str_used = str_header->used_; - char *this_cstr = AllocData(str_used, str_used); - memcpy(this_cstr, str.GetCStr(), str_used); - assert(InvariantOk()); -} - -STRING::STRING(const char* cstr) { - if (cstr == nullptr) { - // Empty STRINGs contain just the "\0". - memcpy(AllocData(1, kMinCapacity), "", 1); - } else { - const int len = strlen(cstr) + 1; - char* this_cstr = AllocData(len, len); - memcpy(this_cstr, cstr, len); - } - assert(InvariantOk()); -} - -STRING::STRING(const char *data, int length) { - if (data == nullptr) { - // Empty STRINGs contain just the "\0". - memcpy(AllocData(1, kMinCapacity), "", 1); - } else { - char* this_cstr = AllocData(length + 1, length + 1); - memcpy(this_cstr, data, length); - this_cstr[length] = '\0'; - } -} - -STRING::~STRING() { - DiscardData(); -} - // TODO(rays) Change all callers to use TFile and remove the old functions. // Writes to the given file. Returns false in case of error. bool STRING::Serialize(FILE* fp) const { uint32_t len = length(); return tesseract::Serialize(fp, &len) && - tesseract::Serialize(fp, GetCStr(), len); + tesseract::Serialize(fp, c_str(), len); } + // Writes to the given file. Returns false in case of error. bool STRING::Serialize(TFile* fp) const { uint32_t len = length(); return fp->Serialize(&len) && - fp->Serialize(GetCStr(), len); + fp->Serialize(c_str(), len); } + // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool STRING::DeSerialize(bool swap, FILE* fp) { @@ -167,15 +60,16 @@ bool STRING::DeSerialize(bool swap, FILE* fp) { // Arbitrarily limit the number of characters to protect against bad data. if (len > UINT16_MAX) return false; truncate_at(len); - return tesseract::DeSerialize(fp, GetCStr(), len); + return tesseract::DeSerialize(fp, data(), len); } + // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool STRING::DeSerialize(TFile* fp) { uint32_t len; if (!fp->DeSerialize(&len)) return false; truncate_at(len); - return fp->DeSerialize(GetCStr(), len); + return fp->DeSerialize(data(), len); } // As DeSerialize, but only seeks past the data - hence a static method. @@ -186,98 +80,13 @@ bool STRING::SkipDeSerialize(TFile* fp) { } bool STRING::contains(const char c) const { - return (c != '\0') && (strchr (GetCStr(), c) != nullptr); + return (c != '\0') && (strchr (c_str(), c) != nullptr); } -int32_t STRING::length() const { - FixHeader(); - return GetHeader()->used_ - 1; -} - -const char* STRING::c_str() const { - const STRING_HEADER* header = GetHeader(); - if (!header || header->used_ == 0) - return nullptr; - - // mark header length unreliable because tesseract might - // cast away the const and mutate the string directly. - header->used_ = -1; - return GetCStr(); -} - -/****** - * The STRING_IS_PROTECTED interface adds additional support to migrate - * code that needs to modify the STRING in ways not otherwise supported - * without violating encapsulation. - * - * Also makes the [] operator return a const so it is immutable - */ -#if STRING_IS_PROTECTED -const char& STRING::operator[](int32_t index) const { - return GetCStr()[index]; -} - -void STRING::insert_range(int32_t index, const char* str, int len) { - // if index is outside current range, then also grow size of string - // to accmodate the requested range. - STRING_HEADER* this_header = GetHeader(); - int used = this_header->used_; - if (index > used) - used = index; - - char* this_cstr = ensure_cstr(used + len + 1); - if (index < used) { - // move existing string from index to '\0' inclusive. - memmove(this_cstr + index + len, - this_cstr + index, - this_header->used_ - index); - } else if (len > 0) { - // We are going to overwrite previous null terminator, so write the new one. - this_cstr[this_header->used_ + len - 1] = '\0'; - - // If the old header did not have the terminator, - // then we need to account for it now that we've added it. - // Otherwise it was already accounted for; we just moved it. - if (this_header->used_ == 0) - ++this_header->used_; - } - - // Write new string to index. - // The string is already terminated from the conditions above. - memcpy(this_cstr + index, str, len); - this_header->used_ += len; - - assert(InvariantOk()); -} - -void STRING::erase_range(int32_t index, int len) { - char* this_cstr = GetCStr(); - STRING_HEADER* this_header = GetHeader(); - - memcpy(this_cstr+index, this_cstr+index+len, - this_header->used_ - index - len); - this_header->used_ -= len; - assert(InvariantOk()); -} - -#else void STRING::truncate_at(int32_t index) { - ASSERT_HOST(index >= 0); - FixHeader(); - char* this_cstr = ensure_cstr(index + 1); - this_cstr[index] = '\0'; - GetHeader()->used_ = index + 1; - assert(InvariantOk()); + resize(index); } -char& STRING::operator[](int32_t index) const { - // Code is casting away this const and mutating the string, - // so mark used_ as -1 to flag it unreliable. - GetHeader()->used_ = -1; - return (const_cast(GetCStr()))[index]; -} -#endif - void STRING::split(const char c, GenericVector *splited) { int start_index = 0; const int len = length(); @@ -285,7 +94,7 @@ void STRING::split(const char c, GenericVector *splited) { if ((*this)[i] == c) { if (i != start_index) { (*this)[i] = '\0'; - splited->push_back(STRING(GetCStr() + start_index, i - start_index)); + splited->push_back(STRING(c_str() + start_index, i - start_index)); (*this)[i] = c; } start_index = i + 1; @@ -293,86 +102,10 @@ void STRING::split(const char c, GenericVector *splited) { } if (len != start_index) { - splited->push_back(STRING(GetCStr() + start_index, len - start_index)); + splited->push_back(STRING(c_str() + start_index, len - start_index)); } } -bool STRING::operator==(const STRING& str) const { - FixHeader(); - str.FixHeader(); - const STRING_HEADER* str_header = str.GetHeader(); - const STRING_HEADER* this_header = GetHeader(); - const int this_used = this_header->used_; - const int str_used = str_header->used_; - - return (this_used == str_used) - && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0); -} - -bool STRING::operator!=(const STRING& str) const { - FixHeader(); - str.FixHeader(); - const STRING_HEADER* str_header = str.GetHeader(); - const STRING_HEADER* this_header = GetHeader(); - const int this_used = this_header->used_; - const int str_used = str_header->used_; - - return (this_used != str_used) - || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0); -} - -bool STRING::operator!=(const char* cstr) const { - FixHeader(); - const STRING_HEADER* this_header = GetHeader(); - - if (cstr == nullptr) - return this_header->used_ > 1; // either '\0' or nullptr - else { - const int32_t length = strlen(cstr) + 1; - return (this_header->used_ != length) - || (memcmp(GetCStr(), cstr, length) != 0); - } -} - -STRING& STRING::operator=(const STRING& str) { - str.FixHeader(); - const STRING_HEADER* str_header = str.GetHeader(); - const int str_used = str_header->used_; - - GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data - char* this_cstr = ensure_cstr(str_used); - STRING_HEADER* this_header = GetHeader(); - - memcpy(this_cstr, str.GetCStr(), str_used); - this_header->used_ = str_used; - - assert(InvariantOk()); - return *this; -} - -STRING & STRING::operator+=(const STRING& str) { - FixHeader(); - str.FixHeader(); - const STRING_HEADER* str_header = str.GetHeader(); - const char* str_cstr = str.GetCStr(); - const int str_used = str_header->used_; - const int this_used = GetHeader()->used_; - char* this_cstr = ensure_cstr(this_used + str_used); - - STRING_HEADER* this_header = GetHeader(); // after ensure for realloc - - if (this_used > 1) { - memcpy(this_cstr + this_used - 1, str_cstr, str_used); - this_header->used_ += str_used - 1; // overwrite '\0' - } else { - memcpy(this_cstr, str_cstr, str_used); - this_header->used_ = str_used; - } - - assert(InvariantOk()); - return *this; -} - void STRING::add_str_int(const char* str, int number) { if (str != nullptr) *this += str; @@ -382,6 +115,7 @@ void STRING::add_str_int(const char* str, int number) { num_buffer[kMaxIntSize - 1] = '\0'; *this += num_buffer; } + // Appends the given string and double (as a %.8g) to this. void STRING::add_str_double(const char* str, double number) { if (str != nullptr) @@ -395,112 +129,4 @@ void STRING::add_str_double(const char* str, double number) { *this += stream.str().c_str(); } -STRING & STRING::operator=(const char* cstr) { - STRING_HEADER* this_header = GetHeader(); - if (cstr) { - const int len = strlen(cstr) + 1; - - this_header->used_ = 0; // don't bother copying data if need to realloc - char* this_cstr = ensure_cstr(len); - this_header = GetHeader(); // for realloc - memcpy(this_cstr, cstr, len); - this_header->used_ = len; - } else { - // Reallocate to same state as default constructor. - DiscardData(); - // Empty STRINGs contain just the "\0". - memcpy(AllocData(1, kMinCapacity), "", 1); - } - - assert(InvariantOk()); - return *this; -} - -void STRING::assign(const char *cstr, int len) { - STRING_HEADER* this_header = GetHeader(); - this_header->used_ = 0; // don't bother copying data if need to realloc - char* this_cstr = ensure_cstr(len + 1); // +1 for '\0' - - this_header = GetHeader(); // for realloc - memcpy(this_cstr, cstr, len); - this_cstr[len] = '\0'; - this_header->used_ = len + 1; - - assert(InvariantOk()); -} - -STRING STRING::operator+(const STRING& str) const { - STRING result(*this); - result += str; - - assert(InvariantOk()); - return result; -} - - -STRING STRING::operator+(const char ch) const { - STRING result; - FixHeader(); - const STRING_HEADER* this_header = GetHeader(); - const int this_used = this_header->used_; - char* result_cstr = result.ensure_cstr(this_used + 1); - STRING_HEADER* result_header = result.GetHeader(); - const int result_used = result_header->used_; - - // copies '\0' but we'll overwrite that - memcpy(result_cstr, GetCStr(), this_used); - result_cstr[result_used] = ch; // overwrite old '\0' - result_cstr[result_used + 1] = '\0'; // append on '\0' - ++result_header->used_; - - assert(InvariantOk()); - return result; -} - - -STRING& STRING::operator+=(const char *str) { - if (!str || !*str) // empty string has no effect - return *this; - - FixHeader(); - const int len = strlen(str) + 1; - const int this_used = GetHeader()->used_; - char* this_cstr = ensure_cstr(this_used + len); - STRING_HEADER* this_header = GetHeader(); // after ensure for realloc - - // if we had non-empty string then append overwriting old '\0' - // otherwise replace - if (this_used > 0) { - memcpy(this_cstr + this_used - 1, str, len); - this_header->used_ += len - 1; - } else { - memcpy(this_cstr, str, len); - this_header->used_ = len; - } - - assert(InvariantOk()); - return *this; -} - - -STRING& STRING::operator+=(const char ch) { - if (ch == '\0') - return *this; - - FixHeader(); - int this_used = GetHeader()->used_; - char* this_cstr = ensure_cstr(this_used + 1); - STRING_HEADER* this_header = GetHeader(); - - if (this_used > 0) - --this_used; // undo old empty null if there was one - - this_cstr[this_used++] = ch; // append ch to end - this_cstr[this_used++] = '\0'; // append '\0' after ch - this_header->used_ = this_used; - - assert(InvariantOk()); - return *this; -} - } // namespace tesseract diff --git a/src/wordrec/language_model.cpp b/src/wordrec/language_model.cpp index 5868c4b96..730205458 100644 --- a/src/wordrec/language_model.cpp +++ b/src/wordrec/language_model.cpp @@ -151,7 +151,7 @@ void LanguageModel::InitForWord(const WERD_CHOICE *prev_word, // Fill prev_word_str_ with the last language_model_ngram_order // unichars from prev_word. if (language_model_ngram_on) { - if (prev_word != nullptr && prev_word->unichar_string() != nullptr) { + if (prev_word != nullptr && !prev_word->unichar_string().empty()) { prev_word_str_ = prev_word->unichar_string(); if (language_model_ngram_space_delimited_language) prev_word_str_ += ' '; } else {