tesseract/ccutil/strngs.cpp

513 lines
15 KiB
C++
Raw Normal View History

/**********************************************************************
* File: strngs.c (Formerly strings.c)
* Description: STRING class functions.
* Author: Ray Smith
* Created: Fri Feb 15 09:13:30 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h" //precompiled headers
#include "tprintf.h"
#include "strngs.h"
/**********************************************************************
* DataCache for reducing initial allocations, such as the default
* constructor. The memory in this cache is not special, it is just
* held locally rather than freeing. Only blocks with the default
* capacity are considered for the cache.
*
* In practice it does not appear that this cache grows very big,
* so even 2-4 elements are probably sufficient to realize most
* gains.
*
* The cache is maintained globally with a global destructor to
* avoid memory leaks being reported on exit.
**********************************************************************/
// kDataCacheSize bounds the cache of the most recently freed minimum-sized
// buffers, which are kept around for cheap recycling.
const int kDataCacheSize = 8; // max number of buffers cached
// Size of the buffer needed to hold the decimal representation of the
// longest possible int (in 64 bits: a '-' sign followed by up to 20 digits),
// plus the terminating '\0'.
const int kMaxIntSize = 22;
#if 1
#define CHECK_INVARIANT(s) // EMPTY
#else
// Debug-only consistency check between a recorded byte count and the
// C string it describes: a count of 0 must come with a NULL pointer, any
// other count must equal strlen(s) + 1. Aborts the process on a mismatch.
static void check_used_(int len, const char *s) {
  const bool consistent = (len == 0) ? (s == NULL)
                                     : (len == (strlen(s) + 1));
  if (consistent)
    return;
  abort();
}
#define CHECK_INVARIANT(s) check_used_(s->GetHeader()->used_, s->string())
#endif
// Recycled buffers are kept in a class so that whatever is still cached is
// released by the destructor on exit.
class DataCache {
 public:
  DataCache() {
    top_ = 0;
  }
  ~DataCache() {
    // Pop one entry at a time so that top_ finishes at 0 (an empty but still
    // consistent cache) rather than -1. Objects with static storage in other
    // translation units may be destroyed after this one; with top_ == -1 a
    // late free() would store to stack_[-1] and a late alloc() would read
    // stack_[-2].
    while (top_ > 0)
      free_string((char *)stack_[--top_]);
  }
  // Takes a buffer out of this cache.
  // Returns NULL if there are no cached buffers.
  // Buffers handed out here can later be released with free_string.
  void* alloc() {
    if (top_ == 0)
      return NULL;
    return stack_[--top_];
  }
  // Disposes of p, either by caching it on the stack of pointers or, if
  // there isn't space left to cache it, by releasing it with free_string.
  // p should have capacity kMinCapacity.
  void free(void* p) {
    if (top_ == kDataCacheSize)
      free_string((char *)p);
    else
      stack_[top_++] = p;
  }
  // Stack of discarded but not-yet freed pointers.
  void* stack_[kDataCacheSize];
  // Top of stack, points to element after last cached pointer
  int top_;
};
// The single file-local cache instance; STRING::AllocData and
// STRING::DiscardData go through it for buffers of capacity kMinCapacity.
static DataCache MinCapacityDataCache;
/**********************************************************************
* STRING_HEADER provides metadata about the allocated buffer,
* including total capacity and how much used (strlen with '\0').
*
* The implementation hides this header at the start of the data
* buffer and appends the string on the end to keep sizeof(STRING)
* unchanged from earlier versions so serialization is not affected.
*
* The collection of MACROS provide different implementations depending
* on whether the string keeps track of its strlen or not so that this
* feature can be added in later when consumers don't modify the string
**********************************************************************/
// Smallest string capacity to allocate by default. Buffers of exactly this
// capacity are the only ones recycled through MinCapacityDataCache.
const int kMinCapacity = 16;
char* STRING::AllocData(int used, int capacity) {
if ((capacity != kMinCapacity)
|| ((data_ = (STRING_HEADER *)MinCapacityDataCache.alloc()) == NULL))
data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
// header is the metadata for this memory block
STRING_HEADER* header = GetHeader();
header->capacity_ = capacity;
header->used_ = used;
return GetCStr();
}
// Gives up the buffer bound to data_. Buffers whose recorded capacity is
// kMinCapacity are offered to the recycle cache; all others are released
// with free_string.
void STRING::DiscardData() {
  const bool recyclable = (GetHeader()->capacity_ == kMinCapacity);
  if (!recyclable) {
    free_string((char *)data_);
    return;
  }
  MinCapacityDataCache.free(data_);
}
// Private helper: makes sure the buffer can hold at least min_capacity
// bytes of character data and returns a pointer to that character data.
// If the current capacity suffices nothing changes. Otherwise a larger
// buffer is allocated, the first used_ bytes are copied across, capacity_
// and used_ are carried over, and the old buffer is discarded.
// Callers that do not need the old contents preserved may set used_ to 0
// beforehand to skip the copy.
char* STRING::ensure_cstr(inT32 min_capacity) {
  STRING_HEADER* orig_header = GetHeader();
  if (min_capacity <= orig_header->capacity_)
    return ((char *)this->data_) + sizeof(STRING_HEADER);
  // used_ is about to be used as a memcpy length. A negative value (the
  // "unreliable" marker written by string() and operator[]) would be
  // converted to an enormous size, so resolve it first. FixHeader leaves
  // values >= 0 untouched.
  FixHeader();
  // if we are going to grow bigger, then double our existing
  // size, but if that still is not big enough then keep the
  // requested capacity
  if (min_capacity < 2 * orig_header->capacity_)
    min_capacity = 2 * orig_header->capacity_;
  int alloc = sizeof(STRING_HEADER) + min_capacity;
  STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
  memcpy(&new_header[1], GetCStr(), orig_header->used_);
  new_header->capacity_ = min_capacity;
  new_header->used_ = orig_header->used_;
  // free old memory, then rebind to new memory
  DiscardData();
  data_ = new_header;
  CHECK_INVARIANT(this);
  return ((char *)data_) + sizeof(STRING_HEADER);
}
// This is const, but is modifying a mutable field
// this way it can be used on const or non-const instances.
void STRING::FixHeader() const {
const STRING_HEADER* header = GetHeader();
if (header->used_ < 0)
header->used_ = strlen(GetCStr()) + 1;
}
// Default constructor: a kMinCapacity buffer with nothing in use.
STRING::STRING() {
  // A used count of 0 stands for the old NULL string -- it does not even
  // hold a '\0'.
  const int kNothingUsed = 0;
  AllocData(kNothingUsed, kMinCapacity);
}
// Copy constructor: allocates a buffer sized exactly to the source's used
// byte count and copies those bytes across.
STRING::STRING(const STRING& str) {
  str.FixHeader();  // make sure the source's used_ is trustworthy
  const int bytes = str.GetHeader()->used_;
  char* dest = AllocData(bytes, bytes);
  memcpy(dest, str.GetCStr(), bytes);
  CHECK_INVARIANT(this);
}
// Constructs from a C string. A non-NULL argument is copied together with
// its terminator into a buffer of exactly that size; a NULL argument
// produces a zero-capacity buffer with nothing in use.
STRING::STRING(const char* cstr) {
  if (cstr != NULL) {
    const int bytes = strlen(cstr) + 1;  // characters plus '\0'
    char* dest = AllocData(bytes, bytes);
    memcpy(dest, cstr, bytes);
  } else {
    AllocData(0, 0);
  }
  CHECK_INVARIANT(this);
}
// Destructor: hands the buffer back through DiscardData().
STRING::~STRING() {
  this->DiscardData();
}
// Returns true if c occurs in the string. The terminator never counts as
// contained, and a string with used_ == 0 contains nothing: such a buffer
// holds no '\0' (and may have zero capacity), so it must not be scanned
// with strchr.
BOOL8 STRING::contains(const char c) const {
  return (c != '\0') && (GetHeader()->used_ != 0) &&
         (strchr(GetCStr(), c) != NULL);
}
// Returns the number of characters in the string, excluding the '\0'.
// A string with used_ == 0 (no terminator stored at all) reports 0 rather
// than the -1 that used_ - 1 would yield.
inT32 STRING::length() const {
  FixHeader();
  const int used = GetHeader()->used_;
  return (used > 0) ? used - 1 : 0;
}
const char* STRING::string() const {
const STRING_HEADER* header = GetHeader();
if (header->used_ == 0)
return NULL;
// mark header length unreliable because tesseract might
// cast away the const and mutate the string directly.
header->used_ = -1;
return GetCStr();
}
/******
* The STRING_IS_PROTECTED interface adds additional support to migrate
* code that needs to modify the STRING in ways not otherwise supported
* without violating encapsulation.
*
* Also makes the [] operator return a const so it is immutable
*/
#if STRING_IS_PROTECTED
// Read-only element access; no bounds checking is performed.
const char& STRING::operator[](inT32 index) const {
  const char* chars = GetCStr();
  return chars[index];
}
// Inserts len bytes from str at position index.
// When index lies inside the stored bytes, the tail (up to and including
// the '\0') is shifted right by len and str is copied into the gap.
// When index is at or beyond the stored bytes and len > 0, str is copied
// to index and a fresh '\0' is written directly after it; any bytes
// between the previous end and index are left unspecified.
void STRING::insert_range(inT32 index, const char* str, int len) {
  // Resolve an unreliable (negative) used_ before doing arithmetic with it.
  FixHeader();
  // Snapshot used_ now: ensure_cstr() may move the buffer, after which a
  // header pointer obtained earlier would point at freed memory.
  const int old_used = GetHeader()->used_;
  // if index is outside current range, then also grow size of string
  // to accommodate the requested range.
  int used = old_used;
  if (index > used)
    used = index;
  char* this_cstr = ensure_cstr(used + len + 1);
  STRING_HEADER* this_header = GetHeader();  // re-read after possible realloc
  if (index < old_used) {
    // move existing string from index to '\0' inclusive.
    memmove(this_cstr + index + len,
            this_cstr + index,
            old_used - index);
    memcpy(this_cstr + index, str, len);
    this_header->used_ = old_used + len;
  } else if (len > 0) {
    // Nothing to shift. Copy first, then terminate after the copied bytes
    // so the terminator cannot be overwritten by the copy itself.
    memcpy(this_cstr + index, str, len);
    this_cstr[index + len] = '\0';
    this_header->used_ = index + len + 1;
  }
  CHECK_INVARIANT(this);
}
// Removes len bytes starting at index by sliding the tail of the string
// (including the '\0') left over them. No bounds checking is performed.
void STRING::erase_range(inT32 index, int len) {
  // used_ feeds the length computation below, so it must not be the
  // negative "unreliable" marker.
  FixHeader();
  char* this_cstr = GetCStr();
  STRING_HEADER* this_header = GetHeader();
  // Source and destination overlap whenever the tail is longer than len,
  // so memmove is required; memcpy is undefined for overlapping ranges.
  memmove(this_cstr + index, this_cstr + index + len,
          this_header->used_ - index - len);
  this_header->used_ -= len;
  CHECK_INVARIANT(this);
}
// Cuts the string so that it holds exactly index characters followed by
// the '\0'.
void STRING::truncate_at(inT32 index) {
  // ensure_cstr may have to copy used_ bytes, so used_ must be reliable.
  FixHeader();
  // The terminator is written at [index], which needs index + 1 bytes.
  char* this_cstr = ensure_cstr(index + 1);
  this_cstr[index] = '\0';
  // used_ counts the terminator as well as the characters.
  GetHeader()->used_ = index + 1;
  CHECK_INVARIANT(this);
}
#else
// Mutable element access on a const object; no bounds checking.
char& STRING::operator[](inT32 index) const {
  // The caller gets a writable reference despite the const, so the
  // recorded length can no longer be trusted: flag used_ with -1.
  const STRING_HEADER* hdr = GetHeader();
  hdr->used_ = -1;
  char* chars = (char *)GetCStr();
  return chars[index];
}
#endif
// Equality: both strings must use the same number of bytes and those bytes
// must compare equal.
BOOL8 STRING::operator==(const STRING& str) const {
  FixHeader();
  str.FixHeader();
  const int lhs_bytes = GetHeader()->used_;
  const int rhs_bytes = str.GetHeader()->used_;
  if (lhs_bytes != rhs_bytes)
    return false;
  return memcmp(GetCStr(), str.GetCStr(), lhs_bytes) == 0;
}
// Inequality: true when the used byte counts differ or, with equal counts,
// when the bytes themselves differ.
BOOL8 STRING::operator!=(const STRING& str) const {
  FixHeader();
  str.FixHeader();
  const int lhs_bytes = GetHeader()->used_;
  const int rhs_bytes = str.GetHeader()->used_;
  if (lhs_bytes != rhs_bytes)
    return true;
  return memcmp(GetCStr(), str.GetCStr(), lhs_bytes) != 0;
}
// Inequality against a C string. A NULL argument differs only from a
// string holding at least one character (used_ > 1); otherwise byte count
// and contents, terminator included, are compared.
BOOL8 STRING::operator!=(const char* cstr) const {
  FixHeader();
  const int this_bytes = GetHeader()->used_;
  if (cstr == NULL)
    return this_bytes > 1;  // either '\0' or NULL
  const inT32 cstr_bytes = strlen(cstr) + 1;
  if (this_bytes != cstr_bytes)
    return true;
  return memcmp(GetCStr(), cstr, cstr_bytes) != 0;
}
// Copy assignment: replaces the contents with a copy of str's used bytes,
// growing the buffer if required.
STRING& STRING::operator=(const STRING& str) {
  // Self-assignment is a no-op. Without this guard the memcpy below would
  // be handed identical source and destination ranges.
  if (this == &str)
    return *this;
  str.FixHeader();
  const STRING_HEADER* str_header = str.GetHeader();
  int str_used = str_header->used_;
  GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
  char* this_cstr = ensure_cstr(str_used);
  STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
  memcpy(this_cstr, str.GetCStr(), str_used);
  this_header->used_ = str_used;
  CHECK_INVARIANT(this);
  return *this;
}
// Appends str to this string.
// A str with nothing in use (used_ == 0) contributes nothing and leaves
// this string untouched. Appending a string to itself is supported.
STRING & STRING::operator+=(const STRING& str) {
  FixHeader();
  str.FixHeader();
  const int str_used = str.GetHeader()->used_;
  // With str_used == 0 the arithmetic below would shrink used_ by one and
  // so drop the last character of a non-empty string.
  if (str_used == 0)
    return *this;
  const int this_used = GetHeader()->used_;
  char* this_cstr = ensure_cstr(this_used + str_used);
  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
  // Fetch the source only after ensure_cstr: when str is *this the buffer
  // may just have been reallocated.
  const char* str_cstr = str.GetCStr();
  // memmove rather than memcpy: on self-append the source and destination
  // ranges overlap at the old terminator.
  if (this_used > 1) {
    memmove(this_cstr + this_used - 1, str_cstr, str_used);
    this_header->used_ = this_used + str_used - 1; // overwrite '\0'
  } else {
    memmove(this_cstr, str_cstr, str_used);
    this_header->used_ = str_used;
  }
  CHECK_INVARIANT(this);
  return *this;
}
// Appends the given string and int (as a %d) to this.
// += cannot be used for ints as there as a char += operator that would
// be ambiguous, and ints usually need a string before or between them
// anyway.
void STRING::add_str_int(const char* str, int number) {
*this += str;
// Allow space for the maximum possible length of inT64.
char num_buffer[kMaxIntSize];
num_buffer[kMaxIntSize - 1] = '\0';
snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
*this += num_buffer;
}
void STRING::prep_serialise() {
// WARNING
// This method should only be called on a shallow bitwise copy
// by the serialise() method (see serialis.h).
FixHeader();
data_ = (STRING_HEADER *)GetHeader()->used_;
}
void STRING::dump(FILE* f) {
FixHeader();
serialise_bytes (f, data_, GetHeader()->used_);
}
// Reads the string contents back from f. On entry data_ is interpreted as
// a byte count (see prep_serialise) and passed to de_serialise_bytes; the
// bytes read are treated as a C string, copied into a freshly allocated
// buffer, and the temporary is released with free_mem.
void STRING::de_dump(FILE* f) {
  fprintf(stderr, "de_dump\n");
  char* loaded = (char *)de_serialise_bytes(f, (ptrdiff_t)data_);
  const int bytes = strlen(loaded) + 1;  // characters plus '\0'
  char* dest = AllocData(bytes, bytes);
  memcpy(dest, loaded, bytes);
  GetHeader()->used_ = bytes;
  free_mem(loaded);
  CHECK_INVARIANT(this);
}
// Assigns from a C string. A non-NULL argument is copied together with
// its terminator, growing the buffer if needed; the argument may point
// into this string's own buffer. A NULL argument resets this string to a
// zero-capacity buffer with nothing in use.
STRING & STRING::operator=(const char* cstr) {
  STRING_HEADER* this_header = GetHeader();
  if (cstr) {
    int len = strlen(cstr) + 1;
    this_header->used_ = 0; // don't bother copying data if need to realloc
    char* this_cstr = ensure_cstr(len);
    this_header = GetHeader(); // for realloc
    // memmove, not memcpy: when cstr points into our own buffer (e.g.
    // s = s.string() + n) no reallocation happens and the source and
    // destination ranges overlap.
    memmove(this_cstr, cstr, len);
    this_header->used_ = len;
  }
  else {
    // Reallocate to zero capacity buffer, consistent with the corresponding
    // copy constructor.
    DiscardData();
    AllocData(0, 0);
  }
  CHECK_INVARIANT(this);
  return *this;
}
// Concatenation: returns a copy of this string with str appended.
STRING STRING::operator+(const STRING& str) const {
  STRING sum = *this;
  sum += str;
  CHECK_INVARIANT(this);
  return sum;
}
// Returns a copy of this string with ch appended.
// Adding '\0' yields a plain copy, matching operator+=(char), which
// ignores '\0'.
STRING STRING::operator+(const char ch) const {
  if (ch == '\0')
    return STRING(*this);
  FixHeader();
  const int this_used = GetHeader()->used_;
  // Characters held by this string, not counting a terminator (which is
  // absent altogether when used_ == 0).
  const int this_len = (this_used > 0) ? this_used - 1 : 0;
  STRING result;
  // Room for the characters, ch and the new '\0'.
  char* result_cstr = result.ensure_cstr(this_len + 2);
  memcpy(result_cstr, GetCStr(), this_len);
  // The write position comes from this string's length; the freshly
  // constructed result has nothing in use yet.
  result_cstr[this_len] = ch;
  result_cstr[this_len + 1] = '\0';
  result.GetHeader()->used_ = this_len + 2;
  CHECK_INVARIANT(this);
  return result;
}
// Appends a C string. NULL and "" leave this string unchanged.
STRING& STRING::operator+=(const char *str) {
  if (str == NULL || *str == '\0')  // empty string has no effect
    return *this;
  FixHeader();
  const int incoming = strlen(str) + 1;  // characters plus '\0'
  const int old_used = GetHeader()->used_;
  char* buffer = ensure_cstr(old_used + incoming);
  // With a terminator already stored the copy starts on top of it;
  // otherwise it starts at the beginning of the buffer.
  const int write_at = (old_used > 0) ? old_used - 1 : 0;
  memcpy(buffer + write_at, str, incoming);
  // Fetch the header only now, since ensure_cstr may have reallocated.
  GetHeader()->used_ = write_at + incoming;
  CHECK_INVARIANT(this);
  return *this;
}
// Appends a single character. '\0' is ignored.
STRING& STRING::operator+=(const char ch) {
  if (ch == '\0')
    return *this;
  FixHeader();
  const int this_used = GetHeader()->used_;
  // Characters currently held, not counting a terminator (which is absent
  // altogether when used_ == 0).
  const int this_len = (this_used > 0) ? this_used - 1 : 0;
  // Two more bytes are always written: ch and the new '\0'. Requesting
  // this_used + 1 would be one byte short for a string with used_ == 0,
  // which can sit on a zero-capacity buffer.
  char* this_cstr = ensure_cstr(this_len + 2);
  STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
  this_cstr[this_len] = ch;        // append ch over the old '\0', if any
  this_cstr[this_len + 1] = '\0';  // append '\0' after ch
  this_header->used_ = this_len + 2;
  CHECK_INVARIANT(this);
  return *this;
}