mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
023e1b340e
* api: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * cutil: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * training: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract data types by POSIX data types Now all Tesseract data types which are no longer needed can be removed from ccutil/host.h. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Remove the macros which are now unused from ccutil/host.h. Remove also the obsolete history comments. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * Fix build error caused by ambiguous ClipToRange Error message vom Appveyor CI: C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj] C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj] c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int' Signed-off-by: Stefan Weil <sw@weilnetz.de> * unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de>
508 lines
15 KiB
C++
508 lines
15 KiB
C++
/**********************************************************************
|
|
* File: strngs.cpp (Formerly strings.c)
|
|
* Description: STRING class functions.
|
|
* Author: Ray Smith
|
|
* Created: Fri Feb 15 09:13:30 GMT 1991
|
|
*
|
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#include "strngs.h"
|
|
|
|
#include <assert.h>
|
|
|
|
#include "genericvector.h"
|
|
#include "helpers.h"
|
|
#include "serialis.h"
|
|
#include "tprintf.h"
|
|
|
|
using tesseract::TFile;
|
|
|
|
// Buffer size able to hold the decimal text of any 64-bit integer:
// a '-' sign, up to 20 digits and the terminating nul.
static const int kMaxIntSize = 22;
// Buffer size able to hold a double printed with %.8g, whose longest form
// is -1.2345678e+999 (15 characters) plus the terminating nul = 16.
static const int kMaxDoubleSize = 16;
|
|
|
|
/**********************************************************************
|
|
* STRING_HEADER provides metadata about the allocated buffer,
|
|
* including total capacity and how much used (strlen with '\0').
|
|
*
|
|
* The implementation hides this header at the start of the data
|
|
* buffer and appends the string on the end to keep sizeof(STRING)
|
|
* unchanged from earlier versions so serialization is not affected.
|
|
*
|
|
* The collection of MACROS provide different implementations depending
|
|
* on whether the string keeps track of its strlen or not so that this
|
|
* feature can be added in later when consumers don't modify the string
|
|
**********************************************************************/
|
|
|
|
// Capacity, in bytes of character storage, given to a STRING that is
// created or reset to the empty state.
static const int kMinCapacity = 16;
|
|
|
|
char* STRING::AllocData(int used, int capacity) {
|
|
data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
|
|
|
|
// header is the metadata for this memory block
|
|
STRING_HEADER* header = GetHeader();
|
|
header->capacity_ = capacity;
|
|
header->used_ = used;
|
|
return GetCStr();
|
|
}
|
|
|
|
// Hands the block referenced by data_ (header plus characters) back to the
// string allocator. data_ itself is left pointing at the released block.
void STRING::DiscardData() {
  char* const raw_block = reinterpret_cast<char*>(data_);
  free_string(raw_block);
}
|
|
|
|
// This is a private method; ensure FixHeader is called (or used_ is well defined)
|
|
// beforehand
|
|
char* STRING::ensure_cstr(int32_t min_capacity) {
|
|
STRING_HEADER* orig_header = GetHeader();
|
|
if (min_capacity <= orig_header->capacity_)
|
|
return ((char *)this->data_) + sizeof(STRING_HEADER);
|
|
|
|
// if we are going to grow bigger, than double our existing
|
|
// size, but if that still is not big enough then keep the
|
|
// requested capacity
|
|
if (min_capacity < 2 * orig_header->capacity_)
|
|
min_capacity = 2 * orig_header->capacity_;
|
|
|
|
int alloc = sizeof(STRING_HEADER) + min_capacity;
|
|
STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
|
|
|
|
memcpy(&new_header[1], GetCStr(), orig_header->used_);
|
|
new_header->capacity_ = min_capacity;
|
|
new_header->used_ = orig_header->used_;
|
|
|
|
// free old memory, then rebind to new memory
|
|
DiscardData();
|
|
data_ = new_header;
|
|
|
|
assert(InvariantOk());
|
|
return ((char *)data_) + sizeof(STRING_HEADER);
|
|
}
|
|
|
|
// This is const, but is modifying a mutable field
|
|
// this way it can be used on const or non-const instances.
|
|
void STRING::FixHeader() const {
|
|
const STRING_HEADER* header = GetHeader();
|
|
if (header->used_ < 0)
|
|
header->used_ = strlen(GetCStr()) + 1;
|
|
}
|
|
|
|
|
|
// Default constructor: builds an empty STRING with kMinCapacity bytes of
// storage, one of which (the terminating '\0') is in use.
STRING::STRING() {
  char* text = AllocData(1, kMinCapacity);
  // Empty STRINGs contain just the "\0".
  memcpy(text, "", 1);
}
|
|
|
|
// Copy constructor: duplicates the used bytes of |str| (terminator
// included) into a new buffer whose capacity is exactly that many bytes.
STRING::STRING(const STRING& str) {
  str.FixHeader();  // Make sure str's used_ matches its real length.
  const int bytes = str.GetHeader()->used_;
  char* text = AllocData(bytes, bytes);
  memcpy(text, str.GetCStr(), bytes);
  assert(InvariantOk());
}
|
|
|
|
// Constructs from a nul-terminated C string. A NULL pointer yields an empty
// STRING with the default minimum capacity.
STRING::STRING(const char* cstr) {
  if (cstr != NULL) {
    // Copy the characters together with their terminator.
    const int bytes = strlen(cstr) + 1;
    char* text = AllocData(bytes, bytes);
    memcpy(text, cstr, bytes);
  } else {
    // Empty STRINGs contain just the "\0".
    memcpy(AllocData(1, kMinCapacity), "", 1);
  }
  assert(InvariantOk());
}
|
|
|
|
// Constructs from the first |length| bytes of |data| and appends a
// terminating '\0'. A NULL |data| yields an empty STRING with the default
// minimum capacity.
STRING::STRING(const char *data, int length) {
  if (data != NULL) {
    const int bytes = length + 1;  // Room for the added terminator.
    char* text = AllocData(bytes, bytes);
    memcpy(text, data, length);
    text[length] = '\0';
  } else {
    // Empty STRINGs contain just the "\0".
    memcpy(AllocData(1, kMinCapacity), "", 1);
  }
}
|
|
|
|
// Destructor: releases the buffer referenced by data_.
STRING::~STRING() {
  this->DiscardData();
}
|
|
|
|
// TODO(rays) Change all callers to use TFile and remove the old functions.
|
|
// Writes to the given file. Returns false in case of error.
|
|
bool STRING::Serialize(FILE* fp) const {
|
|
int32_t len = length();
|
|
if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
|
|
if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
|
|
return true;
|
|
}
|
|
// Writes to the given file. Returns false in case of error.
|
|
bool STRING::Serialize(TFile* fp) const {
|
|
int32_t len = length();
|
|
if (fp->FWrite(&len, sizeof(len), 1) != 1) return false;
|
|
if (fp->FWrite(GetCStr(), 1, len) != len) return false;
|
|
return true;
|
|
}
|
|
// Reads from the given file. Returns false in case of error.
|
|
// If swap is true, assumes a big/little-endian swap is needed.
|
|
bool STRING::DeSerialize(bool swap, FILE* fp) {
|
|
int32_t len;
|
|
if (fread(&len, sizeof(len), 1, fp) != 1) return false;
|
|
if (swap)
|
|
ReverseN(&len, sizeof(len));
|
|
truncate_at(len);
|
|
if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
|
|
return true;
|
|
}
|
|
// Reads from the given file. Returns false in case of error.
|
|
// If swap is true, assumes a big/little-endian swap is needed.
|
|
bool STRING::DeSerialize(TFile* fp) {
|
|
int32_t len;
|
|
if (fp->FReadEndian(&len, sizeof(len), 1) != 1) return false;
|
|
truncate_at(len);
|
|
if (fp->FRead(GetCStr(), 1, len) != len) return false;
|
|
return true;
|
|
}
|
|
|
|
// As DeSerialize, but only seeks past the data - hence a static method.
|
|
bool STRING::SkipDeSerialize(tesseract::TFile* fp) {
|
|
int32_t len;
|
|
if (fp->FReadEndian(&len, sizeof(len), 1) != 1) return false;
|
|
return fp->FRead(NULL, 1, len) == len;
|
|
}
|
|
|
|
// Returns true if |c| occurs in the string. The terminating '\0' is never
// reported as contained.
BOOL8 STRING::contains(const char c) const {
  if (c == '\0') return false;
  return strchr(GetCStr(), c) != NULL;
}
|
|
|
|
int32_t STRING::length() const {
|
|
FixHeader();
|
|
return GetHeader()->used_ - 1;
|
|
}
|
|
|
|
const char* STRING::string() const {
|
|
const STRING_HEADER* header = GetHeader();
|
|
if (header->used_ == 0)
|
|
return NULL;
|
|
|
|
// mark header length unreliable because tesseract might
|
|
// cast away the const and mutate the string directly.
|
|
header->used_ = -1;
|
|
return GetCStr();
|
|
}
|
|
|
|
const char* STRING::c_str() const {
|
|
return string();
|
|
}
|
|
|
|
/******
|
|
* The STRING_IS_PROTECTED interface adds additional support to migrate
|
|
* code that needs to modify the STRING in ways not otherwise supported
|
|
* without violating encapsulation.
|
|
*
|
|
* Also makes the [] operator return a const so it is immutable
|
|
*/
|
|
#if STRING_IS_PROTECTED
|
|
// Read-only element access used when STRING_IS_PROTECTED is set.
// |index| is not range-checked.
const char& STRING::operator[](int32_t index) const {
  const char* text = GetCStr();
  return text[index];
}
|
|
|
|
void STRING::insert_range(int32_t index, const char* str, int len) {
|
|
// if index is outside current range, then also grow size of string
|
|
// to accmodate the requested range.
|
|
STRING_HEADER* this_header = GetHeader();
|
|
int used = this_header->used_;
|
|
if (index > used)
|
|
used = index;
|
|
|
|
char* this_cstr = ensure_cstr(used + len + 1);
|
|
if (index < used) {
|
|
// move existing string from index to '\0' inclusive.
|
|
memmove(this_cstr + index + len,
|
|
this_cstr + index,
|
|
this_header->used_ - index);
|
|
} else if (len > 0) {
|
|
// We are going to overwrite previous null terminator, so write the new one.
|
|
this_cstr[this_header->used_ + len - 1] = '\0';
|
|
|
|
// If the old header did not have the terminator,
|
|
// then we need to account for it now that we've added it.
|
|
// Otherwise it was already accounted for; we just moved it.
|
|
if (this_header->used_ == 0)
|
|
++this_header->used_;
|
|
}
|
|
|
|
// Write new string to index.
|
|
// The string is already terminated from the conditions above.
|
|
memcpy(this_cstr + index, str, len);
|
|
this_header->used_ += len;
|
|
|
|
assert(InvariantOk());
|
|
}
|
|
|
|
void STRING::erase_range(int32_t index, int len) {
|
|
char* this_cstr = GetCStr();
|
|
STRING_HEADER* this_header = GetHeader();
|
|
|
|
memcpy(this_cstr+index, this_cstr+index+len,
|
|
this_header->used_ - index - len);
|
|
this_header->used_ -= len;
|
|
assert(InvariantOk());
|
|
}
|
|
|
|
#else
|
|
void STRING::truncate_at(int32_t index) {
|
|
ASSERT_HOST(index >= 0);
|
|
FixHeader();
|
|
char* this_cstr = ensure_cstr(index + 1);
|
|
this_cstr[index] = '\0';
|
|
GetHeader()->used_ = index + 1;
|
|
assert(InvariantOk());
|
|
}
|
|
|
|
// Element access that returns a mutable reference even though the method is
// const. |index| is not range-checked.
char& STRING::operator[](int32_t index) const {
  // The caller may write through the returned reference and thereby change
  // the string, so flag used_ as unreliable (-1) up front.
  GetHeader()->used_ = -1;
  char* text = const_cast<char*>(GetCStr());
  return text[index];
}
|
|
#endif
|
|
|
|
void STRING::split(const char c, GenericVector<STRING> *splited) {
|
|
int start_index = 0;
|
|
int len = length();
|
|
for (int i = 0; i < len; i++) {
|
|
if ((*this)[i] == c) {
|
|
if (i != start_index) {
|
|
(*this)[i] = '\0';
|
|
splited->push_back(STRING(GetCStr() + start_index, i - start_index));
|
|
(*this)[i] = c;
|
|
}
|
|
start_index = i + 1;
|
|
}
|
|
}
|
|
|
|
if (len != start_index) {
|
|
splited->push_back(STRING(GetCStr() + start_index, len - start_index));
|
|
}
|
|
}
|
|
|
|
// Returns true when both STRINGs record the same number of used bytes and
// those bytes compare equal.
BOOL8 STRING::operator==(const STRING& str) const {
  FixHeader();
  str.FixHeader();
  const int lhs_bytes = GetHeader()->used_;
  const int rhs_bytes = str.GetHeader()->used_;

  if (lhs_bytes != rhs_bytes) return false;
  return memcmp(GetCStr(), str.GetCStr(), lhs_bytes) == 0;
}
|
|
|
|
// Returns true when the two STRINGs record different numbers of used bytes
// or when those bytes differ.
BOOL8 STRING::operator!=(const STRING& str) const {
  FixHeader();
  str.FixHeader();
  const int lhs_bytes = GetHeader()->used_;
  const int rhs_bytes = str.GetHeader()->used_;

  if (lhs_bytes != rhs_bytes) return true;
  return memcmp(GetCStr(), str.GetCStr(), lhs_bytes) != 0;
}
|
|
|
|
// Compares against a C string. A NULL |cstr| is treated like an empty
// string: the result is true only when this STRING holds more than just its
// terminator.
BOOL8 STRING::operator!=(const char* cstr) const {
  FixHeader();
  const STRING_HEADER* meta = GetHeader();

  if (cstr == NULL) {
    return meta->used_ > 1;  // either '\0' or NULL
  }
  // Compare sizes first (terminator included), then the bytes themselves.
  const int32_t bytes = strlen(cstr) + 1;
  if (meta->used_ != bytes) return true;
  return memcmp(GetCStr(), cstr, bytes) != 0;
}
|
|
|
|
// Copy assignment: replaces the contents with a copy of |str|'s used bytes
// (terminator included), growing the buffer if necessary.
// Returns *this.
STRING& STRING::operator=(const STRING& str) {
  str.FixHeader();
  // Self-assignment: nothing needs copying, and continuing would hand
  // memcpy identical (hence overlapping) source and destination ranges.
  if (this == &str)
    return *this;
  const int str_used = str.GetHeader()->used_;

  GetHeader()->used_ = 0;  // clear since ensure doesn't need to copy data
  char* this_cstr = ensure_cstr(str_used);
  // Re-read the header after ensure_cstr, which may have reallocated.
  STRING_HEADER* this_header = GetHeader();

  memcpy(this_cstr, str.GetCStr(), str_used);
  this_header->used_ = str_used;

  assert(InvariantOk());
  return *this;
}
|
|
|
|
// Appends the contents of |str| to this STRING and returns *this.
// Appending a STRING to itself (s += s) is supported.
STRING & STRING::operator+=(const STRING& str) {
  FixHeader();
  str.FixHeader();
  const int str_used = str.GetHeader()->used_;
  const int this_used = GetHeader()->used_;
  char* this_cstr = ensure_cstr(this_used + str_used);

  // Take both pointers only after ensure_cstr. It may have reallocated this
  // buffer, and when |str| is *this that is also the source buffer, so a
  // source pointer captured earlier would refer to freed memory.
  STRING_HEADER* this_header = GetHeader();
  const char* str_cstr = str.GetCStr();

  // memmove instead of memcpy: on self-append the destination starts at the
  // old terminator, which is also the last byte of the source range.
  if (this_used > 1) {
    memmove(this_cstr + this_used - 1, str_cstr, str_used);
    this_header->used_ += str_used - 1;  // overwrite '\0'
  } else {
    memmove(this_cstr, str_cstr, str_used);
    this_header->used_ = str_used;
  }

  assert(InvariantOk());
  return *this;
}
|
|
|
|
void STRING::add_str_int(const char* str, int number) {
|
|
if (str != NULL)
|
|
*this += str;
|
|
// Allow space for the maximum possible length of int64_t.
|
|
char num_buffer[kMaxIntSize];
|
|
snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
|
|
num_buffer[kMaxIntSize - 1] = '\0';
|
|
*this += num_buffer;
|
|
}
|
|
// Appends the given string and double (as a %.8g) to this.
|
|
void STRING::add_str_double(const char* str, double number) {
|
|
if (str != NULL)
|
|
*this += str;
|
|
// Allow space for the maximum possible length of %8g.
|
|
char num_buffer[kMaxDoubleSize];
|
|
snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
|
|
num_buffer[kMaxDoubleSize - 1] = '\0';
|
|
*this += num_buffer;
|
|
}
|
|
|
|
// Assigns from a nul-terminated C string and returns *this. A NULL |cstr|
// resets this STRING to the same state the default constructor produces.
STRING & STRING::operator=(const char* cstr) {
  if (cstr == NULL) {
    // Reallocate to same state as default constructor.
    DiscardData();
    // Empty STRINGs contain just the "\0".
    memcpy(AllocData(1, kMinCapacity), "", 1);
  } else {
    const int bytes = strlen(cstr) + 1;

    // Zero used_ first so that a reallocation copies none of the old data.
    GetHeader()->used_ = 0;
    char* text = ensure_cstr(bytes);
    memcpy(text, cstr, bytes);
    // Look the header up again: ensure_cstr may have reallocated.
    GetHeader()->used_ = bytes;
  }

  assert(InvariantOk());
  return *this;
}
|
|
|
|
void STRING::assign(const char *cstr, int len) {
|
|
STRING_HEADER* this_header = GetHeader();
|
|
this_header->used_ = 0; // don't bother copying data if need to realloc
|
|
char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
|
|
|
|
this_header = GetHeader(); // for realloc
|
|
memcpy(this_cstr, cstr, len);
|
|
this_cstr[len] = '\0';
|
|
this_header->used_ = len + 1;
|
|
|
|
assert(InvariantOk());
|
|
}
|
|
|
|
// Returns a new STRING made of a copy of this STRING with |str| appended
// via operator+=.
STRING STRING::operator+(const STRING& str) const {
  STRING joined(*this);
  joined += str;

  assert(InvariantOk());
  return joined;
}
|
|
|
|
|
|
// Returns a new STRING made of a copy of this STRING with the character
// |ch| appended. As with operator+=(char), a '\0' character appends
// nothing.
//
// Implemented as copy-then-append, mirroring operator+(const STRING&).
// The earlier hand-rolled version stored |ch| at the result's own initial
// used_ offset (always 1 for a freshly default-constructed STRING) rather
// than at the end of the copied text, which produced a wrong result for
// every input.
STRING STRING::operator+(const char ch) const {
  STRING result(*this);
  result += ch;

  assert(result.InvariantOk());
  return result;
}
|
|
|
|
|
|
// Appends the nul-terminated C string |str| and returns *this. A NULL or
// empty |str| leaves this STRING untouched.
// NOTE(review): |str| is read after ensure_cstr, which may free the current
// buffer, so it presumably must not point into this STRING's own storage -
// confirm against callers.
STRING& STRING::operator+=(const char *str) {
  if (str == NULL || *str == '\0') {
    return *this;  // empty string has no effect
  }

  FixHeader();
  const int incoming = strlen(str) + 1;  // terminator included
  const int prior_used = GetHeader()->used_;
  char* text = ensure_cstr(prior_used + incoming);
  STRING_HEADER* meta = GetHeader();  // after ensure for realloc

  if (prior_used <= 0) {
    // Nothing was stored before: the new text simply becomes the contents.
    memcpy(text, str, incoming);
    meta->used_ = incoming;
  } else {
    // Append so that the first new byte overwrites the old '\0'.
    memcpy(text + prior_used - 1, str, incoming);
    meta->used_ += incoming - 1;
  }

  assert(InvariantOk());
  return *this;
}
|
|
|
|
|
|
// Appends the single character |ch| and returns *this. Appending '\0' has
// no effect.
STRING& STRING::operator+=(const char ch) {
  if (ch == '\0') {
    return *this;
  }

  FixHeader();
  const int prior_used = GetHeader()->used_;
  char* text = ensure_cstr(prior_used + 1);

  // |ch| goes where the old terminator was, if there was one; otherwise at
  // the very start of the buffer.
  const int at = (prior_used > 0) ? prior_used - 1 : prior_used;
  text[at] = ch;         // append ch to end
  text[at + 1] = '\0';   // append '\0' after ch
  // Header looked up after ensure_cstr, which may have reallocated.
  GetHeader()->used_ = at + 2;

  assert(InvariantOk());
  return *this;
}
|