Started TFile conversion to remove fmemopen

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2014-08-11 23:09:25 +00:00
parent d52231cff3
commit c86fe22a62
7 changed files with 299 additions and 43 deletions

View File

@ -28,6 +28,7 @@
#include "errcode.h"
#include "helpers.h"
#include "ndminx.h"
#include "serialis.h"
#include "strngs.h"
// Use PointerVector<T> below in preference to GenericVector<T*>, as that
@ -61,6 +62,11 @@ class GenericVector {
// Resizes to size and sets all values to t.
void init_to_size(int size, T t);
// Resizes to size without any initialization.
void resize_no_init(int size) {
reserve(size);
size_used_ = size;
}
// Return the size used.
int size() const {
@ -159,22 +165,27 @@ class GenericVector {
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
// TODO(rays) Change all callers to use TFile and remove deprecated methods.
bool Serialize(FILE* fp) const;
bool Serialize(tesseract::TFile* fp) const;
// Reads a vector of simple types from the given file. Assumes that bitwise
// read/write will work with ReverseN according to sizeof(T).
// Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, tesseract::TFile* fp);
// Writes a vector of classes to the given file. Assumes the existence of
// bool T::Serialize(FILE* fp) const that returns false in case of error.
// Returns false in case of error.
bool SerializeClasses(FILE* fp) const;
bool SerializeClasses(tesseract::TFile* fp) const;
// Reads a vector of classes from the given file. Assumes the existence of
// bool T::DeSerialize(bool swap, FILE* fp) that returns false in case of
// error. Also needs T::T() and T::T(const T&), as init_to_size is used in
// this function. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerializeClasses(bool swap, FILE* fp);
bool DeSerializeClasses(bool swap, tesseract::TFile* fp);
// Allocates a new array of double the current_size, copies over the
// information from data to the new location, deletes data and returns
@ -188,6 +199,12 @@ class GenericVector {
return data_new;
}
// Reverses the order of the elements in place by swapping the outermost
// pair and working inwards until the two positions meet.
void reverse() {
  int front = 0;
  int back = size_used_ - 1;
  while (front < back) {
    Swap(&data_[front], &data_[back]);
    ++front;
    --back;
  }
}
// Sorts the members of this vector using the less than comparator (cmp_lt),
// which compares the values. Useful for GenericVectors to primitive types.
// Will not work so great for pointers (unless you just want to sort some
@ -296,6 +313,15 @@ class GenericVector {
data_[index2] = tmp;
}
}
// Returns true if all elements of *this are within the given range.
// Only uses operator<
bool WithinBounds(const T& rangemin, const T& rangemax) const {
for (int i = 0; i < size_used_; ++i) {
if (data_[i] < rangemin || rangemax < data_[i])
return false;
}
return true;
}
protected:
// Internal recursive version of choose_nth_item.
@ -343,7 +369,7 @@ inline bool LoadDataFromFile(const STRING& filename,
// The default FileWriter writes the vector of char to the filename file,
// returning false on error.
inline bool SaveDataToFile(const GenericVector<char>& data,
const STRING& filename) {
const STRING& filename) {
FILE* fp = fopen(filename.string(), "wb");
if (fp == NULL) return false;
bool result =
@ -470,8 +496,11 @@ class PointerVector : public GenericVector<T*> {
GenericVector<T*>::clear();
}
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
// Writes a vector of (pointers to) classes to the given file. Assumes the
// existence of bool T::Serialize(FILE*) const that returns false in case of
// error. There is no Serialize for simple types, as you would have a
// normal GenericVector of those.
// Returns false in case of error.
bool Serialize(FILE* fp) const {
inT32 used = GenericVector<T*>::size_used_;
if (fwrite(&used, sizeof(used), 1, fp) != 1) return false;
@ -482,16 +511,29 @@ class PointerVector : public GenericVector<T*> {
}
return true;
}
// Reads a vector of simple types from the given file. Assumes that bitwise
// read/write will work with ReverseN according to sizeof(T).
// TFile flavour of Serialize. Writes the element count as an inT32, then
// for each element an inT8 flag that is non-zero when the pointer is
// non-NULL, followed by the output of that element's own
// T::Serialize(TFile*) when the pointer is non-NULL.
// Returns false as soon as any write fails.
bool Serialize(TFile* fp) const {
  inT32 count = GenericVector<T*>::size_used_;
  if (fp->FWrite(&count, sizeof(count), 1) != 1) return false;
  for (int index = 0; index < count; ++index) {
    T* element = GenericVector<T*>::data_[index];
    inT8 has_value = element != NULL;
    if (fp->FWrite(&has_value, sizeof(has_value), 1) != 1) return false;
    if (has_value && !element->Serialize(fp)) return false;
  }
  return true;
}
// Reads a vector of (pointers to) classes from the given file. Assumes the
// existence of bool T::DeSerialize(bool, TFile*) that returns false in
// case of error. There is no DeSerialize for simple types, as you would have a
// normal GenericVector of those.
// If swap is true, assumes a big/little-endian swap is needed.
// Also needs T::T(), as new T is used in this function.
// Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp) {
inT32 reserved;
if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
if (swap) Reverse32(&reserved);
GenericVector<T*>::reserve(reserved);
truncate(0);
for (int i = 0; i < reserved; ++i) {
inT8 non_null;
if (fread(&non_null, sizeof(non_null), 1, fp) != 1) return false;
@ -510,6 +552,30 @@ class PointerVector : public GenericVector<T*> {
}
return true;
}
bool DeSerialize(bool swap, TFile* fp) {
inT32 reserved;
if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false;
if (swap) Reverse32(&reserved);
GenericVector<T*>::reserve(reserved);
truncate(0);
for (int i = 0; i < reserved; ++i) {
inT8 non_null;
if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false;
T* item = NULL;
if (non_null) {
item = new T;
if (!item->DeSerialize(swap, fp)) {
delete item;
return false;
}
this->push_back(item);
} else {
// Null elements should keep their place in the vector.
this->push_back(NULL);
}
}
return true;
}
// Sorts the items pointed to by the members of this vector using
// t::operator<().
@ -817,6 +883,12 @@ bool GenericVector<T>::Serialize(FILE* fp) const {
if (fwrite(data_, sizeof(*data_), size_used_, fp) != size_used_) return false;
return true;
}
// TFile flavour of Serialize for simple types. Writes size_used_ followed
// by the raw bytes of the size_used_ elements in data_.
// Returns false if either write comes up short.
template <typename T>
bool GenericVector<T>::Serialize(tesseract::TFile* fp) const {
  const bool wrote_count =
      fp->FWrite(&size_used_, sizeof(size_used_), 1) == 1;
  if (!wrote_count) return false;
  return fp->FWrite(data_, sizeof(*data_), size_used_) == size_used_;
}
// Reads a vector of simple types from the given file. Assumes that bitwise
// read/write will work with ReverseN according to sizeof(T).
@ -836,6 +908,20 @@ bool GenericVector<T>::DeSerialize(bool swap, FILE* fp) {
}
return true;
}
// TFile flavour of DeSerialize for simple types. Reads an inT32 element
// count, then that many elements as raw bytes straight into data_.
// If swap is true, the count and every element are byte-reversed
// (Reverse32 / ReverseN) after reading.
// Returns false on a short read or on a negative element count; in the
// negative-count case the vector is left unmodified.
template <typename T>
bool GenericVector<T>::DeSerialize(bool swap, tesseract::TFile* fp) {
  inT32 reserved;
  if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false;
  if (swap) Reverse32(&reserved);
  // A negative count can only come from corrupt or wrongly-swapped input.
  // Reject it before touching the vector, otherwise size_used_ would be
  // left negative when the element read below fails.
  if (reserved < 0) return false;
  reserve(reserved);
  size_used_ = reserved;
  if (fp->FRead(data_, sizeof(T), size_used_) != size_used_) return false;
  if (swap) {
    for (int i = 0; i < size_used_; ++i)
      ReverseN(&data_[i], sizeof(data_[i]));
  }
  return true;
}
// Writes a vector of classes to the given file. Assumes the existence of
// bool T::Serialize(FILE* fp) const that returns false in case of error.
@ -848,6 +934,14 @@ bool GenericVector<T>::SerializeClasses(FILE* fp) const {
}
return true;
}
// TFile flavour of SerializeClasses. Writes size_used_, then asks each
// element in turn to write itself with T::Serialize(TFile*).
// Stops and returns false at the first failure.
template <typename T>
bool GenericVector<T>::SerializeClasses(tesseract::TFile* fp) const {
  if (fp->FWrite(&size_used_, sizeof(size_used_), 1) != 1) return false;
  int index = 0;
  while (index < size_used_ && data_[index].Serialize(fp)) ++index;
  // The loop only stops short of size_used_ when an element failed.
  return index >= size_used_;
}
// Reads a vector of classes from the given file. Assumes the existence of
// bool T::DeSerialize(bool swap, FILE* fp) that returns false in case of
@ -866,6 +960,18 @@ bool GenericVector<T>::DeSerializeClasses(bool swap, FILE* fp) {
}
return true;
}
// TFile flavour of DeSerializeClasses. Reads a 32 bit element count,
// resizes the vector to that many default-constructed elements using
// init_to_size, then has each element read itself with
// T::DeSerialize(swap, fp).
// If swap is true, the count is byte-reversed after reading and swap is
// forwarded to every element.
// Returns false on a failed read or on a negative element count; in the
// negative-count case the vector is left unmodified.
template <typename T>
bool GenericVector<T>::DeSerializeClasses(bool swap, tesseract::TFile* fp) {
  // Signed on purpose: init_to_size takes an int and the loop index is an
  // int, so an unsigned count with the high bit set would size the vector
  // with a negative value while the loop ran far past the end of data_.
  inT32 reserved;
  if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false;
  if (swap) Reverse32(&reserved);
  // A negative count can only come from corrupt or wrongly-swapped input.
  if (reserved < 0) return false;
  T empty;
  init_to_size(reserved, empty);
  for (int i = 0; i < reserved; ++i) {
    if (!data_[i].DeSerialize(swap, fp)) return false;
  }
  return true;
}
// This method clears the current object, then does a shallow copy of
// its argument, and finally invalidates its argument.

View File

@ -19,24 +19,41 @@
#include "serialis.h"
#include <stdio.h>
#include "genericvector.h"
namespace tesseract {
TFile::TFile() : offset_(0) {
TFile::TFile()
: offset_(0), data_(NULL), data_is_owned_(false), is_writing_(false) {
}
// Deletes the data buffer, but only when data_is_owned_ says it belongs
// to this TFile. A buffer supplied by a caller is left alone.
TFile::~TFile() {
  if (!data_is_owned_) return;
  delete data_;
}
bool TFile::Open(const STRING& filename, FileReader reader) {
if (!data_is_owned_) {
data_ = new GenericVector<char>;
data_is_owned_ = true;
}
offset_ = 0;
is_writing_ = false;
if (reader == NULL)
return LoadDataFromFile(filename, &data_);
return LoadDataFromFile(filename, data_);
else
return (*reader)(filename, &data_);
return (*reader)(filename, data_);
}
bool TFile::Open(const char* data, int size) {
offset_ = 0;
data_.init_to_size(size, 0);
memcpy(&data_[0], data, size);
if (!data_is_owned_) {
data_ = new GenericVector<char>;
data_is_owned_ = true;
}
is_writing_ = false;
data_->init_to_size(size, 0);
memcpy(&(*data_)[0], data, size);
return true;
}
@ -49,30 +66,78 @@ bool TFile::Open(FILE* fp, inT64 end_offset) {
fseek(fp, current_pos, SEEK_SET);
}
int size = end_offset - current_pos;
data_.init_to_size(size, 0);
return static_cast<int>(fread(&data_[0], 1, size, fp)) == size;
is_writing_ = false;
if (!data_is_owned_) {
data_ = new GenericVector<char>;
data_is_owned_ = true;
}
data_->init_to_size(size, 0);
return static_cast<int>(fread(&(*data_)[0], 1, size, fp)) == size;
}
char* TFile::FGets(char* buffer, int buffer_size) {
ASSERT_HOST(!is_writing_);
int size = 0;
while (size + 1 < buffer_size && offset_ < data_.size()) {
buffer[size++] = data_[offset_++];
if (data_[offset_ - 1] == '\n') break;
while (size + 1 < buffer_size && offset_ < data_->size()) {
buffer[size++] = (*data_)[offset_++];
if ((*data_)[offset_ - 1] == '\n') break;
}
if (size < buffer_size) buffer[size] = '\0';
return size > 0 ? buffer : NULL;
}
int TFile::FRead(void* buffer, int size, int count) {
char* char_buffer = reinterpret_cast<char*>(buffer);
ASSERT_HOST(!is_writing_);
int required_size = size * count;
if (data_.size() - offset_ < required_size)
required_size = data_.size() - offset_;
memcpy(char_buffer, &data_[offset_], required_size);
if (required_size <= 0) return 0;
char* char_buffer = reinterpret_cast<char*>(buffer);
if (data_->size() - offset_ < required_size)
required_size = data_->size() - offset_;
if (required_size > 0)
memcpy(char_buffer, &(*data_)[offset_], required_size);
offset_ += required_size;
return required_size / size;
}
// Moves the read position back to the start of the buffered data, so the
// next read starts from offset 0 again.
// Only valid while reading: ASSERT_HOST checks that the TFile is not
// currently open for writing.
void TFile::Rewind() {
ASSERT_HOST(!is_writing_);
offset_ = 0;
}
// Puts the TFile into write mode with an empty buffer.
// With a non-NULL data, writes go into the caller's vector: any buffer
// this TFile owned is deleted and ownership stays with the caller.
// With a NULL data, writes go into a buffer owned by this TFile, which is
// allocated here unless one is already owned.
// In both cases the chosen buffer is truncated to length 0.
void TFile::OpenWrite(GenericVector<char>* data) {
  offset_ = 0;
  if (data == NULL) {
    if (!data_is_owned_) {
      data_ = new GenericVector<char>;
      data_is_owned_ = true;
    }
  } else {
    if (data_is_owned_) delete data_;
    data_ = data;
    data_is_owned_ = false;
  }
  is_writing_ = true;
  data_->truncate(0);
}
// Hands the accumulated write buffer to writer together with filename, or
// to SaveDataToFile when writer is NULL, and returns that call's result.
// ASSERT_HOST checks that the TFile is open for writing.
bool TFile::CloseWrite(const STRING& filename, FileWriter writer) {
  ASSERT_HOST(is_writing_);
  if (writer != NULL) return (*writer)(*data_, filename);
  return SaveDataToFile(*data_, filename);
}
// Appends size * count bytes from buffer to the write buffer and returns
// count. Returns 0 without writing anything when size * count is not
// positive. ASSERT_HOST checks that the TFile is open for writing.
int TFile::FWrite(const void* buffer, int size, int count) {
  ASSERT_HOST(is_writing_);
  const int num_bytes = size * count;
  if (num_bytes <= 0) return 0;
  const char* src = reinterpret_cast<const char*>(buffer);
  const char* const src_end = src + num_bytes;
  // Appending one byte at a time is not the fastest approach, but it is
  // simple, and memory is so much faster than disk that it hardly matters.
  while (src != src_end) data_->push_back(*src++);
  return count;
}
} // namespace tesseract.

View File

@ -20,11 +20,13 @@
#ifndef SERIALIS_H
#define SERIALIS_H
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "host.h"
#include "genericvector.h"
template <typename T> class GenericVector;
class STRING;
/***********************************************************************
QUOTE_IT MACRO DEFINITION
@ -36,14 +38,24 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
namespace tesseract {
// Simple file class. Only does input for now.
// Allows for portable file input from memory.
// Function to read a GenericVector<char> from a whole file.
// Returns false on failure.
typedef bool (*FileReader)(const STRING& filename, GenericVector<char>* data);
// Function to write a GenericVector<char> to a whole file.
// Returns false on failure.
typedef bool (*FileWriter)(const GenericVector<char>& data,
const STRING& filename);
// Simple file class.
// Allows for portable file input from memory and from foreign file systems.
class TFile {
public:
TFile();
~TFile();
// All the Open methods load the whole file into memory.
// All the Open methods load the whole file into memory for reading.
// Opens a file with a supplied reader, or NULL to use the default.
// Note that mixed read/write is not supported.
bool Open(const STRING& filename, FileReader reader);
// From an existing memory buffer.
bool Open(const char* data, int size);
@ -53,21 +65,33 @@ class TFile {
// Reads a line like fgets. Returns NULL on EOF, otherwise buffer.
// Reads at most buffer_size bytes, including '\0' terminator, even if
// the line is longer. Does nothing if buffer_size <= 0.
// To use fscanf use FGets and sscanf.
char* FGets(char* buffer, int buffer_size);
// Replicates fread, returning the number of items read.
int FRead(void* buffer, int size, int count);
// To use fscanf use FGets and sscanf.
// Resets the TFile as if it has been Opened, but nothing read.
void Rewind() {
offset_ = 0;
}
// Only allowed while reading!
void Rewind();
// Open for writing. Either supply a non-NULL data with OpenWrite before
// calling FWrite, (no close required), or supply a NULL data to OpenWrite
// and call CloseWrite to write to a file after the FWrites.
void OpenWrite(GenericVector<char>* data);
bool CloseWrite(const STRING& filename, FileWriter writer);
// Replicates fwrite, returning the number of items written.
// To use fprintf, use snprintf and FWrite.
int FWrite(const void* buffer, int size, int count);
private:
// The number of bytes used so far.
int offset_;
// The buffered data from the file.
GenericVector<char> data_;
GenericVector<char>* data_;
// True if the data_ pointer is owned by *this.
bool data_is_owned_;
// True if the TFile is open for writing.
bool is_writing_;
};
} // namespace tesseract.

View File

@ -17,12 +17,17 @@
*
**********************************************************************/
#include "helpers.h"
#include "tprintf.h"
#include "strngs.h"
#include "genericvector.h"
#include "strngs.h"
#include <assert.h>
#include "genericvector.h"
#include "helpers.h"
#include "serialis.h"
#include "tprintf.h"
using tesseract::TFile;
// Size of buffer needed to host the decimal representation of the maximum
// possible length of an int (in 64 bits), being -<20 digits>.
const int kMaxIntSize = 22;
@ -123,10 +128,22 @@ STRING::STRING(const char* cstr) {
assert(InvariantOk());
}
// Builds a STRING from the first length bytes at data and appends a
// terminating '\0'. A NULL data produces an empty STRING.
STRING::STRING(const char *data, int length) {
  if (data != NULL) {
    char* dest = AllocData(length + 1, length + 1);
    memcpy(dest, data, length);
    dest[length] = '\0';
    return;
  }
  // Empty STRINGs contain just the "\0".
  memcpy(AllocData(1, kMinCapacity), "", 1);
}
// Destructor. All cleanup is delegated to DiscardData().
STRING::~STRING() {
DiscardData();
}
// TODO(rays) Change all callers to use TFile and remove the old functions.
// Writes to the given file. Returns false in case of error.
bool STRING::Serialize(FILE* fp) const {
inT32 len = length();
@ -134,6 +151,13 @@ bool STRING::Serialize(FILE* fp) const {
if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
return true;
}
// Writes the string to the given TFile as an inT32 length followed by
// that many bytes of character data (no terminator).
// Returns false in case of error.
bool STRING::Serialize(TFile* fp) const {
  inT32 num_chars = length();
  return fp->FWrite(&num_chars, sizeof(num_chars), 1) == 1 &&
         fp->FWrite(GetCStr(), 1, num_chars) == num_chars;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool STRING::DeSerialize(bool swap, FILE* fp) {
@ -145,6 +169,17 @@ bool STRING::DeSerialize(bool swap, FILE* fp) {
if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
return true;
}
// Reads from the given TFile. Expects an inT32 length followed by that
// many bytes of character data.
// If swap is true, assumes a big/little-endian swap is needed and
// byte-reverses the length after reading it.
// Returns false in case of error, including a negative stored length, in
// which case the string is left unmodified.
bool STRING::DeSerialize(bool swap, TFile* fp) {
  inT32 len;
  if (fp->FRead(&len, sizeof(len), 1) != 1) return false;
  if (swap)
    ReverseN(&len, sizeof(len));
  // A negative length can only come from corrupt or wrongly-swapped input.
  // Bail out before it is used as an index by truncate_at.
  if (len < 0) return false;
  truncate_at(len);
  if (fp->FRead(GetCStr(), 1, len) != len) return false;
  return true;
}
BOOL8 STRING::contains(const char c) const {
return (c != '\0') && (strchr (GetCStr(), c) != NULL);
@ -245,21 +280,20 @@ char& STRING::operator[](inT32 index) const {
void STRING::split(const char c, GenericVector<STRING> *splited) {
int start_index = 0;
for (int i = 0; i < length(); i++) {
int len = length();
for (int i = 0; i < len; i++) {
if ((*this)[i] == c) {
if (i != start_index) {
(*this)[i] = '\0';
STRING tmp = GetCStr() + start_index;
splited->push_back(tmp);
splited->push_back(STRING(GetCStr() + start_index, i - start_index));
(*this)[i] = c;
}
start_index = i + 1;
}
}
if (length() != start_index) {
STRING tmp = GetCStr() + start_index;
splited->push_back(tmp);
if (len != start_index) {
splited->push_back(STRING(GetCStr() + start_index, len - start_index));
}
}

View File

@ -25,6 +25,10 @@
#include "platform.h"
#include "memry.h"
namespace tesseract {
class TFile;
} // namespace tesseract.
// STRING_IS_PROTECTED means that string[index] = X is invalid
// because you have to go through strings interface to modify it.
// This allows the string to ensure internal integrity and maintain
@ -43,6 +47,7 @@ class TESS_API STRING
STRING();
STRING(const STRING &string);
STRING(const char *string);
STRING(const char *data, int length);
~STRING ();
// Writes to the given file. Returns false in case of error.
@ -50,6 +55,11 @@ class TESS_API STRING
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
// Writes to the given file. Returns false in case of error.
bool Serialize(tesseract::TFile* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, tesseract::TFile* fp);
BOOL8 contains(const char c) const;
inT32 length() const;

View File

@ -19,6 +19,7 @@
#include "unichar.h"
#include "errcode.h"
#include "genericvector.h"
#include "tprintf.h"
#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
@ -203,3 +204,14 @@ UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
// Returns an iterator constructed from the address len bytes past
// utf8_str, for use as the end marker of an iteration.
UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
  const char* past_the_end = utf8_str + len;
  return UNICHAR::const_iterator(past_the_end);
}
// Appends to unicodes one value per step of a const_iterator walk over
// the NUL-terminated utf8_str. Values already in unicodes are kept.
void UNICHAR::UTF8ToUnicode(const char* utf8_str,
                            GenericVector<int>* unicodes) {
  const int utf8_length = strlen(utf8_str);
  const_iterator end_it(end(utf8_str, utf8_length));
  const_iterator it(begin(utf8_str, utf8_length));
  while (it != end_it) {
    unicodes->push_back(*it);
    ++it;
  }
}

View File

@ -23,6 +23,8 @@
#include <memory.h>
#include <string.h>
template <typename T> class GenericVector;
// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 30
@ -148,6 +150,9 @@ class UNICHAR {
static const_iterator begin(const char* utf8_str, const int byte_length);
static const_iterator end(const char* utf8_str, const int byte_length);
// Converts a utf-8 string to a vector of unicodes.
static void UTF8ToUnicode(const char* utf8_str, GenericVector<int>* unicodes);
private:
// A UTF-8 representation of 1 or more Unicode characters.
// The last element (chars[UNICHAR_LEN - 1]) is a length if