backport from 4.00: imagedata

This commit is contained in:
Zdenko Podobný 2016-12-07 15:55:27 +01:00
parent 7099358510
commit c1d37120a5
2 changed files with 416 additions and 129 deletions

View File

@ -30,6 +30,14 @@
#include "helpers.h"
#include "tprintf.h"
#if __cplusplus > 199711L // C++11 support
#include <thread>
#endif
// Number of documents to read ahead while training. Doesn't need to be very
// large.
const int kMaxReadAhead = 8;
namespace tesseract {
WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
@ -182,6 +190,19 @@ bool ImageData::DeSerialize(bool swap, TFile* fp) {
return true;
}
// As DeSerialize, but only seeks past the data - hence a static method.
bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
if (!STRING::SkipDeSerialize(swap, fp)) return false;
inT32 page_number;
if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
if (!STRING::SkipDeSerialize(swap, fp)) return false;
if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
inT8 vertical = 0;
return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
}
// Saves the given Pix as a PNG-encoded string and destroys it.
void ImageData::SetPix(Pix* pix) {
SetPixInternal(pix, &image_data_);
@ -195,37 +216,34 @@ Pix* ImageData::GetPix() const {
// Gets anything and everything with a non-NULL pointer, prescaled to a
// given target_height (if 0, then the original image height), and aligned.
// Also returns (if not NULL) the width and height of the scaled image.
// The return value is the scale factor that was applied to the image to
// achieve the target_height.
float ImageData::PreScale(int target_height, Pix** pix,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const {
// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const {
int input_width = 0;
int input_height = 0;
Pix* src_pix = GetPix();
ASSERT_HOST(src_pix != NULL);
input_width = pixGetWidth(src_pix);
input_height = pixGetHeight(src_pix);
if (target_height == 0)
target_height = input_height;
if (target_height == 0) {
target_height = MIN(input_height, max_height);
}
float im_factor = static_cast<float>(target_height) / input_height;
if (scaled_width != NULL)
*scaled_width = IntCastRounded(im_factor * input_width);
if (scaled_height != NULL)
*scaled_height = target_height;
if (pix != NULL) {
// Get the scaled image.
pixDestroy(pix);
*pix = pixScale(src_pix, im_factor, im_factor);
if (*pix == NULL) {
tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
input_width, input_height, im_factor);
}
if (scaled_width != NULL)
*scaled_width = pixGetWidth(*pix);
if (scaled_height != NULL)
*scaled_height = pixGetHeight(*pix);
// Get the scaled image.
Pix* pix = pixScale(src_pix, im_factor, im_factor);
if (pix == NULL) {
tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
input_width, input_height, im_factor);
}
if (scaled_width != NULL) *scaled_width = pixGetWidth(pix);
if (scaled_height != NULL) *scaled_height = pixGetHeight(pix);
pixDestroy(&src_pix);
if (boxes != NULL) {
// Get the boxes.
@ -241,7 +259,8 @@ float ImageData::PreScale(int target_height, Pix** pix,
boxes->push_back(box);
}
}
return im_factor;
if (scale_factor != NULL) *scale_factor = im_factor;
return pix;
}
int ImageData::MemoryUsed() const {
@ -266,19 +285,20 @@ void ImageData::Display() const {
// Draw the boxes.
win->Pen(ScrollView::RED);
win->Brush(ScrollView::NONE);
win->TextAttributes("Arial", kTextSize, false, false, false);
for (int b = 0; b < boxes_.size(); ++b) {
boxes_[b].plot(win);
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
TBOX scaled(boxes_[b]);
scaled.scale(256.0 / height);
scaled.plot(win);
int text_size = kTextSize;
if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
text_size = boxes_[0].height() * 2;
win->TextAttributes("Arial", text_size, false, false, false);
if (!boxes_.empty()) {
for (int b = 0; b < boxes_.size(); ++b) {
boxes_[b].plot(win);
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
}
} else {
// The full transcription.
win->Pen(ScrollView::CYAN);
win->Text(0, height + kTextSize * 2, transcription_.string());
}
// The full transcription.
win->Pen(ScrollView::CYAN);
win->Text(0, height + kTextSize * 2, transcription_.string());
// Add the features.
win->Pen(ScrollView::GREEN);
win->Update();
window_wait(win);
#endif
@ -340,27 +360,51 @@ bool ImageData::AddBoxes(const char* box_text) {
return false;
}
DocumentData::DocumentData(const STRING& name)
: document_name_(name), pages_offset_(0), total_pages_(0),
memory_used_(0), max_memory_(0), reader_(NULL) {}
// Thread function to call ReCachePages.
void* ReCachePagesFunc(void* data) {
DocumentData* document_data = reinterpret_cast<DocumentData*>(data);
document_data->ReCachePages();
return NULL;
}
DocumentData::~DocumentData() {}
DocumentData::DocumentData(const STRING& name)
: document_name_(name),
pages_offset_(-1),
total_pages_(-1),
memory_used_(0),
max_memory_(0),
reader_(NULL) {}
DocumentData::~DocumentData() {
SVAutoLock lock_p(&pages_mutex_);
SVAutoLock lock_g(&general_mutex_);
}
// Reads all the pages in the given lstmf filename to the cache. The reader
// is used to read the file.
bool DocumentData::LoadDocument(const char* filename, const char* lang,
int start_page, inT64 max_memory,
FileReader reader) {
SetDocument(filename, lang, max_memory, reader);
pages_offset_ = start_page;
return ReCachePages();
}
// Sets up the document, without actually loading it.
void DocumentData::SetDocument(const char* filename, const char* lang,
inT64 max_memory, FileReader reader) {
SVAutoLock lock_p(&pages_mutex_);
SVAutoLock lock(&general_mutex_);
document_name_ = filename;
lang_ = lang;
pages_offset_ = start_page;
pages_offset_ = -1;
max_memory_ = max_memory;
reader_ = reader;
return ReCachePages();
}
// Writes all the pages to the given filename. Returns false on error.
bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
SVAutoLock lock(&pages_mutex_);
TFile fp;
fp.OpenWrite(NULL);
if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
@ -370,112 +414,169 @@ bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
return true;
}
bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
SVAutoLock lock(&pages_mutex_);
TFile fp;
fp.OpenWrite(buffer);
return pages_.Serialize(&fp);
}
// Returns a pointer to the page with the given index, modulo the total
// number of pages, recaching if needed.
const ImageData* DocumentData::GetPage(int index) {
index = Modulo(index, total_pages_);
if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) {
pages_offset_ = index;
if (!ReCachePages()) return NULL;
}
return pages_[index - pages_offset_];
// Adds the given page data to this document, counting up memory.
void DocumentData::AddPageToDocument(ImageData* page) {
SVAutoLock lock(&pages_mutex_);
pages_.push_back(page);
set_memory_used(memory_used() + page->MemoryUsed());
}
// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
// If the given index is not currently loaded, loads it using a separate
// thread.
// Returns immediately in three cases: the page is already available, the
// document is empty (IsPageAvailable reports that as available), or a load
// starting at this index has already been requested.
void DocumentData::LoadPageInBackground(int index) {
ImageData* page = NULL;
// IsPageAvailable takes and releases pages_mutex_ itself, so this check is
// done before the lock below is acquired.
if (IsPageAvailable(index, &page)) return;
SVAutoLock lock(&pages_mutex_);
// pages_offset_ == index is the marker that a load of this page has already
// been requested, so don't start a second one.
if (pages_offset_ == index) return;
// Record the new start page and drop the currently cached pages before
// handing over to the loader.
pages_offset_ = index;
pages_.clear();
// ReCachePagesFunc calls ReCachePages() on this object.
// NOTE(review): presumably StartThread does not wait for the thread to
// finish - confirm against SVSync.
SVSync::StartThread(ReCachePagesFunc, this);
}
// Returns a pointer to the page with the given index, modulo the total
// number of pages. Blocks until the background load is completed.
// The result is NULL when IsPageAvailable reports an empty document or a
// negative index.
const ImageData* DocumentData::GetPage(int index) {
ImageData* page = NULL;
// Poll until IsPageAvailable hands back a page (or NULL for an empty
// document).
while (!IsPageAvailable(index, &page)) {
// If there is no background load scheduled, schedule one now.
// pages_offset_ is read under pages_mutex_; the lock is released again
// before LoadPageInBackground, which takes the same mutex itself.
// NOTE(review): index is compared unreduced here, whereas IsPageAvailable
// reduces it modulo NumPages() - confirm callers pass in-range indices.
pages_mutex_.Lock();
bool needs_loading = pages_offset_ != index;
pages_mutex_.Unlock();
if (needs_loading) LoadPageInBackground(index);
// We can't directly load the page, or the background load will delete it
// while the caller is using it, so give it a chance to work.
// NOTE(review): when the C++11 guard below is false the sleep is compiled
// out and this loop polls without pausing.
#if __cplusplus > 199711L // C++11 support
//TODO: We need to fix this for compilers without C++11 support (e.g. VS2010)
std::this_thread::sleep_for(std::chrono::seconds(1));
#endif
}
return page;
}
// Returns true if the requested page is available, and provides a pointer,
// which may be NULL if the document is empty. May block, even though it
// doesn't guarantee to return true.
bool DocumentData::IsPageAvailable(int index, ImageData** page) {
SVAutoLock lock(&pages_mutex_);
int num_pages = NumPages();
if (num_pages == 0 || index < 0) {
*page = NULL; // Empty Document.
return true;
}
if (num_pages > 0) {
index = Modulo(index, num_pages);
if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
*page = pages_[index - pages_offset_]; // Page is available already.
return true;
}
}
return false;
}
// Removes all pages from memory and frees the memory, but does not forget
// the document metadata. Holds pages_mutex_ for the duration of the call.
// Returns the value memory_used() reported before the pages were dropped.
inT64 DocumentData::UnCache() {
  SVAutoLock lock(&pages_mutex_);
  inT64 memory_saved = memory_used();
  pages_.clear();
  // A negative page count is what IsCached() reads as "not cached"; the
  // offset is reset to -1 alongside it.
  pages_offset_ = -1;
  set_total_pages(-1);
  set_memory_used(0);
  // memory_saved is 64 bits wide: print it with %lld and an explicit
  // long long argument instead of %d, which expects an int.
  tprintf("Unloaded document %s, saving %lld memory\n",
          document_name_.string(), static_cast<long long>(memory_saved));
  return memory_saved;
}
// Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
// starting at index pages_offset_.
bool DocumentData::ReCachePages() {
SVAutoLock lock(&pages_mutex_);
// Read the file.
set_total_pages(0);
set_memory_used(0);
int loaded_pages = 0;
pages_.truncate(0);
TFile fp;
if (!fp.Open(document_name_, reader_)) return false;
memory_used_ = 0;
if (!pages_.DeSerialize(false, &fp)) {
tprintf("Deserialize failed: %s\n", document_name_.string());
pages_.truncate(0);
if (!fp.Open(document_name_, reader_) ||
!PointerVector<ImageData>::DeSerializeSize(false, &fp, &loaded_pages) ||
loaded_pages <= 0) {
tprintf("Deserialize header failed: %s\n", document_name_.string());
return false;
}
total_pages_ = pages_.size();
pages_offset_ %= total_pages_;
// Delete pages before the first one we want, and relocate the rest.
pages_offset_ %= loaded_pages;
// Skip pages before the first one we want, and load the rest until max
// memory and skip the rest after that.
int page;
for (page = 0; page < pages_.size(); ++page) {
if (page < pages_offset_) {
delete pages_[page];
pages_[page] = NULL;
for (page = 0; page < loaded_pages; ++page) {
if (page < pages_offset_ ||
(max_memory_ > 0 && memory_used() > max_memory_)) {
if (!PointerVector<ImageData>::DeSerializeSkip(false, &fp)) break;
} else {
ImageData* image_data = pages_[page];
if (max_memory_ > 0 && page > pages_offset_ &&
memory_used_ + image_data->MemoryUsed() > max_memory_)
break; // Don't go over memory quota unless the first image.
if (!pages_.DeSerializeElement(false, &fp)) break;
ImageData* image_data = pages_.back();
if (image_data->imagefilename().length() == 0) {
image_data->set_imagefilename(document_name_);
image_data->set_page_number(page);
}
image_data->set_language(lang_);
memory_used_ += image_data->MemoryUsed();
if (pages_offset_ != 0) {
pages_[page - pages_offset_] = image_data;
pages_[page] = NULL;
}
set_memory_used(memory_used() + image_data->MemoryUsed());
}
}
pages_.truncate(page - pages_offset_);
tprintf("Loaded %d/%d pages (%d-%d) of document %s\n",
pages_.size(), total_pages_, pages_offset_,
pages_offset_ + pages_.size(), document_name_.string());
if (page < loaded_pages) {
tprintf("Deserialize failed: %s read %d/%d pages\n",
document_name_.string(), page, loaded_pages);
pages_.truncate(0);
} else {
tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(),
loaded_pages, pages_offset_, pages_offset_ + pages_.size(),
document_name_.string());
}
set_total_pages(loaded_pages);
return !pages_.empty();
}
// Adds the given page data to this document, counting up memory.
void DocumentData::AddPageToDocument(ImageData* page) {
pages_.push_back(page);
memory_used_ += page->MemoryUsed();
}
// A collection of DocumentData that knows roughly how much memory it is using.
DocumentCache::DocumentCache(inT64 max_memory)
: total_pages_(0), memory_used_(0), max_memory_(max_memory) {}
: num_pages_per_doc_(0), max_memory_(max_memory) {}
DocumentCache::~DocumentCache() {}
// Adds all the documents in the list of filenames, counting memory.
// The reader is used to read the files.
bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
const char* lang, FileReader reader) {
inT64 fair_share_memory = max_memory_ / filenames.size();
const char* lang,
CachingStrategy cache_strategy,
FileReader reader) {
cache_strategy_ = cache_strategy;
inT64 fair_share_memory = 0;
// In the round-robin case, each DocumentData handles restricting its content
// to its fair share of memory. In the sequential case, DocumentCache
// determines which DocumentDatas are held entirely in memory.
if (cache_strategy_ == CS_ROUND_ROBIN)
fair_share_memory = max_memory_ / filenames.size();
for (int arg = 0; arg < filenames.size(); ++arg) {
STRING filename = filenames[arg];
DocumentData* document = new DocumentData(filename);
if (document->LoadDocument(filename.string(), lang, 0,
fair_share_memory, reader)) {
AddToCache(document);
} else {
tprintf("Failed to load image %s!\n", filename.string());
delete document;
}
document->SetDocument(filename.string(), lang, fair_share_memory, reader);
AddToCache(document);
}
tprintf("Loaded %d pages, total %gMB\n",
total_pages_, memory_used_ / 1048576.0);
return total_pages_ > 0;
if (!documents_.empty()) {
// Try to get the first page now to verify the list of filenames.
if (GetPageBySerial(0) != NULL) return true;
tprintf("Load of page 0 failed!\n");
}
return false;
}
// Adds document to the cache, throwing out other documents if needed.
// Adds document to the cache.
bool DocumentCache::AddToCache(DocumentData* data) {
inT64 new_memory = data->memory_used();
memory_used_ += new_memory;
documents_.push_back(data);
total_pages_ += data->NumPages();
// Delete the first item in the array, and other pages of the same name
// while memory is full.
while (memory_used_ >= max_memory_ && max_memory_ > 0) {
tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n",
memory_used_ , max_memory_, documents_[0]->memory_used());
memory_used_ -= documents_[0]->memory_used();
total_pages_ -= documents_[0]->NumPages();
documents_.remove(0);
}
return true;
}
@ -488,11 +589,104 @@ DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
return NULL;
}
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
// strategy, could take a long time.
int DocumentCache::TotalPages() {
if (cache_strategy_ == CS_SEQUENTIAL) {
// In sequential mode, we assume each doc has the same number of pages
// whether it is true or not.
if (num_pages_per_doc_ == 0) GetPageSequential(0);
return num_pages_per_doc_ * documents_.size();
}
int total_pages = 0;
int num_docs = documents_.size();
for (int d = 0; d < num_docs; ++d) {
// We have to load a page to make NumPages() valid.
documents_[d]->GetPage(0);
total_pages += documents_[d]->NumPages();
}
return total_pages;
}
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents.
const ImageData* DocumentCache::GetPageBySerial(int serial) {
int document_index = serial % documents_.size();
return documents_[document_index]->GetPage(serial / documents_.size());
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents: serial % num_docs picks the document and
// serial / num_docs the page within it. Also asks up to kMaxReadAhead of the
// documents that follow in the rotation to load their upcoming page in the
// background. Highly disk-intensive, but doesn't need samples to be shuffled
// between files to begin with. Requires at least one document in the cache.
const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
  int num_docs = documents_.size();
  // Fail with a clear assertion on an empty cache instead of dividing by
  // zero below; GetPageSequential enforces the same precondition.
  ASSERT_HOST(num_docs > 0);
  int doc_index = serial % num_docs;
  const ImageData* page = documents_[doc_index]->GetPage(serial / num_docs);
  // Schedule read-ahead for the serials that come next. The offset < num_docs
  // bound keeps the loop from wrapping back to the document just served.
  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
    doc_index = (serial + offset) % num_docs;
    int ahead_page = (serial + offset) / num_docs;
    documents_[doc_index]->LoadPageInBackground(ahead_page);
  }
  return page;
}
// Returns a page by serial number, selecting them in sequence from each file.
// Requires the samples to be shuffled between the files to give a random or
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
// Asserts that the cache holds at least one document and that the first
// document is not empty. After fetching the page, un-caches neighbouring
// documents while the summed memory_used() is at or above max_memory_, and
// starts a background load of the next document when there is room.
const ImageData* DocumentCache::GetPageSequential(int serial) {
int num_docs = documents_.size();
ASSERT_HOST(num_docs > 0);
// num_pages_per_doc_ is initialised lazily, on the first call that sees 0.
if (num_pages_per_doc_ == 0) {
// Use the pages in the first doc as the number of pages in each doc.
documents_[0]->GetPage(0);
num_pages_per_doc_ = documents_[0]->NumPages();
if (num_pages_per_doc_ == 0) {
tprintf("First document cannot be empty!!\n");
ASSERT_HOST(num_pages_per_doc_ > 0);
}
// Get rid of zero now if we don't need it.
if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
}
// Each run of num_pages_per_doc_ consecutive serials maps to one document;
// the document index wraps around after num_docs documents.
int doc_index = serial / num_pages_per_doc_ % num_docs;
// Despite its name, doc holds the requested page, not a document.
const ImageData* doc =
documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
// Count up total memory. Background loading makes it more complicated to
// keep a running count.
inT64 total_memory = 0;
for (int d = 0; d < num_docs; ++d) {
total_memory += documents_[d]->memory_used();
}
// NOTE(review): this condition is also true when max_memory_ is 0 - confirm
// whether 0 is meant to mean "no limit" for DocumentCache.
if (total_memory >= max_memory_) {
// Find something to un-cache.
// If there are more than 3 in front, then serial is from the back reader
// of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
// we create a hole between them and then un-caching the backmost occupied
// will work for both.
int num_in_front = CountNeighbourDocs(doc_index, 1);
for (int offset = num_in_front - 2;
offset > 1 && total_memory >= max_memory_; --offset) {
int next_index = (doc_index + offset) % num_docs;
// UnCache returns the memory it released, which is taken off the total.
total_memory -= documents_[next_index]->UnCache();
}
// If that didn't work, the best solution is to un-cache from the back. If
// we take away the document that a 2nd reader is using, it will put it
// back and make a hole between.
// NOTE(review): CountNeighbourDocs returns num_docs (positive) when every
// neighbour is cached, in which case this loop does not run - confirm that
// is intended.
int num_behind = CountNeighbourDocs(doc_index, -1);
for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
++offset) {
int next_index = (doc_index + offset + num_docs) % num_docs;
total_memory -= documents_[next_index]->UnCache();
}
}
// Read ahead: request page 0 of the following document if it is not cached
// and the memory total is below the limit.
int next_index = (doc_index + 1) % num_docs;
if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
documents_[next_index]->LoadPageInBackground(0);
}
return doc;
}
// Walks away from index in steps of dir (index+dir, index+2*dir, ...,
// wrapping around the document list) for as long as the visited documents
// are cached. Returns the signed offset of the last cached neighbour reached:
// 0 if the immediate neighbour is not cached, negative values for a negative
// dir. Returns the number of documents if every visited neighbour is cached.
int DocumentCache::CountNeighbourDocs(int index, int dir) {
  const int doc_count = documents_.size();
  int offset = dir;
  while (abs(offset) < doc_count) {
    // Adding doc_count keeps the operand of % non-negative when dir < 0.
    const int neighbour = (index + offset + doc_count) % doc_count;
    if (!documents_[neighbour]->IsCached()) return offset - dir;
    offset += dir;
  }
  return doc_count;
}
} // namespace tesseract.

View File

@ -25,6 +25,7 @@
#include "normalis.h"
#include "rect.h"
#include "strngs.h"
#include "svutil.h"
struct Pix;
@ -34,8 +35,22 @@ namespace tesseract {
const int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
const int kImagePadding = 4;
// Number of training images to combine into a mini-batch for training.
const int kNumPagesPerMiniBatch = 100;
// Selects how training samples are cached and in what order they are served.
enum CachingStrategy {
  // Work through one file completely before starting the next. Samples must
  // already be shuffled across files. The sample count of the first file is
  // taken as the count of every file, which gives high-speed random access;
  // the price is that entries of smaller later files are used more than once
  // and some entries of larger later files are never used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL,
  // Take one sample from each file in rotation. Needs no shuffling, but is
  // extremely disk-intensive, and samples in smaller files are used more
  // often than samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN
};
class WordFeature {
public:
@ -103,6 +118,8 @@ class ImageData {
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, TFile* fp);
// As DeSerialize, but only seeks past the data - hence a static method.
static bool SkipDeSerialize(bool swap, tesseract::TFile* fp);
// Other accessors.
const STRING& imagefilename() const {
@ -145,11 +162,12 @@ class ImageData {
// Gets anything and everything with a non-NULL pointer, prescaled to a
// given target_height (if 0, then the original image height), and aligned.
// Also returns (if not NULL) the width and height of the scaled image.
// The return value is the scale factor that was applied to the image to
// achieve the target_height.
float PreScale(int target_height, Pix** pix,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const;
// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
Pix* PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const;
int MemoryUsed() const;
@ -184,6 +202,8 @@ class ImageData {
// A collection of ImageData that knows roughly how much memory it is using.
class DocumentData {
friend void* ReCachePagesFunc(void* data);
public:
explicit DocumentData(const STRING& name);
~DocumentData();
@ -192,6 +212,9 @@ class DocumentData {
// is used to read the file.
bool LoadDocument(const char* filename, const char* lang, int start_page,
inT64 max_memory, FileReader reader);
// Sets up the document, without actually loading it.
void SetDocument(const char* filename, const char* lang, inT64 max_memory,
FileReader reader);
// Writes all the pages to the given filename. Returns false on error.
bool SaveDocument(const char* filename, FileWriter writer);
bool SaveToBuffer(GenericVector<char>* buffer);
@ -200,26 +223,62 @@ class DocumentData {
void AddPageToDocument(ImageData* page);
const STRING& document_name() const {
SVAutoLock lock(&general_mutex_);
return document_name_;
}
int NumPages() const {
SVAutoLock lock(&general_mutex_);
return total_pages_;
}
inT64 memory_used() const {
SVAutoLock lock(&general_mutex_);
return memory_used_;
}
// If the given index is not currently loaded, loads it using a separate
// thread. Note: there are 4 cases:
// Document uncached: IsCached() returns false, total_pages_ < 0.
// Required page is available: IsPageAvailable returns true. In this case,
// total_pages_ > 0 and
// pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
// Pages are loaded, but the required one is not.
// The requested page is being loaded by LoadPageInBackground. In this case,
// index == pages_offset_. Once the loading starts, the pages lock is held
// until it completes, at which point IsPageAvailable will unblock and return
// true.
void LoadPageInBackground(int index);
// Returns a pointer to the page with the given index, modulo the total
// number of pages, recaching if needed.
// number of pages. Blocks until the background load is completed.
const ImageData* GetPage(int index);
// Returns true if the requested page is available, and provides a pointer,
// which may be NULL if the document is empty. May block, even though it
// doesn't guarantee to return true.
bool IsPageAvailable(int index, ImageData** page);
// Hands the page stored at pages_[index] over to the caller and leaves NULL
// in its place, all under pages_mutex_. index addresses pages_ directly and
// is not range-checked here.
ImageData* TakePage(int index) {
  SVAutoLock lock(&pages_mutex_);
  ImageData*& slot = pages_[index];
  ImageData* taken = slot;
  slot = NULL;
  return taken;
}
// Returns true if the document is currently loaded or in the process of
// loading.
bool IsCached() const { return NumPages() >= 0; }
// Removes all pages from memory and frees the memory, but does not forget
// the document metadata. Returns the memory saved.
inT64 UnCache();
private:
// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
// Sets the value of total_pages_ behind a mutex.
void set_total_pages(int total) {
SVAutoLock lock(&general_mutex_);
total_pages_ = total;
}
void set_memory_used(inT64 memory_used) {
SVAutoLock lock(&general_mutex_);
memory_used_ = memory_used;
}
// Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
// starting at index pages_offset_.
bool ReCachePages();
private:
@ -239,43 +298,77 @@ class DocumentData {
inT64 max_memory_;
// Saved reader from LoadDocument to allow re-caching.
FileReader reader_;
// Mutex that protects pages_ and pages_offset_ against multiple parallel
// loads, and provides a wait for page.
SVMutex pages_mutex_;
// Mutex that protects other data members that callers want to access without
// waiting for a load operation.
mutable SVMutex general_mutex_;
};
// A collection of DocumentData that knows roughly how much memory it is using.
// Note that while it supports background read-ahead, it assumes that a single
// thread is accessing documents, ie it is not safe for multiple threads to
// access different documents in parallel, as one may de-cache the other's
// content.
class DocumentCache {
public:
explicit DocumentCache(inT64 max_memory);
~DocumentCache();
// Deletes all existing documents from the cache.
void Clear() {
documents_.clear();
num_pages_per_doc_ = 0;
}
// Adds all the documents in the list of filenames, counting memory.
// The reader is used to read the files.
bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
FileReader reader);
CachingStrategy cache_strategy, FileReader reader);
// Adds document to the cache, throwing out other documents if needed.
// Adds document to the cache.
bool AddToCache(DocumentData* data);
// Finds and returns a document by name.
DocumentData* FindDocument(const STRING& document_name) const;
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents.
const ImageData* GetPageBySerial(int serial);
// Maps a serial number to a page according to cache_strategy_: CS_SEQUENTIAL
// goes through GetPageSequential, anything else through GetPageRoundRobin.
const ImageData* GetPageBySerial(int serial) {
  return cache_strategy_ == CS_SEQUENTIAL ? GetPageSequential(serial)
                                          : GetPageRoundRobin(serial);
}
const PointerVector<DocumentData>& documents() const {
return documents_;
}
int total_pages() const {
return total_pages_;
}
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
// strategy, could take a long time.
int TotalPages();
private:
// Returns a page by serial number, selecting them in a round-robin fashion
// from all the documents. Highly disk-intensive, but doesn't need samples
// to be shuffled between files to begin with.
const ImageData* GetPageRoundRobin(int serial);
// Returns a page by serial number, selecting them in sequence from each file.
// Requires the samples to be shuffled between the files to give a random or
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
const ImageData* GetPageSequential(int serial);
// Helper counts the number of adjacent cached neighbour documents_ of index
// looking in direction dir, ie index+dir, index+2*dir etc.
int CountNeighbourDocs(int index, int dir);
// A group of pages that corresponds in some loose way to a document.
PointerVector<DocumentData> documents_;
// Total of all pages.
int total_pages_;
// Total of all memory used by the cache.
inT64 memory_used_;
// Strategy to use for caching and serializing data samples.
CachingStrategy cache_strategy_;
// Number of pages in the first document, used as a divisor in
// GetPageSequential to determine the document index.
int num_pages_per_doc_;
// Max memory allowed in this cache.
inT64 max_memory_;
};