diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp
index 3c244c77..11e1c862 100644
--- a/ccstruct/imagedata.cpp
+++ b/ccstruct/imagedata.cpp
@@ -30,6 +30,14 @@
 #include "helpers.h"
 #include "tprintf.h"

+#if __cplusplus > 199711L // C++11 support
+#include <thread>
+#endif
+
+// Number of documents to read ahead while training. Doesn't need to be very
+// large.
+const int kMaxReadAhead = 8;
+
 namespace tesseract {

 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
@@ -182,6 +190,19 @@ bool ImageData::DeSerialize(bool swap, TFile* fp) {
   return true;
 }

+// As DeSerialize, but only seeks past the data - hence a static method.
+bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  inT32 page_number;
+  if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
+  if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
+  inT8 vertical = 0;
+  return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
+}
+
 // Saves the given Pix as a PNG-encoded string and destroys it.
 void ImageData::SetPix(Pix* pix) {
   SetPixInternal(pix, &image_data_);
@@ -195,37 +216,34 @@ Pix* ImageData::GetPix() const {
 // Gets anything and everything with a non-NULL pointer, prescaled to a
 // given target_height (if 0, then the original image height), and aligned.
 // Also returns (if not NULL) the width and height of the scaled image.
-// The return value is the scale factor that was applied to the image to
-// achieve the target_height.
-float ImageData::PreScale(int target_height, Pix** pix,
-                          int* scaled_width, int* scaled_height,
-                          GenericVector<TBOX>* boxes) const {
+// The return value is the scaled Pix, which must be pixDestroyed after use,
+// and scale_factor (if not NULL) is set to the scale factor that was applied
+// to the image to achieve the target_height.
+Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
+                         int* scaled_width, int* scaled_height,
+                         GenericVector<TBOX>* boxes) const {
   int input_width = 0;
   int input_height = 0;
   Pix* src_pix = GetPix();
   ASSERT_HOST(src_pix != NULL);
   input_width = pixGetWidth(src_pix);
   input_height = pixGetHeight(src_pix);
-  if (target_height == 0)
-    target_height = input_height;
+  if (target_height == 0) {
+    target_height = MIN(input_height, max_height);
+  }
   float im_factor = static_cast<float>(target_height) / input_height;
   if (scaled_width != NULL)
     *scaled_width = IntCastRounded(im_factor * input_width);
   if (scaled_height != NULL)
     *scaled_height = target_height;
-  if (pix != NULL) {
-    // Get the scaled image.
-    pixDestroy(pix);
-    *pix = pixScale(src_pix, im_factor, im_factor);
-    if (*pix == NULL) {
-      tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
-              input_width, input_height, im_factor);
-    }
-    if (scaled_width != NULL)
-      *scaled_width = pixGetWidth(*pix);
-    if (scaled_height != NULL)
-      *scaled_height = pixGetHeight(*pix);
+  // Get the scaled image.
+  Pix* pix = pixScale(src_pix, im_factor, im_factor);
+  if (pix == NULL) {
+    tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
+            input_width, input_height, im_factor);
   }
+  if (scaled_width != NULL) *scaled_width = pixGetWidth(pix);
+  if (scaled_height != NULL) *scaled_height = pixGetHeight(pix);
   pixDestroy(&src_pix);
   if (boxes != NULL) {
     // Get the boxes.
@@ -241,7 +259,8 @@ float ImageData::PreScale(int target_height, Pix** pix,
       boxes->push_back(box);
     }
   }
-  return im_factor;
+  if (scale_factor != NULL) *scale_factor = im_factor;
+  return pix;
 }

 int ImageData::MemoryUsed() const {
@@ -266,19 +285,20 @@ void ImageData::Display() const {
   // Draw the boxes.
   win->Pen(ScrollView::RED);
   win->Brush(ScrollView::NONE);
-  win->TextAttributes("Arial", kTextSize, false, false, false);
-  for (int b = 0; b < boxes_.size(); ++b) {
-    boxes_[b].plot(win);
-    win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
-    TBOX scaled(boxes_[b]);
-    scaled.scale(256.0 / height);
-    scaled.plot(win);
+  int text_size = kTextSize;
+  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
+    text_size = boxes_[0].height() * 2;
+  win->TextAttributes("Arial", text_size, false, false, false);
+  if (!boxes_.empty()) {
+    for (int b = 0; b < boxes_.size(); ++b) {
+      boxes_[b].plot(win);
+      win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
+    }
+  } else {
+    // The full transcription.
+    win->Pen(ScrollView::CYAN);
+    win->Text(0, height + kTextSize * 2, transcription_.string());
   }
-  // The full transcription.
-  win->Pen(ScrollView::CYAN);
-  win->Text(0, height + kTextSize * 2, transcription_.string());
-  // Add the features.
-  win->Pen(ScrollView::GREEN);
   win->Update();
   window_wait(win);
 #endif
@@ -340,27 +360,51 @@ bool ImageData::AddBoxes(const char* box_text) {
   return false;
 }

-DocumentData::DocumentData(const STRING& name)
-    : document_name_(name), pages_offset_(0), total_pages_(0),
-      memory_used_(0), max_memory_(0), reader_(NULL) {}
+// Thread function to call ReCachePages.
+void* ReCachePagesFunc(void* data) {
+  DocumentData* document_data = reinterpret_cast<DocumentData*>(data);
+  document_data->ReCachePages();
+  return NULL;
+}

-DocumentData::~DocumentData() {}
+DocumentData::DocumentData(const STRING& name)
+    : document_name_(name),
+      pages_offset_(-1),
+      total_pages_(-1),
+      memory_used_(0),
+      max_memory_(0),
+      reader_(NULL) {}
+
+DocumentData::~DocumentData() {
+  SVAutoLock lock_p(&pages_mutex_);
+  SVAutoLock lock_g(&general_mutex_);
+}

 // Reads all the pages in the given lstmf filename to the cache. The reader
 // is used to read the file.
 bool DocumentData::LoadDocument(const char* filename, const char* lang,
                                 int start_page, inT64 max_memory,
                                 FileReader reader) {
+  SetDocument(filename, lang, max_memory, reader);
+  pages_offset_ = start_page;
+  return ReCachePages();
+}
+
+// Sets up the document, without actually loading it.
+void DocumentData::SetDocument(const char* filename, const char* lang,
+                               inT64 max_memory, FileReader reader) {
+  SVAutoLock lock_p(&pages_mutex_);
+  SVAutoLock lock(&general_mutex_);
   document_name_ = filename;
   lang_ = lang;
-  pages_offset_ = start_page;
+  pages_offset_ = -1;
   max_memory_ = max_memory;
   reader_ = reader;
-  return ReCachePages();
 }

 // Writes all the pages to the given filename. Returns false on error.
 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
+  SVAutoLock lock(&pages_mutex_);
   TFile fp;
   fp.OpenWrite(NULL);
   if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
@@ -370,112 +414,169 @@ bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
   return true;
 }
 bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
+  SVAutoLock lock(&pages_mutex_);
   TFile fp;
   fp.OpenWrite(buffer);
   return pages_.Serialize(&fp);
 }

-// Returns a pointer to the page with the given index, modulo the total
-// number of pages, recaching if needed.
-const ImageData* DocumentData::GetPage(int index) {
-  index = Modulo(index, total_pages_);
-  if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) {
-    pages_offset_ = index;
-    if (!ReCachePages()) return NULL;
-  }
-  return pages_[index - pages_offset_];
+// Adds the given page data to this document, counting up memory.
+void DocumentData::AddPageToDocument(ImageData* page) {
+  SVAutoLock lock(&pages_mutex_);
+  pages_.push_back(page);
+  set_memory_used(memory_used() + page->MemoryUsed());
 }

-// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
+// If the given index is not currently loaded, loads it using a separate
+// thread.
+void DocumentData::LoadPageInBackground(int index) {
+  ImageData* page = NULL;
+  if (IsPageAvailable(index, &page)) return;
+  SVAutoLock lock(&pages_mutex_);
+  if (pages_offset_ == index) return;
+  pages_offset_ = index;
+  pages_.clear();
+  SVSync::StartThread(ReCachePagesFunc, this);
+}
+
+// Returns a pointer to the page with the given index, modulo the total
+// number of pages. Blocks until the background load is completed.
+const ImageData* DocumentData::GetPage(int index) {
+  ImageData* page = NULL;
+  while (!IsPageAvailable(index, &page)) {
+    // If there is no background load scheduled, schedule one now.
+    pages_mutex_.Lock();
+    bool needs_loading = pages_offset_ != index;
+    pages_mutex_.Unlock();
+    if (needs_loading) LoadPageInBackground(index);
+    // We can't directly load the page, or the background load will delete it
+    // while the caller is using it, so give it a chance to work.
+#if __cplusplus > 199711L // C++11 support
+    // TODO: We need to fix this for compilers without C++11 support (e.g. VS2010)
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+#endif
+  }
+  return page;
+}
+
+// Returns true if the requested page is available, and provides a pointer,
+// which may be NULL if the document is empty. May block, even though it
+// doesn't guarantee to return true.
+bool DocumentData::IsPageAvailable(int index, ImageData** page) {
+  SVAutoLock lock(&pages_mutex_);
+  int num_pages = NumPages();
+  if (num_pages == 0 || index < 0) {
+    *page = NULL;  // Empty Document.
+    return true;
+  }
+  if (num_pages > 0) {
+    index = Modulo(index, num_pages);
+    if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
+      *page = pages_[index - pages_offset_];  // Page is available already.
+      return true;
+    }
+  }
+  return false;
+}
+
+// Removes all pages from memory and frees the memory, but does not forget
+// the document metadata.
+inT64 DocumentData::UnCache() {
+  SVAutoLock lock(&pages_mutex_);
+  inT64 memory_saved = memory_used();
+  pages_.clear();
+  pages_offset_ = -1;
+  set_total_pages(-1);
+  set_memory_used(0);
+  tprintf("Unloaded document %s, saving %d memory\n", document_name_.string(),
+          memory_saved);
+  return memory_saved;
+}
+
+// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
+// starting at index pages_offset_.
 bool DocumentData::ReCachePages() {
+  SVAutoLock lock(&pages_mutex_);
   // Read the file.
+  set_total_pages(0);
+  set_memory_used(0);
+  int loaded_pages = 0;
+  pages_.truncate(0);
   TFile fp;
-  if (!fp.Open(document_name_, reader_)) return false;
-  memory_used_ = 0;
-  if (!pages_.DeSerialize(false, &fp)) {
-    tprintf("Deserialize failed: %s\n", document_name_.string());
-    pages_.truncate(0);
+  if (!fp.Open(document_name_, reader_) ||
+      !PointerVector<ImageData>::DeSerializeSize(false, &fp, &loaded_pages) ||
+      loaded_pages <= 0) {
+    tprintf("Deserialize header failed: %s\n", document_name_.string());
     return false;
   }
-  total_pages_ = pages_.size();
-  pages_offset_ %= total_pages_;
-  // Delete pages before the first one we want, and relocate the rest.
+  pages_offset_ %= loaded_pages;
+  // Skip pages before the first one we want, and load the rest until max
+  // memory and skip the rest after that.
   int page;
-  for (page = 0; page < pages_.size(); ++page) {
-    if (page < pages_offset_) {
-      delete pages_[page];
-      pages_[page] = NULL;
+  for (page = 0; page < loaded_pages; ++page) {
+    if (page < pages_offset_ ||
+        (max_memory_ > 0 && memory_used() > max_memory_)) {
+      if (!PointerVector<ImageData>::DeSerializeSkip(false, &fp)) break;
     } else {
-      ImageData* image_data = pages_[page];
-      if (max_memory_ > 0 && page > pages_offset_ &&
-          memory_used_ + image_data->MemoryUsed() > max_memory_)
-        break;  // Don't go over memory quota unless the first image.
+      if (!pages_.DeSerializeElement(false, &fp)) break;
+      ImageData* image_data = pages_.back();
       if (image_data->imagefilename().length() == 0) {
        image_data->set_imagefilename(document_name_);
        image_data->set_page_number(page);
      }
      image_data->set_language(lang_);
-      memory_used_ += image_data->MemoryUsed();
-      if (pages_offset_ != 0) {
-        pages_[page - pages_offset_] = image_data;
-        pages_[page] = NULL;
-      }
+      set_memory_used(memory_used() + image_data->MemoryUsed());
     }
   }
-  pages_.truncate(page - pages_offset_);
-  tprintf("Loaded %d/%d pages (%d-%d) of document %s\n",
-          pages_.size(), total_pages_, pages_offset_,
-          pages_offset_ + pages_.size(), document_name_.string());
+  if (page < loaded_pages) {
+    tprintf("Deserialize failed: %s read %d/%d pages\n",
+            document_name_.string(), page, loaded_pages);
+    pages_.truncate(0);
+  } else {
+    tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(),
+            loaded_pages, pages_offset_, pages_offset_ + pages_.size(),
+            document_name_.string());
+  }
+  set_total_pages(loaded_pages);
   return !pages_.empty();
 }

-// Adds the given page data to this document, counting up memory.
-void DocumentData::AddPageToDocument(ImageData* page) {
-  pages_.push_back(page);
-  memory_used_ += page->MemoryUsed();
-}
-
 // A collection of DocumentData that knows roughly how much memory it is using.
 DocumentCache::DocumentCache(inT64 max_memory)
-  : total_pages_(0), memory_used_(0), max_memory_(max_memory) {}
+    : num_pages_per_doc_(0), max_memory_(max_memory) {}

 DocumentCache::~DocumentCache() {}

 // Adds all the documents in the list of filenames, counting memory.
 // The reader is used to read the files.
 bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
-                                  const char* lang, FileReader reader) {
-  inT64 fair_share_memory = max_memory_ / filenames.size();
+                                  const char* lang,
+                                  CachingStrategy cache_strategy,
+                                  FileReader reader) {
+  cache_strategy_ = cache_strategy;
+  inT64 fair_share_memory = 0;
+  // In the round-robin case, each DocumentData handles restricting its content
+  // to its fair share of memory. In the sequential case, DocumentCache
+  // determines which DocumentDatas are held entirely in memory.
+  if (cache_strategy_ == CS_ROUND_ROBIN)
+    fair_share_memory = max_memory_ / filenames.size();
   for (int arg = 0; arg < filenames.size(); ++arg) {
     STRING filename = filenames[arg];
     DocumentData* document = new DocumentData(filename);
-    if (document->LoadDocument(filename.string(), lang, 0,
-                               fair_share_memory, reader)) {
-      AddToCache(document);
-    } else {
-      tprintf("Failed to load image %s!\n", filename.string());
-      delete document;
-    }
+    document->SetDocument(filename.string(), lang, fair_share_memory, reader);
+    AddToCache(document);
   }
-  tprintf("Loaded %d pages, total %gMB\n",
-          total_pages_, memory_used_ / 1048576.0);
-  return total_pages_ > 0;
+  if (!documents_.empty()) {
+    // Try to get the first page now to verify the list of filenames.
+    if (GetPageBySerial(0) != NULL) return true;
+    tprintf("Load of page 0 failed!\n");
+  }
+  return false;
 }

-// Adds document to the cache, throwing out other documents if needed.
+// Adds document to the cache.
 bool DocumentCache::AddToCache(DocumentData* data) {
   inT64 new_memory = data->memory_used();
-  memory_used_ += new_memory;
   documents_.push_back(data);
-  total_pages_ += data->NumPages();
-  // Delete the first item in the array, and other pages of the same name
-  // while memory is full.
-  while (memory_used_ >= max_memory_ && max_memory_ > 0) {
-    tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n",
-            memory_used_ , max_memory_, documents_[0]->memory_used());
-    memory_used_ -= documents_[0]->memory_used();
-    total_pages_ -= documents_[0]->NumPages();
-    documents_.remove(0);
-  }
   return true;
 }

@@ -488,11 +589,104 @@ DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
   return NULL;
 }

+// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
+// strategy, could take a long time.
+int DocumentCache::TotalPages() {
+  if (cache_strategy_ == CS_SEQUENTIAL) {
+    // In sequential mode, we assume each doc has the same number of pages
+    // whether it is true or not.
+    if (num_pages_per_doc_ == 0) GetPageSequential(0);
+    return num_pages_per_doc_ * documents_.size();
+  }
+  int total_pages = 0;
+  int num_docs = documents_.size();
+  for (int d = 0; d < num_docs; ++d) {
+    // We have to load a page to make NumPages() valid.
+    documents_[d]->GetPage(0);
+    total_pages += documents_[d]->NumPages();
+  }
+  return total_pages;
+}
+
 // Returns a page by serial number, selecting them in a round-robin fashion
-// from all the documents.
-const ImageData* DocumentCache::GetPageBySerial(int serial) {
-  int document_index = serial % documents_.size();
-  return documents_[document_index]->GetPage(serial / documents_.size());
+// from all the documents. Highly disk-intensive, but doesn't need samples
+// to be shuffled between files to begin with.
+const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
+  int num_docs = documents_.size();
+  int doc_index = serial % num_docs;
+  const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs);
+  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
+    doc_index = (serial + offset) % num_docs;
+    int page = (serial + offset) / num_docs;
+    documents_[doc_index]->LoadPageInBackground(page);
+  }
+  return doc;
+}
+
+// Returns a page by serial number, selecting them in sequence from each file.
+// Requires the samples to be shuffled between the files to give a random or
+// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
+const ImageData* DocumentCache::GetPageSequential(int serial) {
+  int num_docs = documents_.size();
+  ASSERT_HOST(num_docs > 0);
+  if (num_pages_per_doc_ == 0) {
+    // Use the pages in the first doc as the number of pages in each doc.
+    documents_[0]->GetPage(0);
+    num_pages_per_doc_ = documents_[0]->NumPages();
+    if (num_pages_per_doc_ == 0) {
+      tprintf("First document cannot be empty!!\n");
+      ASSERT_HOST(num_pages_per_doc_ > 0);
+    }
+    // Get rid of zero now if we don't need it.
+    if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
+  }
+  int doc_index = serial / num_pages_per_doc_ % num_docs;
+  const ImageData* doc =
+      documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
+  // Count up total memory. Background loading makes it more complicated to
+  // keep a running count.
+  inT64 total_memory = 0;
+  for (int d = 0; d < num_docs; ++d) {
+    total_memory += documents_[d]->memory_used();
+  }
+  if (total_memory >= max_memory_) {
+    // Find something to un-cache.
+    // If there are more than 3 in front, then serial is from the back reader
+    // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
+    // we create a hole between them and then un-caching the backmost occupied
+    // will work for both.
+    int num_in_front = CountNeighbourDocs(doc_index, 1);
+    for (int offset = num_in_front - 2;
+         offset > 1 && total_memory >= max_memory_; --offset) {
+      int next_index = (doc_index + offset) % num_docs;
+      total_memory -= documents_[next_index]->UnCache();
+    }
+    // If that didn't work, the best solution is to un-cache from the back. If
+    // we take away the document that a 2nd reader is using, it will put it
+    // back and make a hole between.
+    int num_behind = CountNeighbourDocs(doc_index, -1);
+    for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
+         ++offset) {
+      int next_index = (doc_index + offset + num_docs) % num_docs;
+      total_memory -= documents_[next_index]->UnCache();
+    }
+  }
+  int next_index = (doc_index + 1) % num_docs;
+  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
+    documents_[next_index]->LoadPageInBackground(0);
+  }
+  return doc;
+}
+
+// Helper counts the number of adjacent cached neighbours of index looking in
+// direction dir, ie index+dir, index+2*dir etc.
+int DocumentCache::CountNeighbourDocs(int index, int dir) {
+  int num_docs = documents_.size();
+  for (int offset = dir; abs(offset) < num_docs; offset += dir) {
+    int offset_index = (index + offset + num_docs) % num_docs;
+    if (!documents_[offset_index]->IsCached()) return offset - dir;
+  }
+  return num_docs;
 }

 }  // namespace tesseract.
diff --git a/ccstruct/imagedata.h b/ccstruct/imagedata.h
index 6321f121..ae672293 100644
--- a/ccstruct/imagedata.h
+++ b/ccstruct/imagedata.h
@@ -25,6 +25,7 @@
 #include "normalis.h"
 #include "rect.h"
 #include "strngs.h"
+#include "svutil.h"

 struct Pix;

@@ -34,8 +35,22 @@ namespace tesseract {
 const int kFeaturePadding = 2;
 // Number of pixels to pad around text boxes.
 const int kImagePadding = 4;
-// Number of training images to combine into a mini-batch for training.
-const int kNumPagesPerMiniBatch = 100;
+
+// Enum to determine the caching and data sequencing strategy.
+enum CachingStrategy {
+  // Reads all of one file before moving on to the next. Requires samples to be
+  // shuffled across files. Uses the count of samples in the first file as
+  // the count in all the files to achieve high-speed random access. As a
+  // consequence, if subsequent files are smaller, they get entries used more
+  // than once, and if subsequent files are larger, some entries are not used.
+  // Best for larger data sets that don't fit in memory.
+  CS_SEQUENTIAL,
+  // Reads one sample from each file in rotation. Does not require shuffled
+  // samples, but is extremely disk-intensive. Samples in smaller files also
+  // get used more often than samples in larger files.
+  // Best for smaller data sets that mostly fit in memory.
+  CS_ROUND_ROBIN,
+};

 class WordFeature {
  public:
@@ -103,6 +118,8 @@ class ImageData {
   // Reads from the given file. Returns false in case of error.
   // If swap is true, assumes a big/little-endian swap is needed.
   bool DeSerialize(bool swap, TFile* fp);
+  // As DeSerialize, but only seeks past the data - hence a static method.
+  static bool SkipDeSerialize(bool swap, tesseract::TFile* fp);

   // Other accessors.
   const STRING& imagefilename() const {
@@ -145,11 +162,12 @@ class ImageData {
   // Gets anything and everything with a non-NULL pointer, prescaled to a
   // given target_height (if 0, then the original image height), and aligned.
   // Also returns (if not NULL) the width and height of the scaled image.
-  // The return value is the scale factor that was applied to the image to
-  // achieve the target_height.
-  float PreScale(int target_height, Pix** pix,
-                 int* scaled_width, int* scaled_height,
-                 GenericVector<TBOX>* boxes) const;
+  // The return value is the scaled Pix, which must be pixDestroyed after use,
+  // and scale_factor (if not NULL) is set to the scale factor that was applied
+  // to the image to achieve the target_height.
+  Pix* PreScale(int target_height, int max_height, float* scale_factor,
+                int* scaled_width, int* scaled_height,
+                GenericVector<TBOX>* boxes) const;

   int MemoryUsed() const;

@@ -184,6 +202,8 @@ class ImageData {

 // A collection of ImageData that knows roughly how much memory it is using.
 class DocumentData {
+  friend void* ReCachePagesFunc(void* data);
+
  public:
   explicit DocumentData(const STRING& name);
   ~DocumentData();
@@ -192,6 +212,9 @@ class DocumentData {
   // is used to read the file.
   bool LoadDocument(const char* filename, const char* lang, int start_page,
                     inT64 max_memory, FileReader reader);
+  // Sets up the document, without actually loading it.
+  void SetDocument(const char* filename, const char* lang, inT64 max_memory,
+                   FileReader reader);
   // Writes all the pages to the given filename. Returns false on error.
   bool SaveDocument(const char* filename, FileWriter writer);
   bool SaveToBuffer(GenericVector<char>* buffer);
@@ -200,26 +223,62 @@ class DocumentData {
   void AddPageToDocument(ImageData* page);

   const STRING& document_name() const {
+    SVAutoLock lock(&general_mutex_);
     return document_name_;
   }
   int NumPages() const {
+    SVAutoLock lock(&general_mutex_);
     return total_pages_;
   }
   inT64 memory_used() const {
+    SVAutoLock lock(&general_mutex_);
     return memory_used_;
   }
+  // If the given index is not currently loaded, loads it using a separate
+  // thread. Note: there are 4 cases:
+  // Document uncached: IsCached() returns false, total_pages_ < 0.
+  // Required page is available: IsPageAvailable returns true. In this case,
+  // total_pages_ > 0 and
+  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
+  // Pages are loaded, but the required one is not.
+  // The requested page is being loaded by LoadPageInBackground. In this case,
+  // index == pages_offset_. Once the loading starts, the pages lock is held
+  // until it completes, at which point IsPageAvailable will unblock and return
+  // true.
+  void LoadPageInBackground(int index);
   // Returns a pointer to the page with the given index, modulo the total
-  // number of pages, recaching if needed.
+  // number of pages. Blocks until the background load is completed.
   const ImageData* GetPage(int index);
+  // Returns true if the requested page is available, and provides a pointer,
+  // which may be NULL if the document is empty. May block, even though it
+  // doesn't guarantee to return true.
+  bool IsPageAvailable(int index, ImageData** page);
   // Takes ownership of the given page index. The page is made NULL in *this.
   ImageData* TakePage(int index) {
+    SVAutoLock lock(&pages_mutex_);
     ImageData* page = pages_[index];
     pages_[index] = NULL;
     return page;
   }
+  // Returns true if the document is currently loaded or in the process of
+  // loading.
+  bool IsCached() const { return NumPages() >= 0; }
+  // Removes all pages from memory and frees the memory, but does not forget
+  // the document metadata. Returns the memory saved.
+  inT64 UnCache();

  private:
-  // Loads as many pages can fit in max_memory_ starting at index pages_offset_.
+  // Sets the value of total_pages_ behind a mutex.
+  void set_total_pages(int total) {
+    SVAutoLock lock(&general_mutex_);
+    total_pages_ = total;
+  }
+  void set_memory_used(inT64 memory_used) {
+    SVAutoLock lock(&general_mutex_);
+    memory_used_ = memory_used;
+  }
+  // Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
+  // starting at index pages_offset_.
   bool ReCachePages();

  private:
@@ -239,43 +298,77 @@ class DocumentData {
   inT64 max_memory_;
   // Saved reader from LoadDocument to allow re-caching.
   FileReader reader_;
+  // Mutex that protects pages_ and pages_offset_ against multiple parallel
+  // loads, and provides a wait for page.
+  SVMutex pages_mutex_;
+  // Mutex that protects other data members that callers want to access without
+  // waiting for a load operation.
+  mutable SVMutex general_mutex_;
 };

 // A collection of DocumentData that knows roughly how much memory it is using.
+// Note that while it supports background read-ahead, it assumes that a single
+// thread is accessing documents, ie it is not safe for multiple threads to
+// access different documents in parallel, as one may de-cache the other's
+// content.
 class DocumentCache {
  public:
   explicit DocumentCache(inT64 max_memory);
   ~DocumentCache();

+  // Deletes all existing documents from the cache.
+  void Clear() {
+    documents_.clear();
+    num_pages_per_doc_ = 0;
+  }
   // Adds all the documents in the list of filenames, counting memory.
   // The reader is used to read the files.
   bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
-                     FileReader reader);
+                     CachingStrategy cache_strategy, FileReader reader);

-  // Adds document to the cache, throwing out other documents if needed.
+  // Adds document to the cache.
   bool AddToCache(DocumentData* data);

   // Finds and returns a document by name.
   DocumentData* FindDocument(const STRING& document_name) const;

-  // Returns a page by serial number, selecting them in a round-robin fashion
-  // from all the documents.
-  const ImageData* GetPageBySerial(int serial);
+  // Returns a page by serial number using the current cache_strategy_ to
+  // determine the mapping from serial number to page.
+  const ImageData* GetPageBySerial(int serial) {
+    if (cache_strategy_ == CS_SEQUENTIAL)
+      return GetPageSequential(serial);
+    else
+      return GetPageRoundRobin(serial);
+  }

   const PointerVector<DocumentData>& documents() const {
     return documents_;
   }
-  int total_pages() const {
-    return total_pages_;
-  }
+  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
+  // strategy, could take a long time.
+  int TotalPages();

  private:
+  // Returns a page by serial number, selecting them in a round-robin fashion
+  // from all the documents. Highly disk-intensive, but doesn't need samples
+  // to be shuffled between files to begin with.
+  const ImageData* GetPageRoundRobin(int serial);
+  // Returns a page by serial number, selecting them in sequence from each file.
+  // Requires the samples to be shuffled between the files to give a random or
+  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
+  const ImageData* GetPageSequential(int serial);
+
+  // Helper counts the number of adjacent cached neighbour documents_ of index
+  // looking in direction dir, ie index+dir, index+2*dir etc.
+  int CountNeighbourDocs(int index, int dir);
+
   // A group of pages that corresponds in some loose way to a document.
   PointerVector<DocumentData> documents_;
-  // Total of all pages.
-  int total_pages_;
-  // Total of all memory used by the cache.
-  inT64 memory_used_;
+  // Strategy to use for caching and serializing data samples.
+  CachingStrategy cache_strategy_;
+  // Number of pages in the first document, used as a divisor in
+  // GetPageSequential to determine the document index.
+  int num_pages_per_doc_;
   // Max memory allowed in this cache.
   inT64 max_memory_;
 };
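
Usage notes (illustrative only, not part of the patch):

The PreScale() signature change above makes the scaled Pix the return value, owned by the caller, and moves the scale factor to an optional out-parameter. The sketch below shows how a call site might be updated; the function name and the 48-pixel target / 300-pixel cap are made up, and it assumes Leptonica's allheaders.h for pixDestroy().

#include "allheaders.h"   // Leptonica, for pixDestroy().
#include "imagedata.h"

// Illustrative sketch of a call site updated for the new PreScale() API.
void ScaleForNetwork(const tesseract::ImageData& image_data) {
  GenericVector<TBOX> boxes;
  int scaled_width = 0, scaled_height = 0;
  float scale_factor = 1.0f;
  // Old API: float factor = image_data.PreScale(48, &pix, &w, &h, &boxes);
  // New API: the scaled Pix is returned and must be pixDestroyed by the caller.
  Pix* pix = image_data.PreScale(48, 300, &scale_factor, &scaled_width,
                                 &scaled_height, &boxes);
  if (pix != NULL) {
    // ... hand the scaled image and the scaled boxes to the consumer ...
    pixDestroy(&pix);
  }
}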
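
For readers following GetPageSequential(), the serial-to-page mapping is plain integer division and modulus over num_pages_per_doc_ and the document count. The standalone sketch below uses made-up counts (3 documents, 4 pages each) and just prints the same arithmetic so the ordering is easy to see: all pages of one document are consumed before moving to the next, and serials past one epoch simply wrap because both expressions are taken modulo their respective counts.

#include <cstdio>

// Mirrors the index arithmetic used by DocumentCache::GetPageSequential().
int main() {
  const int num_docs = 3;           // e.g. three .lstmf files
  const int num_pages_per_doc = 4;  // taken from the first document
  for (int serial = 0; serial < 2 * num_docs * num_pages_per_doc; ++serial) {
    int doc_index = serial / num_pages_per_doc % num_docs;
    int page_index = serial % num_pages_per_doc;
    printf("serial %2d -> doc %d, page %d\n", serial, doc_index, page_index);
  }
  return 0;
}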
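
As far as can be inferred from this patch alone (the trainer that drives it is not part of this diff), the intended call pattern is to hand DocumentCache the list of lstmf files and a caching strategy up front, then pull pages by serial number and let the cache schedule background loads and eviction. A hypothetical sketch follows; the function name and the 1 GB budget are invented, and it assumes a NULL FileReader selects the default file loader.

#include "genericvector.h"
#include "imagedata.h"
#include "strngs.h"

// Hypothetical training-style loop over the cache; names and numbers are
// illustrative only.
bool RunOneEpoch(const GenericVector<STRING>& filenames) {
  tesseract::DocumentCache cache(1073741824);  // ~1GB memory budget.
  if (!cache.LoadDocuments(filenames, "eng", tesseract::CS_SEQUENTIAL, NULL))
    return false;  // LoadDocuments already verified that page 0 is readable.
  int epoch_size = cache.TotalPages();
  for (int serial = 0; serial < epoch_size; ++serial) {
    const tesseract::ImageData* page = cache.GetPageBySerial(serial);
    if (page == NULL) continue;
    // ... consume *page; the cache keeps ownership of the ImageData ...
  }
  return true;
}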