From c1d37120a5d6723c37470e72672c29d242c8cb4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zdenko=20Podobn=C3=BD?=
Date: Wed, 7 Dec 2016 15:55:27 +0100
Subject: [PATCH] backport from 4.00: imagedata

---
 ccstruct/imagedata.cpp | 410 ++++++++++++++++++++++++++++++-----------
 ccstruct/imagedata.h   | 135 +++++++++++---
 2 files changed, 416 insertions(+), 129 deletions(-)

diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp
index 3c244c77..11e1c862 100644
--- a/ccstruct/imagedata.cpp
+++ b/ccstruct/imagedata.cpp
@@ -30,6 +30,14 @@
 #include "helpers.h"
 #include "tprintf.h"
 
+#if __cplusplus > 199711L  // C++11 support
+  #include <thread>
+#endif
+
+// Number of documents to read ahead while training. Doesn't need to be very
+// large.
+const int kMaxReadAhead = 8;
+
 namespace tesseract {
 
 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
@@ -182,6 +190,19 @@ bool ImageData::DeSerialize(bool swap, TFile* fp) {
   return true;
 }
 
+// As DeSerialize, but only seeks past the data - hence a static method.
+bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  inT32 page_number;
+  if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
+  if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
+  inT8 vertical = 0;
+  return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
+}
+
 // Saves the given Pix as a PNG-encoded string and destroys it.
 void ImageData::SetPix(Pix* pix) {
   SetPixInternal(pix, &image_data_);
@@ -195,37 +216,34 @@ Pix* ImageData::GetPix() const {
 // Gets anything and everything with a non-NULL pointer, prescaled to a
 // given target_height (if 0, then the original image height), and aligned.
 // Also returns (if not NULL) the width and height of the scaled image.
-// The return value is the scale factor that was applied to the image to
-// achieve the target_height.
-float ImageData::PreScale(int target_height, Pix** pix,
-                          int* scaled_width, int* scaled_height,
-                          GenericVector<TBOX>* boxes) const {
+// The return value is the scaled Pix, which must be pixDestroyed after use,
+// and scale_factor (if not NULL) is set to the scale factor that was applied
+// to the image to achieve the target_height.
+Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
+                         int* scaled_width, int* scaled_height,
+                         GenericVector<TBOX>* boxes) const {
   int input_width = 0;
   int input_height = 0;
   Pix* src_pix = GetPix();
   ASSERT_HOST(src_pix != NULL);
   input_width = pixGetWidth(src_pix);
   input_height = pixGetHeight(src_pix);
-  if (target_height == 0)
-    target_height = input_height;
+  if (target_height == 0) {
+    target_height = MIN(input_height, max_height);
+  }
   float im_factor = static_cast<float>(target_height) / input_height;
   if (scaled_width != NULL)
     *scaled_width = IntCastRounded(im_factor * input_width);
   if (scaled_height != NULL)
     *scaled_height = target_height;
-  if (pix != NULL) {
-    // Get the scaled image.
-    pixDestroy(pix);
-    *pix = pixScale(src_pix, im_factor, im_factor);
-    if (*pix == NULL) {
-      tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
-              input_width, input_height, im_factor);
-    }
-    if (scaled_width != NULL)
-      *scaled_width = pixGetWidth(*pix);
-    if (scaled_height != NULL)
-      *scaled_height = pixGetHeight(*pix);
+  // Get the scaled image.
+  Pix* pix = pixScale(src_pix, im_factor, im_factor);
+  if (pix == NULL) {
+    tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
+            input_width, input_height, im_factor);
   }
+  if (scaled_width != NULL) *scaled_width = pixGetWidth(pix);
+  if (scaled_height != NULL) *scaled_height = pixGetHeight(pix);
   pixDestroy(&src_pix);
   if (boxes != NULL) {
     // Get the boxes.
@@ -241,7 +259,8 @@ float ImageData::PreScale(int target_height, Pix** pix,
       boxes->push_back(box);
     }
   }
-  return im_factor;
+  if (scale_factor != NULL) *scale_factor = im_factor;
+  return pix;
 }
 
 int ImageData::MemoryUsed() const {
@@ -266,19 +285,20 @@ void ImageData::Display() const {
   // Draw the boxes.
   win->Pen(ScrollView::RED);
   win->Brush(ScrollView::NONE);
-  win->TextAttributes("Arial", kTextSize, false, false, false);
-  for (int b = 0; b < boxes_.size(); ++b) {
-    boxes_[b].plot(win);
-    win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
-    TBOX scaled(boxes_[b]);
-    scaled.scale(256.0 / height);
-    scaled.plot(win);
+  int text_size = kTextSize;
+  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
+    text_size = boxes_[0].height() * 2;
+  win->TextAttributes("Arial", text_size, false, false, false);
+  if (!boxes_.empty()) {
+    for (int b = 0; b < boxes_.size(); ++b) {
+      boxes_[b].plot(win);
+      win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
+    }
+  } else {
+    // The full transcription.
+    win->Pen(ScrollView::CYAN);
+    win->Text(0, height + kTextSize * 2, transcription_.string());
   }
-  // The full transcription.
-  win->Pen(ScrollView::CYAN);
-  win->Text(0, height + kTextSize * 2, transcription_.string());
-  // Add the features.
-  win->Pen(ScrollView::GREEN);
   win->Update();
   window_wait(win);
 #endif
@@ -340,27 +360,51 @@ bool ImageData::AddBoxes(const char* box_text) {
   return false;
 }
 
-DocumentData::DocumentData(const STRING& name)
-  : document_name_(name), pages_offset_(0), total_pages_(0),
-    memory_used_(0), max_memory_(0), reader_(NULL) {}
+// Thread function to call ReCachePages.
+void* ReCachePagesFunc(void* data) {
+  DocumentData* document_data = reinterpret_cast<DocumentData*>(data);
+  document_data->ReCachePages();
+  return NULL;
+}
 
-DocumentData::~DocumentData() {}
+DocumentData::DocumentData(const STRING& name)
+    : document_name_(name),
+      pages_offset_(-1),
+      total_pages_(-1),
+      memory_used_(0),
+      max_memory_(0),
+      reader_(NULL) {}
+
+DocumentData::~DocumentData() {
+  SVAutoLock lock_p(&pages_mutex_);
+  SVAutoLock lock_g(&general_mutex_);
+}
 
 // Reads all the pages in the given lstmf filename to the cache. The reader
 // is used to read the file.
 bool DocumentData::LoadDocument(const char* filename, const char* lang,
                                 int start_page, inT64 max_memory,
                                 FileReader reader) {
+  SetDocument(filename, lang, max_memory, reader);
+  pages_offset_ = start_page;
+  return ReCachePages();
+}
+
+// Sets up the document, without actually loading it.
+void DocumentData::SetDocument(const char* filename, const char* lang,
+                               inT64 max_memory, FileReader reader) {
+  SVAutoLock lock_p(&pages_mutex_);
+  SVAutoLock lock(&general_mutex_);
   document_name_ = filename;
   lang_ = lang;
-  pages_offset_ = start_page;
+  pages_offset_ = -1;
   max_memory_ = max_memory;
   reader_ = reader;
-  return ReCachePages();
 }
 
 // Writes all the pages to the given filename. Returns false on error.
 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
+  SVAutoLock lock(&pages_mutex_);
   TFile fp;
   fp.OpenWrite(NULL);
   if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
@@ -370,112 +414,169 @@ bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
   return true;
 }
 
 bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
+  SVAutoLock lock(&pages_mutex_);
   TFile fp;
   fp.OpenWrite(buffer);
   return pages_.Serialize(&fp);
 }
 
-// Returns a pointer to the page with the given index, modulo the total
-// number of pages, recaching if needed.
-const ImageData* DocumentData::GetPage(int index) {
-  index = Modulo(index, total_pages_);
-  if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) {
-    pages_offset_ = index;
-    if (!ReCachePages()) return NULL;
-  }
-  return pages_[index - pages_offset_];
+// Adds the given page data to this document, counting up memory.
+void DocumentData::AddPageToDocument(ImageData* page) {
+  SVAutoLock lock(&pages_mutex_);
+  pages_.push_back(page);
+  set_memory_used(memory_used() + page->MemoryUsed());
 }
 
-// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
+// If the given index is not currently loaded, loads it using a separate
+// thread.
+void DocumentData::LoadPageInBackground(int index) {
+  ImageData* page = NULL;
+  if (IsPageAvailable(index, &page)) return;
+  SVAutoLock lock(&pages_mutex_);
+  if (pages_offset_ == index) return;
+  pages_offset_ = index;
+  pages_.clear();
+  SVSync::StartThread(ReCachePagesFunc, this);
+}
+
+// Returns a pointer to the page with the given index, modulo the total
+// number of pages. Blocks until the background load is completed.
+const ImageData* DocumentData::GetPage(int index) {
+  ImageData* page = NULL;
+  while (!IsPageAvailable(index, &page)) {
+    // If there is no background load scheduled, schedule one now.
+    pages_mutex_.Lock();
+    bool needs_loading = pages_offset_ != index;
+    pages_mutex_.Unlock();
+    if (needs_loading) LoadPageInBackground(index);
+    // We can't directly load the page, or the background load will delete it
+    // while the caller is using it, so give it a chance to work.
+#if __cplusplus > 199711L  // C++11 support
+    // TODO: We need to fix this for compilers without C++11 support (e.g. VS2010).
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+#endif
+  }
+  return page;
+}
+
+// Returns true if the requested page is available, and provides a pointer,
+// which may be NULL if the document is empty. May block, even though it
+// doesn't guarantee to return true.
+bool DocumentData::IsPageAvailable(int index, ImageData** page) {
+  SVAutoLock lock(&pages_mutex_);
+  int num_pages = NumPages();
+  if (num_pages == 0 || index < 0) {
+    *page = NULL;  // Empty Document.
+    return true;
+  }
+  if (num_pages > 0) {
+    index = Modulo(index, num_pages);
+    if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
+      *page = pages_[index - pages_offset_];  // Page is available already.
+      return true;
+    }
+  }
+  return false;
+}
+
+// Removes all pages from memory and frees the memory, but does not forget
+// the document metadata.
+inT64 DocumentData::UnCache() {
+  SVAutoLock lock(&pages_mutex_);
+  inT64 memory_saved = memory_used();
+  pages_.clear();
+  pages_offset_ = -1;
+  set_total_pages(-1);
+  set_memory_used(0);
+  tprintf("Unloaded document %s, saving %d memory\n", document_name_.string(),
+          memory_saved);
+  return memory_saved;
+}
+
+// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
+// starting at index pages_offset_.
 bool DocumentData::ReCachePages() {
+  SVAutoLock lock(&pages_mutex_);
   // Read the file.
+  set_total_pages(0);
+  set_memory_used(0);
+  int loaded_pages = 0;
+  pages_.truncate(0);
   TFile fp;
-  if (!fp.Open(document_name_, reader_)) return false;
-  memory_used_ = 0;
-  if (!pages_.DeSerialize(false, &fp)) {
-    tprintf("Deserialize failed: %s\n", document_name_.string());
-    pages_.truncate(0);
+  if (!fp.Open(document_name_, reader_) ||
+      !PointerVector<ImageData>::DeSerializeSize(false, &fp, &loaded_pages) ||
+      loaded_pages <= 0) {
+    tprintf("Deserialize header failed: %s\n", document_name_.string());
     return false;
   }
-  total_pages_ = pages_.size();
-  pages_offset_ %= total_pages_;
-  // Delete pages before the first one we want, and relocate the rest.
+  pages_offset_ %= loaded_pages;
+  // Skip pages before the first one we want, and load the rest until max
+  // memory and skip the rest after that.
   int page;
-  for (page = 0; page < pages_.size(); ++page) {
-    if (page < pages_offset_) {
-      delete pages_[page];
-      pages_[page] = NULL;
+  for (page = 0; page < loaded_pages; ++page) {
+    if (page < pages_offset_ ||
+        (max_memory_ > 0 && memory_used() > max_memory_)) {
+      if (!PointerVector<ImageData>::DeSerializeSkip(false, &fp)) break;
     } else {
-      ImageData* image_data = pages_[page];
-      if (max_memory_ > 0 && page > pages_offset_ &&
-          memory_used_ + image_data->MemoryUsed() > max_memory_)
-        break;  // Don't go over memory quota unless the first image.
+      if (!pages_.DeSerializeElement(false, &fp)) break;
+      ImageData* image_data = pages_.back();
       if (image_data->imagefilename().length() == 0) {
        image_data->set_imagefilename(document_name_);
        image_data->set_page_number(page);
      }
      image_data->set_language(lang_);
-      memory_used_ += image_data->MemoryUsed();
-      if (pages_offset_ != 0) {
-        pages_[page - pages_offset_] = image_data;
-        pages_[page] = NULL;
-      }
+      set_memory_used(memory_used() + image_data->MemoryUsed());
     }
   }
-  pages_.truncate(page - pages_offset_);
-  tprintf("Loaded %d/%d pages (%d-%d) of document %s\n",
-          pages_.size(), total_pages_, pages_offset_,
-          pages_offset_ + pages_.size(), document_name_.string());
+  if (page < loaded_pages) {
+    tprintf("Deserialize failed: %s read %d/%d pages\n",
+            document_name_.string(), page, loaded_pages);
+    pages_.truncate(0);
+  } else {
+    tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(),
+            loaded_pages, pages_offset_, pages_offset_ + pages_.size(),
+            document_name_.string());
+  }
+  set_total_pages(loaded_pages);
   return !pages_.empty();
 }
 
-// Adds the given page data to this document, counting up memory.
-void DocumentData::AddPageToDocument(ImageData* page) {
-  pages_.push_back(page);
-  memory_used_ += page->MemoryUsed();
-}
-
 // A collection of DocumentData that knows roughly how much memory it is using.
 DocumentCache::DocumentCache(inT64 max_memory)
-  : total_pages_(0), memory_used_(0), max_memory_(max_memory) {}
+    : num_pages_per_doc_(0), max_memory_(max_memory) {}
 
 DocumentCache::~DocumentCache() {}
 
 // Adds all the documents in the list of filenames, counting memory.
 // The reader is used to read the files.
 bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
-                                  const char* lang, FileReader reader) {
-  inT64 fair_share_memory = max_memory_ / filenames.size();
+                                  const char* lang,
+                                  CachingStrategy cache_strategy,
+                                  FileReader reader) {
+  cache_strategy_ = cache_strategy;
+  inT64 fair_share_memory = 0;
+  // In the round-robin case, each DocumentData handles restricting its content
+  // to its fair share of memory. In the sequential case, DocumentCache
+  // determines which DocumentDatas are held entirely in memory.
+  if (cache_strategy_ == CS_ROUND_ROBIN)
+    fair_share_memory = max_memory_ / filenames.size();
   for (int arg = 0; arg < filenames.size(); ++arg) {
     STRING filename = filenames[arg];
     DocumentData* document = new DocumentData(filename);
-    if (document->LoadDocument(filename.string(), lang, 0,
-                               fair_share_memory, reader)) {
-      AddToCache(document);
-    } else {
-      tprintf("Failed to load image %s!\n", filename.string());
-      delete document;
-    }
+    document->SetDocument(filename.string(), lang, fair_share_memory, reader);
+    AddToCache(document);
   }
-  tprintf("Loaded %d pages, total %gMB\n",
-          total_pages_, memory_used_ / 1048576.0);
-  return total_pages_ > 0;
+  if (!documents_.empty()) {
+    // Try to get the first page now to verify the list of filenames.
+    if (GetPageBySerial(0) != NULL) return true;
+    tprintf("Load of page 0 failed!\n");
+  }
+  return false;
 }
 
-// Adds document to the cache, throwing out other documents if needed.
+// Adds document to the cache.
 bool DocumentCache::AddToCache(DocumentData* data) {
   inT64 new_memory = data->memory_used();
-  memory_used_ += new_memory;
   documents_.push_back(data);
-  total_pages_ += data->NumPages();
-  // Delete the first item in the array, and other pages of the same name
-  // while memory is full.
-  while (memory_used_ >= max_memory_ && max_memory_ > 0) {
-    tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n",
-            memory_used_ , max_memory_, documents_[0]->memory_used());
-    memory_used_ -= documents_[0]->memory_used();
-    total_pages_ -= documents_[0]->NumPages();
-    documents_.remove(0);
-  }
   return true;
 }
 
@@ -488,11 +589,104 @@ DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
   return NULL;
 }
 
+// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
+// strategy, could take a long time.
+int DocumentCache::TotalPages() {
+  if (cache_strategy_ == CS_SEQUENTIAL) {
+    // In sequential mode, we assume each doc has the same number of pages
+    // whether it is true or not.
+    if (num_pages_per_doc_ == 0) GetPageSequential(0);
+    return num_pages_per_doc_ * documents_.size();
+  }
+  int total_pages = 0;
+  int num_docs = documents_.size();
+  for (int d = 0; d < num_docs; ++d) {
+    // We have to load a page to make NumPages() valid.
+    documents_[d]->GetPage(0);
+    total_pages += documents_[d]->NumPages();
+  }
+  return total_pages;
+}
+
 // Returns a page by serial number, selecting them in a round-robin fashion
-// from all the documents.
-const ImageData* DocumentCache::GetPageBySerial(int serial) {
-  int document_index = serial % documents_.size();
-  return documents_[document_index]->GetPage(serial / documents_.size());
+// from all the documents. Highly disk-intensive, but doesn't need samples
+// to be shuffled between files to begin with.
+const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
+  int num_docs = documents_.size();
+  int doc_index = serial % num_docs;
+  const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs);
+  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
+    doc_index = (serial + offset) % num_docs;
+    int page = (serial + offset) / num_docs;
+    documents_[doc_index]->LoadPageInBackground(page);
+  }
+  return doc;
+}
+
+// Returns a page by serial number, selecting them in sequence from each file.
+// Requires the samples to be shuffled between the files to give a random or
+// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
+const ImageData* DocumentCache::GetPageSequential(int serial) {
+  int num_docs = documents_.size();
+  ASSERT_HOST(num_docs > 0);
+  if (num_pages_per_doc_ == 0) {
+    // Use the pages in the first doc as the number of pages in each doc.
+    documents_[0]->GetPage(0);
+    num_pages_per_doc_ = documents_[0]->NumPages();
+    if (num_pages_per_doc_ == 0) {
+      tprintf("First document cannot be empty!!\n");
+      ASSERT_HOST(num_pages_per_doc_ > 0);
+    }
+    // Get rid of zero now if we don't need it.
+    if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
+  }
+  int doc_index = serial / num_pages_per_doc_ % num_docs;
+  const ImageData* doc =
+      documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
+  // Count up total memory. Background loading makes it more complicated to
+  // keep a running count.
+  inT64 total_memory = 0;
+  for (int d = 0; d < num_docs; ++d) {
+    total_memory += documents_[d]->memory_used();
+  }
+  if (total_memory >= max_memory_) {
+    // Find something to un-cache.
+    // If there are more than 3 in front, then serial is from the back reader
+    // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
+    // we create a hole between them and then un-caching the backmost occupied
+    // will work for both.
+    int num_in_front = CountNeighbourDocs(doc_index, 1);
+    for (int offset = num_in_front - 2;
+         offset > 1 && total_memory >= max_memory_; --offset) {
+      int next_index = (doc_index + offset) % num_docs;
+      total_memory -= documents_[next_index]->UnCache();
+    }
+    // If that didn't work, the best solution is to un-cache from the back. If
+    // we take away the document that a 2nd reader is using, it will put it
+    // back and make a hole between.
+    int num_behind = CountNeighbourDocs(doc_index, -1);
+    for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
+         ++offset) {
+      int next_index = (doc_index + offset + num_docs) % num_docs;
+      total_memory -= documents_[next_index]->UnCache();
+    }
+  }
+  int next_index = (doc_index + 1) % num_docs;
+  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
+    documents_[next_index]->LoadPageInBackground(0);
+  }
+  return doc;
+}
+
+// Helper counts the number of adjacent cached neighbours of index looking in
+// direction dir, ie index+dir, index+2*dir etc.
+int DocumentCache::CountNeighbourDocs(int index, int dir) {
+  int num_docs = documents_.size();
+  for (int offset = dir; abs(offset) < num_docs; offset += dir) {
+    int offset_index = (index + offset + num_docs) % num_docs;
+    if (!documents_[offset_index]->IsCached()) return offset - dir;
+  }
+  return num_docs;
 }
 
 }  // namespace tesseract.
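
The hunks above change PreScale() from filling a caller-supplied Pix** to returning a newly allocated Pix that the caller must destroy, with the scale factor reported through an optional out-parameter. For illustration only (not part of the patch), a minimal caller-side sketch of the new contract; the helper name ScaleExample, the concrete target/max heights, and the already-loaded ImageData are assumptions made up for the example:

#include "imagedata.h"    // tesseract::ImageData
#include "allheaders.h"   // Leptonica, for pixDestroy.

// Hypothetical helper showing the new PreScale() ownership rules.
void ScaleExample(const tesseract::ImageData* image_data) {
  float scale_factor = 1.0f;
  int scaled_width = 0, scaled_height = 0;
  GenericVector<TBOX> boxes;
  // Scale to a 48-pixel target height, capped at 1000 pixels (values chosen
  // arbitrarily for the example).
  Pix* pix = image_data->PreScale(48, 1000, &scale_factor,
                                  &scaled_width, &scaled_height, &boxes);
  if (pix != NULL) {
    // ... use pix, scaled_width/scaled_height and the rescaled boxes here ...
    pixDestroy(&pix);  // The returned Pix is now owned by the caller.
  }
}
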
diff --git a/ccstruct/imagedata.h b/ccstruct/imagedata.h
index 6321f121..ae672293 100644
--- a/ccstruct/imagedata.h
+++ b/ccstruct/imagedata.h
@@ -25,6 +25,7 @@
 #include "normalis.h"
 #include "rect.h"
 #include "strngs.h"
+#include "svutil.h"
 
 struct Pix;
 
@@ -34,8 +35,22 @@ namespace tesseract {
 const int kFeaturePadding = 2;
 // Number of pixels to pad around text boxes.
 const int kImagePadding = 4;
-// Number of training images to combine into a mini-batch for training.
-const int kNumPagesPerMiniBatch = 100;
+
+// Enum to determine the caching and data sequencing strategy.
+enum CachingStrategy {
+  // Reads all of one file before moving on to the next. Requires samples to be
+  // shuffled across files. Uses the count of samples in the first file as
+  // the count in all the files to achieve high-speed random access. As a
+  // consequence, if subsequent files are smaller, they get entries used more
+  // than once, and if subsequent files are larger, some entries are not used.
+  // Best for larger data sets that don't fit in memory.
+  CS_SEQUENTIAL,
+  // Reads one sample from each file in rotation. Does not require shuffled
+  // samples, but is extremely disk-intensive. Samples in smaller files also
+  // get used more often than samples in larger files.
+  // Best for smaller data sets that mostly fit in memory.
+  CS_ROUND_ROBIN,
+};
 
 class WordFeature {
  public:
@@ -103,6 +118,8 @@ class ImageData {
   // Reads from the given file. Returns false in case of error.
   // If swap is true, assumes a big/little-endian swap is needed.
   bool DeSerialize(bool swap, TFile* fp);
+  // As DeSerialize, but only seeks past the data - hence a static method.
+  static bool SkipDeSerialize(bool swap, tesseract::TFile* fp);
 
   // Other accessors.
   const STRING& imagefilename() const {
@@ -145,11 +162,12 @@ class ImageData {
   // Gets anything and everything with a non-NULL pointer, prescaled to a
   // given target_height (if 0, then the original image height), and aligned.
   // Also returns (if not NULL) the width and height of the scaled image.
-  // The return value is the scale factor that was applied to the image to
-  // achieve the target_height.
-  float PreScale(int target_height, Pix** pix,
-                 int* scaled_width, int* scaled_height,
-                 GenericVector<TBOX>* boxes) const;
+  // The return value is the scaled Pix, which must be pixDestroyed after use,
+  // and scale_factor (if not NULL) is set to the scale factor that was applied
+  // to the image to achieve the target_height.
+  Pix* PreScale(int target_height, int max_height, float* scale_factor,
+                int* scaled_width, int* scaled_height,
+                GenericVector<TBOX>* boxes) const;
 
   int MemoryUsed() const;
 
@@ -184,6 +202,8 @@ class ImageData {
 
 // A collection of ImageData that knows roughly how much memory it is using.
 class DocumentData {
+  friend void* ReCachePagesFunc(void* data);
+
  public:
   explicit DocumentData(const STRING& name);
   ~DocumentData();
@@ -192,6 +212,9 @@ class DocumentData {
   // is used to read the file.
   bool LoadDocument(const char* filename, const char* lang, int start_page,
                     inT64 max_memory, FileReader reader);
+  // Sets up the document, without actually loading it.
+  void SetDocument(const char* filename, const char* lang, inT64 max_memory,
+                   FileReader reader);
   // Writes all the pages to the given filename. Returns false on error.
   bool SaveDocument(const char* filename, FileWriter writer);
   bool SaveToBuffer(GenericVector<char>* buffer);
@@ -200,26 +223,62 @@ class DocumentData {
   void AddPageToDocument(ImageData* page);
 
   const STRING& document_name() const {
+    SVAutoLock lock(&general_mutex_);
     return document_name_;
   }
   int NumPages() const {
+    SVAutoLock lock(&general_mutex_);
     return total_pages_;
   }
   inT64 memory_used() const {
+    SVAutoLock lock(&general_mutex_);
     return memory_used_;
   }
+  // If the given index is not currently loaded, loads it using a separate
+  // thread. Note: there are 4 cases:
+  // Document uncached: IsCached() returns false, total_pages_ < 0.
+  // Required page is available: IsPageAvailable returns true. In this case,
+  // total_pages_ > 0 and
+  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
+  // Pages are loaded, but the required one is not.
+  // The requested page is being loaded by LoadPageInBackground. In this case,
+  // index == pages_offset_. Once the loading starts, the pages lock is held
+  // until it completes, at which point IsPageAvailable will unblock and return
+  // true.
+  void LoadPageInBackground(int index);
   // Returns a pointer to the page with the given index, modulo the total
-  // number of pages, recaching if needed.
+  // number of pages. Blocks until the background load is completed.
   const ImageData* GetPage(int index);
+  // Returns true if the requested page is available, and provides a pointer,
+  // which may be NULL if the document is empty. May block, even though it
+  // doesn't guarantee to return true.
+  bool IsPageAvailable(int index, ImageData** page);
   // Takes ownership of the given page index. The page is made NULL in *this.
   ImageData* TakePage(int index) {
+    SVAutoLock lock(&pages_mutex_);
     ImageData* page = pages_[index];
     pages_[index] = NULL;
     return page;
   }
+  // Returns true if the document is currently loaded or in the process of
+  // loading.
+  bool IsCached() const { return NumPages() >= 0; }
+  // Removes all pages from memory and frees the memory, but does not forget
+  // the document metadata. Returns the memory saved.
+  inT64 UnCache();
 
  private:
-  // Loads as many pages can fit in max_memory_ starting at index pages_offset_.
+  // Sets the value of total_pages_ behind a mutex.
+  void set_total_pages(int total) {
+    SVAutoLock lock(&general_mutex_);
+    total_pages_ = total;
+  }
+  void set_memory_used(inT64 memory_used) {
+    SVAutoLock lock(&general_mutex_);
+    memory_used_ = memory_used;
+  }
+  // Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
+  // starting at index pages_offset_.
   bool ReCachePages();
 
  private:
@@ -239,43 +298,77 @@ class DocumentData {
   inT64 max_memory_;
   // Saved reader from LoadDocument to allow re-caching.
   FileReader reader_;
+  // Mutex that protects pages_ and pages_offset_ against multiple parallel
+  // loads, and provides a wait for page.
+  SVMutex pages_mutex_;
+  // Mutex that protects other data members that callers want to access without
+  // waiting for a load operation.
+  mutable SVMutex general_mutex_;
 };
 
 // A collection of DocumentData that knows roughly how much memory it is using.
+// Note that while it supports background read-ahead, it assumes that a single
+// thread is accessing documents, ie it is not safe for multiple threads to
+// access different documents in parallel, as one may de-cache the other's
+// content.
 class DocumentCache {
  public:
   explicit DocumentCache(inT64 max_memory);
   ~DocumentCache();
 
+  // Deletes all existing documents from the cache.
+  void Clear() {
+    documents_.clear();
+    num_pages_per_doc_ = 0;
+  }
   // Adds all the documents in the list of filenames, counting memory.
   // The reader is used to read the files.
   bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
-                     FileReader reader);
+                     CachingStrategy cache_strategy, FileReader reader);
 
-  // Adds document to the cache, throwing out other documents if needed.
+  // Adds document to the cache.
   bool AddToCache(DocumentData* data);
 
   // Finds and returns a document by name.
   DocumentData* FindDocument(const STRING& document_name) const;
 
-  // Returns a page by serial number, selecting them in a round-robin fashion
-  // from all the documents.
-  const ImageData* GetPageBySerial(int serial);
+  // Returns a page by serial number using the current cache_strategy_ to
+  // determine the mapping from serial number to page.
+  const ImageData* GetPageBySerial(int serial) {
+    if (cache_strategy_ == CS_SEQUENTIAL)
+      return GetPageSequential(serial);
+    else
+      return GetPageRoundRobin(serial);
+  }
 
   const PointerVector<DocumentData>& documents() const {
     return documents_;
   }
-  int total_pages() const {
-    return total_pages_;
-  }
+  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
+  // strategy, could take a long time.
+  int TotalPages();
 
  private:
+  // Returns a page by serial number, selecting them in a round-robin fashion
+  // from all the documents. Highly disk-intensive, but doesn't need samples
+  // to be shuffled between files to begin with.
+  const ImageData* GetPageRoundRobin(int serial);
+  // Returns a page by serial number, selecting them in sequence from each file.
+  // Requires the samples to be shuffled between the files to give a random or
+  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
+  const ImageData* GetPageSequential(int serial);
+
+  // Helper counts the number of adjacent cached neighbour documents_ of index
+  // looking in direction dir, ie index+dir, index+2*dir etc.
+  int CountNeighbourDocs(int index, int dir);
+
   // A group of pages that corresponds in some loose way to a document.
   PointerVector<DocumentData> documents_;
-  // Total of all pages.
-  int total_pages_;
-  // Total of all memory used by the cache.
-  inT64 memory_used_;
+  // Strategy to use for caching and serializing data samples.
+  CachingStrategy cache_strategy_;
+  // Number of pages in the first document, used as a divisor in
+  // GetPageSequential to determine the document index.
+  int num_pages_per_doc_;
   // Max memory allowed in this cache.
   inT64 max_memory_;
 };
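
For illustration only (not part of the patch): a minimal sketch of how the reworked DocumentCache might be driven after this change, assuming two hypothetical .lstmf files and that passing a NULL FileReader selects the built-in file loader; the function name CacheExample and the memory budget are made up for the example.

#include "imagedata.h"

// Hypothetical driver for the new caching API.
bool CacheExample() {
  GenericVector<STRING> filenames;
  filenames.push_back(STRING("train0.lstmf"));  // Hypothetical file names.
  filenames.push_back(STRING("train1.lstmf"));
  // Roughly 1 GB cache budget; sequential strategy assumes pre-shuffled data.
  tesseract::DocumentCache cache(1073741824);
  if (!cache.LoadDocuments(filenames, "eng", tesseract::CS_SEQUENTIAL, NULL))
    return false;
  int total = cache.TotalPages();
  for (int serial = 0; serial < total; ++serial) {
    // GetPageBySerial() maps the serial number to a document/page according
    // to the chosen CachingStrategy and triggers background read-ahead.
    const tesseract::ImageData* page = cache.GetPageBySerial(serial);
    if (page == NULL) return false;
    // ... consume *page ...
  }
  return true;
}
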