mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
backport from 4.00: imagedata
This commit is contained in:
parent
7099358510
commit
c1d37120a5
@ -30,6 +30,14 @@
|
||||
#include "helpers.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#if __cplusplus > 199711L // C++11 support
|
||||
#include <thread>
|
||||
#endif
|
||||
|
||||
// Number of documents to read ahead while training. Doesn't need to be very
|
||||
// large.
|
||||
const int kMaxReadAhead = 8;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
|
||||
@ -182,6 +190,19 @@ bool ImageData::DeSerialize(bool swap, TFile* fp) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// As DeSerialize, but only seeks past the data - hence a static method.
|
||||
bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
|
||||
if (!STRING::SkipDeSerialize(swap, fp)) return false;
|
||||
inT32 page_number;
|
||||
if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
|
||||
if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
|
||||
if (!STRING::SkipDeSerialize(swap, fp)) return false;
|
||||
if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
|
||||
if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
|
||||
inT8 vertical = 0;
|
||||
return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
|
||||
}
|
||||
|
||||
// Saves the given Pix as a PNG-encoded string and destroys it.
|
||||
void ImageData::SetPix(Pix* pix) {
|
||||
SetPixInternal(pix, &image_data_);
|
||||
@ -195,37 +216,34 @@ Pix* ImageData::GetPix() const {
|
||||
// Gets anything and everything with a non-NULL pointer, prescaled to a
|
||||
// given target_height (if 0, then the original image height), and aligned.
|
||||
// Also returns (if not NULL) the width and height of the scaled image.
|
||||
// The return value is the scale factor that was applied to the image to
|
||||
// achieve the target_height.
|
||||
float ImageData::PreScale(int target_height, Pix** pix,
|
||||
int* scaled_width, int* scaled_height,
|
||||
GenericVector<TBOX>* boxes) const {
|
||||
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
||||
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
||||
// to the image to achieve the target_height.
|
||||
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
|
||||
int* scaled_width, int* scaled_height,
|
||||
GenericVector<TBOX>* boxes) const {
|
||||
int input_width = 0;
|
||||
int input_height = 0;
|
||||
Pix* src_pix = GetPix();
|
||||
ASSERT_HOST(src_pix != NULL);
|
||||
input_width = pixGetWidth(src_pix);
|
||||
input_height = pixGetHeight(src_pix);
|
||||
if (target_height == 0)
|
||||
target_height = input_height;
|
||||
if (target_height == 0) {
|
||||
target_height = MIN(input_height, max_height);
|
||||
}
|
||||
float im_factor = static_cast<float>(target_height) / input_height;
|
||||
if (scaled_width != NULL)
|
||||
*scaled_width = IntCastRounded(im_factor * input_width);
|
||||
if (scaled_height != NULL)
|
||||
*scaled_height = target_height;
|
||||
if (pix != NULL) {
|
||||
// Get the scaled image.
|
||||
pixDestroy(pix);
|
||||
*pix = pixScale(src_pix, im_factor, im_factor);
|
||||
if (*pix == NULL) {
|
||||
tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
|
||||
input_width, input_height, im_factor);
|
||||
}
|
||||
if (scaled_width != NULL)
|
||||
*scaled_width = pixGetWidth(*pix);
|
||||
if (scaled_height != NULL)
|
||||
*scaled_height = pixGetHeight(*pix);
|
||||
// Get the scaled image.
|
||||
Pix* pix = pixScale(src_pix, im_factor, im_factor);
|
||||
if (pix == NULL) {
|
||||
tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
|
||||
input_width, input_height, im_factor);
|
||||
}
|
||||
if (scaled_width != NULL) *scaled_width = pixGetWidth(pix);
|
||||
if (scaled_height != NULL) *scaled_height = pixGetHeight(pix);
|
||||
pixDestroy(&src_pix);
|
||||
if (boxes != NULL) {
|
||||
// Get the boxes.
|
||||
@ -241,7 +259,8 @@ float ImageData::PreScale(int target_height, Pix** pix,
|
||||
boxes->push_back(box);
|
||||
}
|
||||
}
|
||||
return im_factor;
|
||||
if (scale_factor != NULL) *scale_factor = im_factor;
|
||||
return pix;
|
||||
}
|
||||
|
||||
int ImageData::MemoryUsed() const {
|
||||
@ -266,19 +285,20 @@ void ImageData::Display() const {
|
||||
// Draw the boxes.
|
||||
win->Pen(ScrollView::RED);
|
||||
win->Brush(ScrollView::NONE);
|
||||
win->TextAttributes("Arial", kTextSize, false, false, false);
|
||||
for (int b = 0; b < boxes_.size(); ++b) {
|
||||
boxes_[b].plot(win);
|
||||
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
|
||||
TBOX scaled(boxes_[b]);
|
||||
scaled.scale(256.0 / height);
|
||||
scaled.plot(win);
|
||||
int text_size = kTextSize;
|
||||
if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
|
||||
text_size = boxes_[0].height() * 2;
|
||||
win->TextAttributes("Arial", text_size, false, false, false);
|
||||
if (!boxes_.empty()) {
|
||||
for (int b = 0; b < boxes_.size(); ++b) {
|
||||
boxes_[b].plot(win);
|
||||
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
|
||||
}
|
||||
} else {
|
||||
// The full transcription.
|
||||
win->Pen(ScrollView::CYAN);
|
||||
win->Text(0, height + kTextSize * 2, transcription_.string());
|
||||
}
|
||||
// The full transcription.
|
||||
win->Pen(ScrollView::CYAN);
|
||||
win->Text(0, height + kTextSize * 2, transcription_.string());
|
||||
// Add the features.
|
||||
win->Pen(ScrollView::GREEN);
|
||||
win->Update();
|
||||
window_wait(win);
|
||||
#endif
|
||||
@ -340,27 +360,51 @@ bool ImageData::AddBoxes(const char* box_text) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DocumentData::DocumentData(const STRING& name)
|
||||
: document_name_(name), pages_offset_(0), total_pages_(0),
|
||||
memory_used_(0), max_memory_(0), reader_(NULL) {}
|
||||
// Thread function to call ReCachePages.
|
||||
void* ReCachePagesFunc(void* data) {
|
||||
DocumentData* document_data = reinterpret_cast<DocumentData*>(data);
|
||||
document_data->ReCachePages();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
DocumentData::~DocumentData() {}
|
||||
// Creates an empty document called name with nothing loaded. pages_offset_
// and total_pages_ start at -1; a negative total_pages_ is what IsCached()
// reports as "not cached". No memory limit or reader is set yet.
DocumentData::DocumentData(const STRING& name)
    : document_name_(name), pages_offset_(-1), total_pages_(-1),
      memory_used_(0), max_memory_(0), reader_(NULL) {}
|
||||
|
||||
// Takes pages_mutex_ and then general_mutex_ before the members are torn
// down, so destruction waits for any current holder of either lock (for
// example ReCachePages, which holds pages_mutex_ while it runs).
DocumentData::~DocumentData() {
  SVAutoLock pages_guard(&pages_mutex_);
  SVAutoLock general_guard(&general_mutex_);
}
|
||||
|
||||
// Reads all the pages in the given lstmf filename to the cache. The reader
|
||||
// is used to read the file.
|
||||
bool DocumentData::LoadDocument(const char* filename, const char* lang,
|
||||
int start_page, inT64 max_memory,
|
||||
FileReader reader) {
|
||||
SetDocument(filename, lang, max_memory, reader);
|
||||
pages_offset_ = start_page;
|
||||
return ReCachePages();
|
||||
}
|
||||
|
||||
// Sets up the document, without actually loading it.
|
||||
void DocumentData::SetDocument(const char* filename, const char* lang,
|
||||
inT64 max_memory, FileReader reader) {
|
||||
SVAutoLock lock_p(&pages_mutex_);
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
document_name_ = filename;
|
||||
lang_ = lang;
|
||||
pages_offset_ = start_page;
|
||||
pages_offset_ = -1;
|
||||
max_memory_ = max_memory;
|
||||
reader_ = reader;
|
||||
return ReCachePages();
|
||||
}
|
||||
|
||||
// Writes all the pages to the given filename. Returns false on error.
|
||||
bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
TFile fp;
|
||||
fp.OpenWrite(NULL);
|
||||
if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
|
||||
@ -370,112 +414,169 @@ bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
|
||||
return true;
|
||||
}
|
||||
// Serializes the currently held pages into buffer while holding
// pages_mutex_. Returns the result of the serialization.
bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
  SVAutoLock pages_guard(&pages_mutex_);
  TFile out;
  out.OpenWrite(buffer);
  const bool serialized = pages_.Serialize(&out);
  return serialized;
}
|
||||
|
||||
// Returns a pointer to the page with the given index, modulo the total
|
||||
// number of pages, recaching if needed.
|
||||
const ImageData* DocumentData::GetPage(int index) {
|
||||
index = Modulo(index, total_pages_);
|
||||
if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) {
|
||||
pages_offset_ = index;
|
||||
if (!ReCachePages()) return NULL;
|
||||
}
|
||||
return pages_[index - pages_offset_];
|
||||
// Adds the given page data to this document, counting up memory.
|
||||
void DocumentData::AddPageToDocument(ImageData* page) {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
pages_.push_back(page);
|
||||
set_memory_used(memory_used() + page->MemoryUsed());
|
||||
}
|
||||
|
||||
// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
|
||||
// If the given index is not currently loaded, loads it using a separate
|
||||
// thread.
|
||||
void DocumentData::LoadPageInBackground(int index) {
|
||||
ImageData* page = NULL;
|
||||
if (IsPageAvailable(index, &page)) return;
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
if (pages_offset_ == index) return;
|
||||
pages_offset_ = index;
|
||||
pages_.clear();
|
||||
SVSync::StartThread(ReCachePagesFunc, this);
|
||||
}
|
||||
|
||||
// Returns a pointer to the page with the given index, modulo the total
|
||||
// number of pages. Blocks until the background load is completed.
|
||||
const ImageData* DocumentData::GetPage(int index) {
|
||||
ImageData* page = NULL;
|
||||
while (!IsPageAvailable(index, &page)) {
|
||||
// If there is no background load scheduled, schedule one now.
|
||||
pages_mutex_.Lock();
|
||||
bool needs_loading = pages_offset_ != index;
|
||||
pages_mutex_.Unlock();
|
||||
if (needs_loading) LoadPageInBackground(index);
|
||||
// We can't directly load the page, or the background load will delete it
|
||||
// while the caller is using it, so give it a chance to work.
|
||||
#if __cplusplus > 199711L // C++11 support
|
||||
//TODO: We need to fix this for compilers without C++11 support (e.g. VS2010)
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
#endif
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
// Returns true if the requested page is available, and provides a pointer,
|
||||
// which may be NULL if the document is empty. May block, even though it
|
||||
// doesn't guarantee to return true.
|
||||
bool DocumentData::IsPageAvailable(int index, ImageData** page) {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
int num_pages = NumPages();
|
||||
if (num_pages == 0 || index < 0) {
|
||||
*page = NULL; // Empty Document.
|
||||
return true;
|
||||
}
|
||||
if (num_pages > 0) {
|
||||
index = Modulo(index, num_pages);
|
||||
if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
|
||||
*page = pages_[index - pages_offset_]; // Page is available already.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Removes all pages from memory and frees the memory, but does not forget
|
||||
// the document metadata.
|
||||
inT64 DocumentData::UnCache() {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
inT64 memory_saved = memory_used();
|
||||
pages_.clear();
|
||||
pages_offset_ = -1;
|
||||
set_total_pages(-1);
|
||||
set_memory_used(0);
|
||||
tprintf("Unloaded document %s, saving %d memory\n", document_name_.string(),
|
||||
memory_saved);
|
||||
return memory_saved;
|
||||
}
|
||||
|
||||
// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
|
||||
// starting at index pages_offset_.
|
||||
bool DocumentData::ReCachePages() {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
// Read the file.
|
||||
set_total_pages(0);
|
||||
set_memory_used(0);
|
||||
int loaded_pages = 0;
|
||||
pages_.truncate(0);
|
||||
TFile fp;
|
||||
if (!fp.Open(document_name_, reader_)) return false;
|
||||
memory_used_ = 0;
|
||||
if (!pages_.DeSerialize(false, &fp)) {
|
||||
tprintf("Deserialize failed: %s\n", document_name_.string());
|
||||
pages_.truncate(0);
|
||||
if (!fp.Open(document_name_, reader_) ||
|
||||
!PointerVector<ImageData>::DeSerializeSize(false, &fp, &loaded_pages) ||
|
||||
loaded_pages <= 0) {
|
||||
tprintf("Deserialize header failed: %s\n", document_name_.string());
|
||||
return false;
|
||||
}
|
||||
total_pages_ = pages_.size();
|
||||
pages_offset_ %= total_pages_;
|
||||
// Delete pages before the first one we want, and relocate the rest.
|
||||
pages_offset_ %= loaded_pages;
|
||||
// Skip pages before the first one we want, and load the rest until max
|
||||
// memory and skip the rest after that.
|
||||
int page;
|
||||
for (page = 0; page < pages_.size(); ++page) {
|
||||
if (page < pages_offset_) {
|
||||
delete pages_[page];
|
||||
pages_[page] = NULL;
|
||||
for (page = 0; page < loaded_pages; ++page) {
|
||||
if (page < pages_offset_ ||
|
||||
(max_memory_ > 0 && memory_used() > max_memory_)) {
|
||||
if (!PointerVector<ImageData>::DeSerializeSkip(false, &fp)) break;
|
||||
} else {
|
||||
ImageData* image_data = pages_[page];
|
||||
if (max_memory_ > 0 && page > pages_offset_ &&
|
||||
memory_used_ + image_data->MemoryUsed() > max_memory_)
|
||||
break; // Don't go over memory quota unless the first image.
|
||||
if (!pages_.DeSerializeElement(false, &fp)) break;
|
||||
ImageData* image_data = pages_.back();
|
||||
if (image_data->imagefilename().length() == 0) {
|
||||
image_data->set_imagefilename(document_name_);
|
||||
image_data->set_page_number(page);
|
||||
}
|
||||
image_data->set_language(lang_);
|
||||
memory_used_ += image_data->MemoryUsed();
|
||||
if (pages_offset_ != 0) {
|
||||
pages_[page - pages_offset_] = image_data;
|
||||
pages_[page] = NULL;
|
||||
}
|
||||
set_memory_used(memory_used() + image_data->MemoryUsed());
|
||||
}
|
||||
}
|
||||
pages_.truncate(page - pages_offset_);
|
||||
tprintf("Loaded %d/%d pages (%d-%d) of document %s\n",
|
||||
pages_.size(), total_pages_, pages_offset_,
|
||||
pages_offset_ + pages_.size(), document_name_.string());
|
||||
if (page < loaded_pages) {
|
||||
tprintf("Deserialize failed: %s read %d/%d pages\n",
|
||||
document_name_.string(), page, loaded_pages);
|
||||
pages_.truncate(0);
|
||||
} else {
|
||||
tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(),
|
||||
loaded_pages, pages_offset_, pages_offset_ + pages_.size(),
|
||||
document_name_.string());
|
||||
}
|
||||
set_total_pages(loaded_pages);
|
||||
return !pages_.empty();
|
||||
}
|
||||
|
||||
// Adds the given page data to this document, counting up memory.
|
||||
void DocumentData::AddPageToDocument(ImageData* page) {
|
||||
pages_.push_back(page);
|
||||
memory_used_ += page->MemoryUsed();
|
||||
}
|
||||
|
||||
// A collection of DocumentData that knows roughly how much memory it is using.
|
||||
DocumentCache::DocumentCache(inT64 max_memory)
|
||||
: total_pages_(0), memory_used_(0), max_memory_(max_memory) {}
|
||||
: num_pages_per_doc_(0), max_memory_(max_memory) {}
|
||||
DocumentCache::~DocumentCache() {}
|
||||
|
||||
// Adds all the documents in the list of filenames, counting memory.
|
||||
// The reader is used to read the files.
|
||||
bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
|
||||
const char* lang, FileReader reader) {
|
||||
inT64 fair_share_memory = max_memory_ / filenames.size();
|
||||
const char* lang,
|
||||
CachingStrategy cache_strategy,
|
||||
FileReader reader) {
|
||||
cache_strategy_ = cache_strategy;
|
||||
inT64 fair_share_memory = 0;
|
||||
// In the round-robin case, each DocumentData handles restricting its content
|
||||
// to its fair share of memory. In the sequential case, DocumentCache
|
||||
// determines which DocumentDatas are held entirely in memory.
|
||||
if (cache_strategy_ == CS_ROUND_ROBIN)
|
||||
fair_share_memory = max_memory_ / filenames.size();
|
||||
for (int arg = 0; arg < filenames.size(); ++arg) {
|
||||
STRING filename = filenames[arg];
|
||||
DocumentData* document = new DocumentData(filename);
|
||||
if (document->LoadDocument(filename.string(), lang, 0,
|
||||
fair_share_memory, reader)) {
|
||||
AddToCache(document);
|
||||
} else {
|
||||
tprintf("Failed to load image %s!\n", filename.string());
|
||||
delete document;
|
||||
}
|
||||
document->SetDocument(filename.string(), lang, fair_share_memory, reader);
|
||||
AddToCache(document);
|
||||
}
|
||||
tprintf("Loaded %d pages, total %gMB\n",
|
||||
total_pages_, memory_used_ / 1048576.0);
|
||||
return total_pages_ > 0;
|
||||
if (!documents_.empty()) {
|
||||
// Try to get the first page now to verify the list of filenames.
|
||||
if (GetPageBySerial(0) != NULL) return true;
|
||||
tprintf("Load of page 0 failed!\n");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Adds document to the cache, throwing out other documents if needed.
|
||||
// Adds document to the cache.
|
||||
bool DocumentCache::AddToCache(DocumentData* data) {
|
||||
inT64 new_memory = data->memory_used();
|
||||
memory_used_ += new_memory;
|
||||
documents_.push_back(data);
|
||||
total_pages_ += data->NumPages();
|
||||
// Delete the first item in the array, and other pages of the same name
|
||||
// while memory is full.
|
||||
while (memory_used_ >= max_memory_ && max_memory_ > 0) {
|
||||
tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n",
|
||||
memory_used_ , max_memory_, documents_[0]->memory_used());
|
||||
memory_used_ -= documents_[0]->memory_used();
|
||||
total_pages_ -= documents_[0]->NumPages();
|
||||
documents_.remove(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -488,11 +589,104 @@ DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
|
||||
// strategy, could take a long time.
|
||||
int DocumentCache::TotalPages() {
|
||||
if (cache_strategy_ == CS_SEQUENTIAL) {
|
||||
// In sequential mode, we assume each doc has the same number of pages
|
||||
// whether it is true or not.
|
||||
if (num_pages_per_doc_ == 0) GetPageSequential(0);
|
||||
return num_pages_per_doc_ * documents_.size();
|
||||
}
|
||||
int total_pages = 0;
|
||||
int num_docs = documents_.size();
|
||||
for (int d = 0; d < num_docs; ++d) {
|
||||
// We have to load a page to make NumPages() valid.
|
||||
documents_[d]->GetPage(0);
|
||||
total_pages += documents_[d]->NumPages();
|
||||
}
|
||||
return total_pages;
|
||||
}
|
||||
|
||||
// Returns a page by serial number, selecting them in a round-robin fashion
|
||||
// from all the documents.
|
||||
const ImageData* DocumentCache::GetPageBySerial(int serial) {
|
||||
int document_index = serial % documents_.size();
|
||||
return documents_[document_index]->GetPage(serial / documents_.size());
|
||||
// from all the documents. Highly disk-intensive, but doesn't need samples
|
||||
// to be shuffled between files to begin with.
|
||||
const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
  // Serial number s maps to page s / num_docs of document s % num_docs, so
  // successive serial numbers rotate through the documents.
  int num_docs = documents_.size();
  // With no documents there is no page to return, and the modulo and division
  // below would divide by zero.
  if (num_docs <= 0) return NULL;
  int doc_index = serial % num_docs;
  const ImageData* page = documents_[doc_index]->GetPage(serial / num_docs);
  // Request background loads for the pages that the following serial numbers
  // will ask for: at most kMaxReadAhead of them, and fewer than num_docs so
  // the document that has just been read is not asked to reload.
  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
    doc_index = (serial + offset) % num_docs;
    int ahead_page = (serial + offset) / num_docs;
    documents_[doc_index]->LoadPageInBackground(ahead_page);
  }
  return page;
}
|
||||
|
||||
// Returns a page by serial number, selecting them in sequence from each file.
|
||||
// Requires the samples to be shuffled between the files to give a random or
|
||||
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
|
||||
const ImageData* DocumentCache::GetPageSequential(int serial) {
|
||||
int num_docs = documents_.size();
|
||||
ASSERT_HOST(num_docs > 0);
|
||||
if (num_pages_per_doc_ == 0) {
|
||||
// Use the pages in the first doc as the number of pages in each doc.
|
||||
documents_[0]->GetPage(0);
|
||||
num_pages_per_doc_ = documents_[0]->NumPages();
|
||||
if (num_pages_per_doc_ == 0) {
|
||||
tprintf("First document cannot be empty!!\n");
|
||||
ASSERT_HOST(num_pages_per_doc_ > 0);
|
||||
}
|
||||
// Get rid of zero now if we don't need it.
|
||||
if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
|
||||
}
|
||||
int doc_index = serial / num_pages_per_doc_ % num_docs;
|
||||
const ImageData* doc =
|
||||
documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
|
||||
// Count up total memory. Background loading makes it more complicated to
|
||||
// keep a running count.
|
||||
inT64 total_memory = 0;
|
||||
for (int d = 0; d < num_docs; ++d) {
|
||||
total_memory += documents_[d]->memory_used();
|
||||
}
|
||||
if (total_memory >= max_memory_) {
|
||||
// Find something to un-cache.
|
||||
// If there are more than 3 in front, then serial is from the back reader
|
||||
// of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
|
||||
// we create a hole between them and then un-caching the backmost occupied
|
||||
// will work for both.
|
||||
int num_in_front = CountNeighbourDocs(doc_index, 1);
|
||||
for (int offset = num_in_front - 2;
|
||||
offset > 1 && total_memory >= max_memory_; --offset) {
|
||||
int next_index = (doc_index + offset) % num_docs;
|
||||
total_memory -= documents_[next_index]->UnCache();
|
||||
}
|
||||
// If that didn't work, the best solution is to un-cache from the back. If
|
||||
// we take away the document that a 2nd reader is using, it will put it
|
||||
// back and make a hole between.
|
||||
int num_behind = CountNeighbourDocs(doc_index, -1);
|
||||
for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
|
||||
++offset) {
|
||||
int next_index = (doc_index + offset + num_docs) % num_docs;
|
||||
total_memory -= documents_[next_index]->UnCache();
|
||||
}
|
||||
}
|
||||
int next_index = (doc_index + 1) % num_docs;
|
||||
if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
|
||||
documents_[next_index]->LoadPageInBackground(0);
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
|
||||
// Helper counts the number of adjacent cached neighbours of index looking in
|
||||
// direction dir, ie index+dir, index+2*dir etc.
|
||||
int DocumentCache::CountNeighbourDocs(int index, int dir) {
|
||||
int num_docs = documents_.size();
|
||||
for (int offset = dir; abs(offset) < num_docs; offset += dir) {
|
||||
int offset_index = (index + offset + num_docs) % num_docs;
|
||||
if (!documents_[offset_index]->IsCached()) return offset - dir;
|
||||
}
|
||||
return num_docs;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "normalis.h"
|
||||
#include "rect.h"
|
||||
#include "strngs.h"
|
||||
#include "svutil.h"
|
||||
|
||||
struct Pix;
|
||||
|
||||
@ -34,8 +35,22 @@ namespace tesseract {
|
||||
const int kFeaturePadding = 2;
|
||||
// Number of pixels to pad around text boxes.
|
||||
const int kImagePadding = 4;
|
||||
// Number of training images to combine into a mini-batch for training.
|
||||
const int kNumPagesPerMiniBatch = 100;
|
||||
|
||||
// Enum to determine the caching and data sequencing strategy.
|
||||
enum CachingStrategy {
|
||||
// Reads all of one file before moving on to the next. Requires samples to be
|
||||
// shuffled across files. Uses the count of samples in the first file as
|
||||
// the count in all the files to achieve high-speed random access. As a
|
||||
// consequence, if subsequent files are smaller, they get entries used more
|
||||
// than once, and if subsequent files are larger, some entries are not used.
|
||||
// Best for larger data sets that don't fit in memory.
|
||||
CS_SEQUENTIAL,
|
||||
// Reads one sample from each file in rotation. Does not require shuffled
|
||||
// samples, but is extremely disk-intensive. Samples in smaller files also
|
||||
// get used more often than samples in larger files.
|
||||
// Best for smaller data sets that mostly fit in memory.
|
||||
CS_ROUND_ROBIN,
|
||||
};
|
||||
|
||||
class WordFeature {
|
||||
public:
|
||||
@ -103,6 +118,8 @@ class ImageData {
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, TFile* fp);
|
||||
// As DeSerialize, but only seeks past the data - hence a static method.
|
||||
static bool SkipDeSerialize(bool swap, tesseract::TFile* fp);
|
||||
|
||||
// Other accessors.
|
||||
const STRING& imagefilename() const {
|
||||
@ -145,11 +162,12 @@ class ImageData {
|
||||
// Gets anything and everything with a non-NULL pointer, prescaled to a
|
||||
// given target_height (if 0, then the original image height), and aligned.
|
||||
// Also returns (if not NULL) the width and height of the scaled image.
|
||||
// The return value is the scale factor that was applied to the image to
|
||||
// achieve the target_height.
|
||||
float PreScale(int target_height, Pix** pix,
|
||||
int* scaled_width, int* scaled_height,
|
||||
GenericVector<TBOX>* boxes) const;
|
||||
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
||||
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
||||
// to the image to achieve the target_height.
|
||||
Pix* PreScale(int target_height, int max_height, float* scale_factor,
|
||||
int* scaled_width, int* scaled_height,
|
||||
GenericVector<TBOX>* boxes) const;
|
||||
|
||||
int MemoryUsed() const;
|
||||
|
||||
@ -184,6 +202,8 @@ class ImageData {
|
||||
|
||||
// A collection of ImageData that knows roughly how much memory it is using.
|
||||
class DocumentData {
|
||||
friend void* ReCachePagesFunc(void* data);
|
||||
|
||||
public:
|
||||
explicit DocumentData(const STRING& name);
|
||||
~DocumentData();
|
||||
@ -192,6 +212,9 @@ class DocumentData {
|
||||
// is used to read the file.
|
||||
bool LoadDocument(const char* filename, const char* lang, int start_page,
|
||||
inT64 max_memory, FileReader reader);
|
||||
// Sets up the document, without actually loading it.
|
||||
void SetDocument(const char* filename, const char* lang, inT64 max_memory,
|
||||
FileReader reader);
|
||||
// Writes all the pages to the given filename. Returns false on error.
|
||||
bool SaveDocument(const char* filename, FileWriter writer);
|
||||
bool SaveToBuffer(GenericVector<char>* buffer);
|
||||
@ -200,26 +223,62 @@ class DocumentData {
|
||||
void AddPageToDocument(ImageData* page);
|
||||
|
||||
const STRING& document_name() const {
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
return document_name_;
|
||||
}
|
||||
int NumPages() const {
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
return total_pages_;
|
||||
}
|
||||
inT64 memory_used() const {
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
return memory_used_;
|
||||
}
|
||||
// If the given index is not currently loaded, loads it using a separate
|
||||
// thread. Note: there are 4 cases:
|
||||
// Document uncached: IsCached() returns false, total_pages_ < 0.
|
||||
// Required page is available: IsPageAvailable returns true. In this case,
|
||||
// total_pages_ > 0 and
|
||||
// pages_offset_ <= index%total_pages_ < pages_offset_+pages_.size()
|
||||
// Pages are loaded, but the required one is not.
|
||||
// The requested page is being loaded by LoadPageInBackground. In this case,
|
||||
// index == pages_offset_. Once the loading starts, the pages lock is held
|
||||
// until it completes, at which point IsPageAvailable will unblock and return
|
||||
// true.
|
||||
void LoadPageInBackground(int index);
|
||||
// Returns a pointer to the page with the given index, modulo the total
|
||||
// number of pages, recaching if needed.
|
||||
// number of pages. Blocks until the background load is completed.
|
||||
const ImageData* GetPage(int index);
|
||||
// Returns true if the requested page is available, and provides a pointer,
|
||||
// which may be NULL if the document is empty. May block, even though it
|
||||
// doesn't guarantee to return true.
|
||||
bool IsPageAvailable(int index, ImageData** page);
|
||||
// Takes ownership of the given page index. The page is made NULL in *this.
|
||||
ImageData* TakePage(int index) {
|
||||
SVAutoLock lock(&pages_mutex_);
|
||||
ImageData* page = pages_[index];
|
||||
pages_[index] = NULL;
|
||||
return page;
|
||||
}
|
||||
// Returns true if the document is currently loaded or in the process of
|
||||
// loading.
|
||||
bool IsCached() const { return NumPages() >= 0; }
|
||||
// Removes all pages from memory and frees the memory, but does not forget
|
||||
// the document metadata. Returns the memory saved.
|
||||
inT64 UnCache();
|
||||
|
||||
private:
|
||||
// Loads as many pages can fit in max_memory_ starting at index pages_offset_.
|
||||
// Sets the value of total_pages_ behind a mutex.
|
||||
void set_total_pages(int total) {
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
total_pages_ = total;
|
||||
}
|
||||
void set_memory_used(inT64 memory_used) {
|
||||
SVAutoLock lock(&general_mutex_);
|
||||
memory_used_ = memory_used;
|
||||
}
|
||||
// Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
|
||||
// starting at index pages_offset_.
|
||||
bool ReCachePages();
|
||||
|
||||
private:
|
||||
@ -239,43 +298,77 @@ class DocumentData {
|
||||
inT64 max_memory_;
|
||||
// Saved reader from LoadDocument to allow re-caching.
|
||||
FileReader reader_;
|
||||
// Mutex that protects pages_ and pages_offset_ against multiple parallel
|
||||
// loads, and provides a wait for page.
|
||||
SVMutex pages_mutex_;
|
||||
// Mutex that protects other data members that callers want to access without
|
||||
// waiting for a load operation.
|
||||
mutable SVMutex general_mutex_;
|
||||
};
|
||||
|
||||
// A collection of DocumentData that knows roughly how much memory it is using.
|
||||
// Note that while it supports background read-ahead, it assumes that a single
|
||||
// thread is accessing documents, ie it is not safe for multiple threads to
|
||||
// access different documents in parallel, as one may de-cache the other's
|
||||
// content.
|
||||
class DocumentCache {
|
||||
public:
|
||||
explicit DocumentCache(inT64 max_memory);
|
||||
~DocumentCache();
|
||||
|
||||
// Deletes all existing documents from the cache.
|
||||
void Clear() {
|
||||
documents_.clear();
|
||||
num_pages_per_doc_ = 0;
|
||||
}
|
||||
// Adds all the documents in the list of filenames, counting memory.
|
||||
// The reader is used to read the files.
|
||||
bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
|
||||
FileReader reader);
|
||||
CachingStrategy cache_strategy, FileReader reader);
|
||||
|
||||
// Adds document to the cache, throwing out other documents if needed.
|
||||
// Adds document to the cache.
|
||||
bool AddToCache(DocumentData* data);
|
||||
|
||||
// Finds and returns a document by name.
|
||||
DocumentData* FindDocument(const STRING& document_name) const;
|
||||
|
||||
// Returns a page by serial number, selecting them in a round-robin fashion
|
||||
// from all the documents.
|
||||
const ImageData* GetPageBySerial(int serial);
|
||||
// Returns a page by serial number using the current cache_strategy_ to
|
||||
// determine the mapping from serial number to page.
|
||||
const ImageData* GetPageBySerial(int serial) {
|
||||
if (cache_strategy_ == CS_SEQUENTIAL)
|
||||
return GetPageSequential(serial);
|
||||
else
|
||||
return GetPageRoundRobin(serial);
|
||||
}
|
||||
|
||||
const PointerVector<DocumentData>& documents() const {
|
||||
return documents_;
|
||||
}
|
||||
int total_pages() const {
|
||||
return total_pages_;
|
||||
}
|
||||
// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
|
||||
// strategy, could take a long time.
|
||||
int TotalPages();
|
||||
|
||||
private:
|
||||
// Returns a page by serial number, selecting them in a round-robin fashion
|
||||
// from all the documents. Highly disk-intensive, but doesn't need samples
|
||||
// to be shuffled between files to begin with.
|
||||
const ImageData* GetPageRoundRobin(int serial);
|
||||
// Returns a page by serial number, selecting them in sequence from each file.
|
||||
// Requires the samples to be shuffled between the files to give a random or
|
||||
// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
|
||||
const ImageData* GetPageSequential(int serial);
|
||||
|
||||
// Helper counts the number of adjacent cached neighbour documents_ of index
|
||||
// looking in direction dir, ie index+dir, index+2*dir etc.
|
||||
int CountNeighbourDocs(int index, int dir);
|
||||
|
||||
// A group of pages that corresponds in some loose way to a document.
|
||||
PointerVector<DocumentData> documents_;
|
||||
// Total of all pages.
|
||||
int total_pages_;
|
||||
// Total of all memory used by the cache.
|
||||
inT64 memory_used_;
|
||||
// Strategy to use for caching and serializing data samples.
|
||||
CachingStrategy cache_strategy_;
|
||||
// Number of pages in the first document, used as a divisor in
|
||||
// GetPageSequential to determine the document index.
|
||||
int num_pages_per_doc_;
|
||||
// Max memory allowed in this cache.
|
||||
inT64 max_memory_;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user