/////////////////////////////////////////////////////////////////////// // File: imagedata.h // Description: Class to hold information about a single image and its // corresponding boxes or text file. // Author: Ray Smith // Created: Mon Jul 22 14:17:06 PDT 2013 // // (C) Copyright 2013, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_IMAGE_IMAGEDATA_H_ #define TESSERACT_IMAGE_IMAGEDATA_H_ #include "genericvector.h" #include "normalis.h" #include "rect.h" #include "strngs.h" struct Pix; namespace tesseract { // Amount of padding to apply in output pixels in feature mode. const int kFeaturePadding = 2; // Number of pixels to pad around text boxes. const int kImagePadding = 4; // Number of training images to combine into a mini-batch for training. const int kNumPagesPerMiniBatch = 100; class WordFeature { public: WordFeature(); WordFeature(const FCOORD& fcoord, uinT8 dir); // Computes the maximum x and y value in the features. static void ComputeSize(const GenericVector& features, int* max_x, int* max_y); // Draws the features in the given window. static void Draw(const GenericVector& features, ScrollView* window); // Accessors. int x() const { return x_; } int y() const { return y_; } int dir() const { return dir_; } // Writes to the given file. Returns false in case of error. bool Serialize(FILE* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool DeSerialize(bool swap, FILE* fp); private: inT16 x_; uinT8 y_; uinT8 dir_; }; // A floating-point version of WordFeature, used as an intermediate during // scaling. struct FloatWordFeature { static void FromWordFeatures(const GenericVector& word_features, GenericVector* float_features); // Sort function to sort first by x-bucket, then by y. static int SortByXBucket(const void*, const void*); float x; float y; float dir; int x_bucket; }; // Class to hold information on a single image: // Filename, cached image as a Pix*, character boxes, text transcription. // The text transcription is the ground truth UTF-8 text for the image. // Character boxes are optional and indicate the desired segmentation of // the text into recognition units. class ImageData { public: ImageData(); // Takes ownership of the pix. ImageData(bool vertical, Pix* pix); ~ImageData(); // Builds and returns an ImageData from the basic data. Note that imagedata, // truth_text, and box_text are all the actual file data, NOT filenames. static ImageData* Build(const char* name, int page_number, const char* lang, const char* imagedata, int imagedatasize, const char* truth_text, const char* box_text); // Writes to the given file. Returns false in case of error. bool Serialize(TFile* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool DeSerialize(bool swap, TFile* fp); // Other accessors. const STRING& imagefilename() const { return imagefilename_; } void set_imagefilename(const STRING& name) { imagefilename_ = name; } int page_number() const { return page_number_; } void set_page_number(int num) { page_number_ = num; } const GenericVector& image_data() const { return image_data_; } const STRING& language() const { return language_; } void set_language(const STRING& lang) { language_ = lang; } const STRING& transcription() const { return transcription_; } const GenericVector& boxes() const { return boxes_; } const GenericVector& box_texts() const { return box_texts_; } const STRING& box_text(int index) const { return box_texts_[index]; } // Saves the given Pix as a PNG-encoded string and destroys it. void SetPix(Pix* pix); // Returns the Pix image for *this. Must be pixDestroyed after use. Pix* GetPix() const; // Gets anything and everything with a non-NULL pointer, prescaled to a // given target_height (if 0, then the original image height), and aligned. // Also returns (if not NULL) the width and height of the scaled image. // The return value is the scale factor that was applied to the image to // achieve the target_height. float PreScale(int target_height, Pix** pix, int* scaled_width, int* scaled_height, GenericVector* boxes) const; int MemoryUsed() const; // Draws the data in a new window. void Display() const; // Adds the supplied boxes and transcriptions that correspond to the correct // page number. void AddBoxes(const GenericVector& boxes, const GenericVector& texts, const GenericVector& box_pages); private: // Saves the given Pix as a PNG-encoded string and destroys it. static void SetPixInternal(Pix* pix, GenericVector* image_data); // Returns the Pix image for the image_data. Must be pixDestroyed after use. static Pix* GetPixInternal(const GenericVector& image_data); // Parses the text string as a box file and adds any discovered boxes that // match the page number. Returns false on error. bool AddBoxes(const char* box_text); private: STRING imagefilename_; // File to read image from. inT32 page_number_; // Page number if multi-page tif or -1. GenericVector image_data_; // PNG file data. STRING language_; // Language code for image. STRING transcription_; // UTF-8 ground truth of image. GenericVector boxes_; // If non-empty boxes of the image. GenericVector box_texts_; // String for text in each box. bool vertical_text_; // Image has been rotated from vertical. }; // A collection of ImageData that knows roughly how much memory it is using. class DocumentData { public: explicit DocumentData(const STRING& name); ~DocumentData(); // Reads all the pages in the given lstmf filename to the cache. The reader // is used to read the file. bool LoadDocument(const char* filename, const char* lang, int start_page, inT64 max_memory, FileReader reader); // Writes all the pages to the given filename. Returns false on error. bool SaveDocument(const char* filename, FileWriter writer); bool SaveToBuffer(GenericVector* buffer); // Adds the given page data to this document, counting up memory. void AddPageToDocument(ImageData* page); const STRING& document_name() const { return document_name_; } int NumPages() const { return total_pages_; } inT64 memory_used() const { return memory_used_; } // Returns a pointer to the page with the given index, modulo the total // number of pages, recaching if needed. const ImageData* GetPage(int index); // Takes ownership of the given page index. The page is made NULL in *this. ImageData* TakePage(int index) { ImageData* page = pages_[index]; pages_[index] = NULL; return page; } private: // Loads as many pages can fit in max_memory_ starting at index pages_offset_. bool ReCachePages(); private: // A name for this document. STRING document_name_; // The language of this document. STRING lang_; // A group of pages that corresponds in some loose way to a document. PointerVector pages_; // Page number of the first index in pages_. int pages_offset_; // Total number of pages in document (may exceed size of pages_.) int total_pages_; // Total of all pix sizes in the document. inT64 memory_used_; // Max memory to use at any time. inT64 max_memory_; // Saved reader from LoadDocument to allow re-caching. FileReader reader_; }; // A collection of DocumentData that knows roughly how much memory it is using. class DocumentCache { public: explicit DocumentCache(inT64 max_memory); ~DocumentCache(); // Adds all the documents in the list of filenames, counting memory. // The reader is used to read the files. bool LoadDocuments(const GenericVector& filenames, const char* lang, FileReader reader); // Adds document to the cache, throwing out other documents if needed. bool AddToCache(DocumentData* data); // Finds and returns a document by name. DocumentData* FindDocument(const STRING& document_name) const; // Returns a page by serial number, selecting them in a round-robin fashion // from all the documents. const ImageData* GetPageBySerial(int serial); const PointerVector& documents() const { return documents_; } int total_pages() const { return total_pages_; } private: // A group of pages that corresponds in some loose way to a document. PointerVector documents_; // Total of all pages. int total_pages_; // Total of all memory used by the cache. inT64 memory_used_; // Max memory allowed in this cache. inT64 max_memory_; }; } // namespace tesseract #endif // TESSERACT_IMAGE_IMAGEDATA_H_