tesseract  4.00.00dev
imagedata.h
Go to the documentation of this file.
1 // File: imagedata.h
3 // Description: Class to hold information about a single image and its
4 // corresponding boxes or text file.
5 // Author: Ray Smith
6 // Created: Mon Jul 22 14:17:06 PDT 2013
7 //
8 // (C) Copyright 2013, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
19 
20 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
21 #define TESSERACT_IMAGE_IMAGEDATA_H_
22 
23 
24 #include "genericvector.h"
25 #include "normalis.h"
26 #include "rect.h"
27 #include "strngs.h"
28 #include "svutil.h"
29 
30 struct Pix;
31 
32 namespace tesseract {
33 
// Amount of padding to apply in output pixels in feature mode.
const int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
const int kImagePadding = 4;
38 
// Enum to determine the caching and data sequencing strategy.
enum CachingStrategy {
  // Reads all of one file before moving on to the next. Requires samples to be
  // shuffled across files. Uses the count of samples in the first file as
  // the count in all the files to achieve high-speed random access. As a
  // consequence, if subsequent files are smaller, they get entries used more
  // than once, and if subsequent files are larger, some entries are not used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL,
  // Reads one sample from each file in rotation. Does not require shuffled
  // samples, but is extremely disk-intensive. Samples in smaller files also
  // get used more often than samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN
};
54 
55 class WordFeature {
56  public:
57  WordFeature();
58  WordFeature(const FCOORD& fcoord, uinT8 dir);
59 
60  // Computes the maximum x and y value in the features.
62  int* max_x, int* max_y);
63  // Draws the features in the given window.
64  static void Draw(const GenericVector<WordFeature>& features,
65  ScrollView* window);
66 
67  // Accessors.
68  int x() const { return x_; }
69  int y() const { return y_; }
70  int dir() const { return dir_; }
71 
72  // Writes to the given file. Returns false in case of error.
73  bool Serialize(FILE* fp) const;
74  // Reads from the given file. Returns false in case of error.
75  // If swap is true, assumes a big/little-endian swap is needed.
76  bool DeSerialize(bool swap, FILE* fp);
77 
78  private:
79  inT16 x_;
80  uinT8 y_;
81  uinT8 dir_;
82 };
83 
84 // A floating-point version of WordFeature, used as an intermediate during
85 // scaling.
87  static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
88  GenericVector<FloatWordFeature>* float_features);
89  // Sort function to sort first by x-bucket, then by y.
90  static int SortByXBucket(const void*, const void*);
91 
92  float x;
93  float y;
94  float dir;
95  int x_bucket;
96 };
97 
98 // Class to hold information on a single image:
99 // Filename, cached image as a Pix*, character boxes, text transcription.
100 // The text transcription is the ground truth UTF-8 text for the image.
101 // Character boxes are optional and indicate the desired segmentation of
102 // the text into recognition units.
103 class ImageData {
104  public:
105  ImageData();
106  // Takes ownership of the pix.
107  ImageData(bool vertical, Pix* pix);
108  ~ImageData();
109 
110  // Builds and returns an ImageData from the basic data. Note that imagedata,
111  // truth_text, and box_text are all the actual file data, NOT filenames.
112  static ImageData* Build(const char* name, int page_number, const char* lang,
113  const char* imagedata, int imagedatasize,
114  const char* truth_text, const char* box_text);
115 
116  // Writes to the given file. Returns false in case of error.
117  bool Serialize(TFile* fp) const;
118  // Reads from the given file. Returns false in case of error.
119  bool DeSerialize(TFile* fp);
120  // As DeSerialize, but only seeks past the data - hence a static method.
121  static bool SkipDeSerialize(tesseract::TFile* fp);
122 
123  // Other accessors.
124  const STRING& imagefilename() const {
125  return imagefilename_;
126  }
127  void set_imagefilename(const STRING& name) {
128  imagefilename_ = name;
129  }
130  int page_number() const {
131  return page_number_;
132  }
133  void set_page_number(int num) {
134  page_number_ = num;
135  }
137  return image_data_;
138  }
139  const STRING& language() const {
140  return language_;
141  }
142  void set_language(const STRING& lang) {
143  language_ = lang;
144  }
145  const STRING& transcription() const {
146  return transcription_;
147  }
148  const GenericVector<TBOX>& boxes() const {
149  return boxes_;
150  }
152  return box_texts_;
153  }
154  const STRING& box_text(int index) const {
155  return box_texts_[index];
156  }
157  // Saves the given Pix as a PNG-encoded string and destroys it.
158  void SetPix(Pix* pix);
159  // Returns the Pix image for *this. Must be pixDestroyed after use.
160  Pix* GetPix() const;
161  // Gets anything and everything with a non-NULL pointer, prescaled to a
162  // given target_height (if 0, then the original image height), and aligned.
163  // Also returns (if not NULL) the width and height of the scaled image.
164  // The return value is the scaled Pix, which must be pixDestroyed after use,
165  // and scale_factor (if not NULL) is set to the scale factor that was applied
166  // to the image to achieve the target_height.
167  Pix* PreScale(int target_height, int max_height, float* scale_factor,
168  int* scaled_width, int* scaled_height,
169  GenericVector<TBOX>* boxes) const;
170 
171  int MemoryUsed() const;
172 
173  // Draws the data in a new window.
174  void Display() const;
175 
176  // Adds the supplied boxes and transcriptions that correspond to the correct
177  // page number.
178  void AddBoxes(const GenericVector<TBOX>& boxes,
179  const GenericVector<STRING>& texts,
180  const GenericVector<int>& box_pages);
181 
182  private:
183  // Saves the given Pix as a PNG-encoded string and destroys it.
184  static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
185  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
186  static Pix* GetPixInternal(const GenericVector<char>& image_data);
187  // Parses the text string as a box file and adds any discovered boxes that
188  // match the page number. Returns false on error.
189  bool AddBoxes(const char* box_text);
190 
191  private:
192  STRING imagefilename_; // File to read image from.
193  inT32 page_number_; // Page number if multi-page tif or -1.
194  GenericVector<char> image_data_; // PNG file data.
195  STRING language_; // Language code for image.
196  STRING transcription_; // UTF-8 ground truth of image.
197  GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
198  GenericVector<STRING> box_texts_; // String for text in each box.
199  bool vertical_text_; // Image has been rotated from vertical.
200 };
201 
202 // A collection of ImageData that knows roughly how much memory it is using.
204  friend void* ReCachePagesFunc(void* data);
205 
206  public:
207  explicit DocumentData(const STRING& name);
208  ~DocumentData();
209 
210  // Reads all the pages in the given lstmf filename to the cache. The reader
211  // is used to read the file.
212  bool LoadDocument(const char* filename, int start_page, inT64 max_memory,
213  FileReader reader);
214  // Sets up the document, without actually loading it.
215  void SetDocument(const char* filename, inT64 max_memory, FileReader reader);
216  // Writes all the pages to the given filename. Returns false on error.
217  bool SaveDocument(const char* filename, FileWriter writer);
218  bool SaveToBuffer(GenericVector<char>* buffer);
219 
220  // Adds the given page data to this document, counting up memory.
221  void AddPageToDocument(ImageData* page);
222 
223  const STRING& document_name() const {
224  SVAutoLock lock(&general_mutex_);
225  return document_name_;
226  }
227  int NumPages() const {
228  SVAutoLock lock(&general_mutex_);
229  return total_pages_;
230  }
231  inT64 memory_used() const {
232  SVAutoLock lock(&general_mutex_);
233  return memory_used_;
234  }
235  // If the given index is not currently loaded, loads it using a separate
236  // thread. Note: there are 4 cases:
237  // Document uncached: IsCached() returns false, total_pages_ < 0.
238  // Required page is available: IsPageAvailable returns true. In this case,
239  // total_pages_ > 0 and
240  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
241  // Pages are loaded, but the required one is not.
242  // The requested page is being loaded by LoadPageInBackground. In this case,
243  // index == pages_offset_. Once the loading starts, the pages lock is held
244  // until it completes, at which point IsPageAvailable will unblock and return
245  // true.
246  void LoadPageInBackground(int index);
247  // Returns a pointer to the page with the given index, modulo the total
248  // number of pages. Blocks until the background load is completed.
249  const ImageData* GetPage(int index);
250  // Returns true if the requested page is available, and provides a pointer,
251  // which may be NULL if the document is empty. May block, even though it
252  // doesn't guarantee to return true.
253  bool IsPageAvailable(int index, ImageData** page);
254  // Takes ownership of the given page index. The page is made NULL in *this.
255  ImageData* TakePage(int index) {
256  SVAutoLock lock(&pages_mutex_);
257  ImageData* page = pages_[index];
258  pages_[index] = NULL;
259  return page;
260  }
261  // Returns true if the document is currently loaded or in the process of
262  // loading.
263  bool IsCached() const { return NumPages() >= 0; }
264  // Removes all pages from memory and frees the memory, but does not forget
265  // the document metadata. Returns the memory saved.
266  inT64 UnCache();
267  // Shuffles all the pages in the document.
268  void Shuffle();
269 
270  private:
271  // Sets the value of total_pages_ behind a mutex.
272  void set_total_pages(int total) {
273  SVAutoLock lock(&general_mutex_);
274  total_pages_ = total;
275  }
276  void set_memory_used(inT64 memory_used) {
277  SVAutoLock lock(&general_mutex_);
278  memory_used_ = memory_used;
279  }
280  // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
281  // starting at index pages_offset_.
282  bool ReCachePages();
283 
284  private:
285  // A name for this document.
286  STRING document_name_;
287  // A group of pages that corresponds in some loose way to a document.
289  // Page number of the first index in pages_.
290  int pages_offset_;
291  // Total number of pages in document (may exceed size of pages_.)
292  int total_pages_;
293  // Total of all pix sizes in the document.
294  inT64 memory_used_;
295  // Max memory to use at any time.
296  inT64 max_memory_;
297  // Saved reader from LoadDocument to allow re-caching.
298  FileReader reader_;
299  // Mutex that protects pages_ and pages_offset_ against multiple parallel
300  // loads, and provides a wait for page.
301  SVMutex pages_mutex_;
302  // Mutex that protects other data members that callers want to access without
303  // waiting for a load operation.
304  mutable SVMutex general_mutex_;
305 };
306 
307 // A collection of DocumentData that knows roughly how much memory it is using.
308 // Note that while it supports background read-ahead, it assumes that a single
309 // thread is accessing documents, ie it is not safe for multiple threads to
310 // access different documents in parallel, as one may de-cache the other's
311 // content.
313  public:
314  explicit DocumentCache(inT64 max_memory);
315  ~DocumentCache();
316 
317  // Deletes all existing documents from the cache.
318  void Clear() {
319  documents_.clear();
320  num_pages_per_doc_ = 0;
321  }
322  // Adds all the documents in the list of filenames, counting memory.
323  // The reader is used to read the files.
324  bool LoadDocuments(const GenericVector<STRING>& filenames,
325  CachingStrategy cache_strategy, FileReader reader);
326 
327  // Adds document to the cache.
328  bool AddToCache(DocumentData* data);
329 
330  // Finds and returns a document by name.
331  DocumentData* FindDocument(const STRING& document_name) const;
332 
333  // Returns a page by serial number using the current cache_strategy_ to
334  // determine the mapping from serial number to page.
335  const ImageData* GetPageBySerial(int serial) {
336  if (cache_strategy_ == CS_SEQUENTIAL)
337  return GetPageSequential(serial);
338  else
339  return GetPageRoundRobin(serial);
340  }
341 
343  return documents_;
344  }
345  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
346  // strategy, could take a long time.
347  int TotalPages();
348 
349  private:
350  // Returns a page by serial number, selecting them in a round-robin fashion
351  // from all the documents. Highly disk-intensive, but doesn't need samples
352  // to be shuffled between files to begin with.
353  const ImageData* GetPageRoundRobin(int serial);
354  // Returns a page by serial number, selecting them in sequence from each file.
355  // Requires the samples to be shuffled between the files to give a random or
356  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
357  const ImageData* GetPageSequential(int serial);
358 
359  // Helper counts the number of adjacent cached neighbour documents_ of index
360  // looking in direction dir, ie index+dir, index+2*dir etc.
361  int CountNeighbourDocs(int index, int dir);
362 
363  // A group of pages that corresponds in some loose way to a document.
364  PointerVector<DocumentData> documents_;
365  // Strategy to use for caching and serializing data samples.
366  CachingStrategy cache_strategy_;
367  // Number of pages in the first document, used as a divisor in
368  // GetPageSequential to determine the document index.
369  int num_pages_per_doc_;
370  // Max memory allowed in this cache.
371  inT64 max_memory_;
372 };
373 
374 } // namespace tesseract
375 
376 
377 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:91
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:335
Definition: svutil.h:90
Definition: points.h:189
int64_t inT64
Definition: host.h:40
const int kFeaturePadding
Definition: imagedata.h:35
const GenericVector< char > & image_data() const
Definition: imagedata.h:136
void set_language(const STRING &lang)
Definition: imagedata.h:142
int32_t inT32
Definition: host.h:38
int page_number() const
Definition: imagedata.h:130
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
const STRING & language() const
Definition: imagedata.h:139
const int kImagePadding
Definition: imagedata.h:37
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:55
const GenericVector< STRING > & box_texts() const
Definition: imagedata.h:151
const STRING & transcription() const
Definition: imagedata.h:145
int16_t inT16
Definition: host.h:36
const STRING & box_text(int index) const
Definition: imagedata.h:154
Definition: strngs.h:45
ImageData * TakePage(int index)
Definition: imagedata.h:255
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:148
const STRING & imagefilename() const
Definition: imagedata.h:124
bool IsCached() const
Definition: imagedata.h:263
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:66
inT64 memory_used() const
Definition: imagedata.h:231
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
CachingStrategy
Definition: imagedata.h:40
const char * filename
Definition: ioapi.h:38
uint8_t uinT8
Definition: host.h:35
void set_page_number(int num)
Definition: imagedata.h:133
void * ReCachePagesFunc(void *data)
Definition: imagedata.cpp:369
void set_imagefilename(const STRING &name)
Definition: imagedata.h:127
const PointerVector< DocumentData > & documents() const
Definition: imagedata.h:342
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:83
const char features[]
Definition: feature_tests.c:2
const STRING & document_name() const
Definition: imagedata.h:223
int NumPages() const
Definition: imagedata.h:227