tesseract  4.00.00dev
linerec.cpp
Go to the documentation of this file.
1 // File: linerec.cpp
3 // Description: Top-level line-based recognition module for Tesseract.
4 // Author: Ray Smith
5 // Created: Thu May 02 09:47:06 PST 2013
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #include "tesseractclass.h"
20 
21 #include "allheaders.h"
22 #include "boxread.h"
23 #include "imagedata.h"
24 #ifndef ANDROID_BUILD
25 #include "lstmrecognizer.h"
26 #include "recodebeam.h"
27 #endif
28 #include "ndminx.h"
29 #include "pageres.h"
30 #include "tprintf.h"
31 
32 namespace tesseract {
33 
// Tuning constants used when scoring LSTM word results in SearchWords().

// Arbitrary amount subtracted from the (scaled) certainty of a word that is
// not found in the dictionary.
// TODO(rays) How to learn this?
const float kNonDictionaryPenalty = 5.0f;

// Multiplier applied to LSTM certainties so that they are more comparable to
// the certainties produced by Tesseract.
const float kCertaintyScale = 7.0f;

// Lowest (scaled) certainty at which a dictionary word is still acceptable.
const float kWorstDictCertainty = -25.0f;
41 
42 // Generates training data for training a line recognizer, eg LSTM.
43 // Breaks the page into lines, according to the boxes, and writes them to a
44 // serialized DocumentData based on output_basename.
45 void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
46  const STRING& output_basename,
47  BLOCK_LIST *block_list) {
48  STRING lstmf_name = output_basename + ".lstmf";
49  DocumentData images(lstmf_name);
50  if (applybox_page > 0) {
51  // Load existing document for the previous pages.
52  if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
53  tprintf("Failed to read training data from %s!\n", lstmf_name.string());
54  return;
55  }
56  }
57  GenericVector<TBOX> boxes;
59  // Get the boxes for this page, if there are any.
60  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, NULL,
61  NULL) ||
62  boxes.empty()) {
63  tprintf("Failed to read boxes from %s\n", input_imagename.string());
64  return;
65  }
66  TrainFromBoxes(boxes, texts, block_list, &images);
67  images.Shuffle();
68  if (!images.SaveDocument(lstmf_name.string(), NULL)) {
69  tprintf("Failed to write training data to %s!\n", lstmf_name.string());
70  }
71 }
72 
73 // Generates training data for training a line recognizer, eg LSTM.
74 // Breaks the boxes into lines, normalizes them, converts to ImageData and
75 // appends them to the given training_data.
77  const GenericVector<STRING>& texts,
78  BLOCK_LIST *block_list,
79  DocumentData* training_data) {
80  int box_count = boxes.size();
81  // Process all the text lines in this page, as defined by the boxes.
82  int end_box = 0;
83  // Don't let \t, which marks newlines in the box file, get into the line
84  // content, as that makes the line unusable in training.
85  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
86  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
87  // Find the textline of boxes starting at start and their bounding box.
88  TBOX line_box = boxes[start_box];
89  STRING line_str = texts[start_box];
90  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
91  ++end_box) {
92  line_box += boxes[end_box];
93  line_str += texts[end_box];
94  }
95  // Find the most overlapping block.
96  BLOCK* best_block = NULL;
97  int best_overlap = 0;
98  BLOCK_IT b_it(block_list);
99  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
100  BLOCK* block = b_it.data();
101  if (block->poly_block() != NULL && !block->poly_block()->IsText())
102  continue; // Not a text block.
103  TBOX block_box = block->bounding_box();
104  block_box.rotate(block->re_rotation());
105  if (block_box.major_overlap(line_box)) {
106  TBOX overlap_box = line_box.intersection(block_box);
107  if (overlap_box.area() > best_overlap) {
108  best_overlap = overlap_box.area();
109  best_block = block;
110  }
111  }
112  }
113  ImageData* imagedata = NULL;
114  if (best_block == NULL) {
115  tprintf("No block overlapping textline: %s\n", line_str.string());
116  } else {
117  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
118  *best_block);
119  }
120  if (imagedata != NULL)
121  training_data->AddPageToDocument(imagedata);
122  // Don't let \t, which marks newlines in the box file, get into the line
123  // content, as that makes the line unusable in training.
124  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
125  }
126 }
127 
128 // Returns an Imagedata containing the image of the given box,
129 // and ground truth boxes/truth text if available in the input.
130 // The image is not normalized in any way.
132  const GenericVector<TBOX>& boxes,
133  const GenericVector<STRING>& texts,
134  int start_box, int end_box,
135  const BLOCK& block) {
136  TBOX revised_box;
137  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
138  &revised_box);
139  if (image_data == NULL) return NULL;
140  image_data->set_page_number(applybox_page);
141  // Copy the boxes and shift them so they are relative to the image.
142  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
143  ICOORD shift = -revised_box.botleft();
144  GenericVector<TBOX> line_boxes;
145  GenericVector<STRING> line_texts;
146  for (int b = start_box; b < end_box; ++b) {
147  TBOX box = boxes[b];
148  box.rotate(block_rotation);
149  box.move(shift);
150  line_boxes.push_back(box);
151  line_texts.push_back(texts[b]);
152  }
153  GenericVector<int> page_numbers;
154  page_numbers.init_to_size(line_boxes.size(), applybox_page);
155  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156  return image_data;
157 }
158 
159 // Helper gets the image of a rectangle, using the block.re_rotation() if
160 // needed to get to the image, and rotating the result back to horizontal
161 // layout. (CJK characters will be on their left sides) The vertical text flag
162 // is set in the returned ImageData if the text was originally vertical, which
163 // can be used to invoke a different CJK recognition engine. The revised_box
164 // is also returned to enable calculation of output bounding boxes.
165 ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
166  int padding, TBOX* revised_box) const {
167  TBOX wbox = box;
168  wbox.pad(padding, padding);
169  *revised_box = wbox;
170  // Number of clockwise 90 degree rotations needed to get back to tesseract
171  // coords from the clipped image.
172  int num_rotations = 0;
173  if (block.re_rotation().y() > 0.0f)
174  num_rotations = 1;
175  else if (block.re_rotation().x() < 0.0f)
176  num_rotations = 2;
177  else if (block.re_rotation().y() < 0.0f)
178  num_rotations = 3;
179  // Handle two cases automatically: 1 the box came from the block, 2 the box
180  // came from a box file, and refers to the image, which the block may not.
181  if (block.bounding_box().major_overlap(*revised_box))
182  revised_box->rotate(block.re_rotation());
183  // Now revised_box always refers to the image.
184  // BestPix is never colormapped, but may be of any depth.
185  Pix* pix = BestPix();
186  int width = pixGetWidth(pix);
187  int height = pixGetHeight(pix);
188  TBOX image_box(0, 0, width, height);
189  // Clip to image bounds;
190  *revised_box &= image_box;
191  if (revised_box->null_box()) return NULL;
192  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
193  revised_box->width(), revised_box->height());
194  Pix* box_pix = pixClipRectangle(pix, clip_box, NULL);
195  if (box_pix == NULL) return NULL;
196  boxDestroy(&clip_box);
197  if (num_rotations > 0) {
198  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
199  pixDestroy(&box_pix);
200  box_pix = rot_pix;
201  }
202  // Convert sub-8-bit images to 8 bit.
203  int depth = pixGetDepth(box_pix);
204  if (depth < 8) {
205  Pix* grey;
206  grey = pixConvertTo8(box_pix, false);
207  pixDestroy(&box_pix);
208  box_pix = grey;
209  }
210  bool vertical_text = false;
211  if (num_rotations > 0) {
212  // Rotated the clipped revised box back to internal coordinates.
213  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
214  revised_box->rotate(rotation);
215  if (num_rotations != 2)
216  vertical_text = true;
217  }
218  return new ImageData(vertical_text, box_pix);
219 }
220 
221 #ifndef ANDROID_BUILD
222 // Recognizes a word or group of words, converting to WERD_RES in *words.
223 // Analogous to classify_word_pass1, but can handle a group of words as well.
224 void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
225  PointerVector<WERD_RES>* words) {
226  TBOX word_box = word->word->bounding_box();
227  // Get the word image - no frills.
230  // In single word mode, use the whole image without any other row/word
231  // interpretation.
232  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
233  } else {
234  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
235  if (baseline + row->descenders() < word_box.bottom())
236  word_box.set_bottom(baseline + row->descenders());
237  if (baseline + row->x_height() + row->ascenders() > word_box.top())
238  word_box.set_top(baseline + row->x_height() + row->ascenders());
239  }
240  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
241  if (im_data == NULL) return;
242  lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
243  kWorstDictCertainty / kCertaintyScale,
244  lstm_use_matrix, &unicharset, word_box, 2.0,
245  false, words);
246  delete im_data;
247  SearchWords(words);
248 }
249 
250 // Apply segmentation search to the given set of words, within the constraints
251 // of the existing ratings matrix. If there is already a best_choice on a word
252 // leaves it untouched and just sets the done/accepted etc flags.
254  // Run the segmentation search on the network outputs and make a BoxWord
255  // for each of the output words.
256  // If we drop a word as junk, then there is always a space in front of the
257  // next.
258  const Dict* stopper_dict = lstm_recognizer_->GetDict();
259  if (stopper_dict == nullptr) stopper_dict = &getDict();
260  bool any_nonspace_delimited = false;
261  for (int w = 0; w < words->size(); ++w) {
262  WERD_RES* word = (*words)[w];
263  if (word->best_choice != nullptr &&
265  any_nonspace_delimited = true;
266  break;
267  }
268  }
269  for (int w = 0; w < words->size(); ++w) {
270  WERD_RES* word = (*words)[w];
271  if (word->best_choice == NULL) {
272  // If we are using the beam search, the unicharset had better match!
274  WordSearch(word);
275  } else if (word->best_choice->unicharset() == &unicharset &&
276  !lstm_recognizer_->IsRecoding()) {
277  // We set up the word without using the dictionary, so set the permuter
278  // now, but we can only do it because the unicharsets match.
279  word->best_choice->set_permuter(
280  getDict().valid_word(*word->best_choice, true));
281  }
282  if (word->best_choice == NULL) {
283  // It is a dud.
284  word->SetupFake(lstm_recognizer_->GetUnicharset());
285  } else {
286  // Set the best state.
287  for (int i = 0; i < word->best_choice->length(); ++i) {
288  int length = word->best_choice->state(i);
289  word->best_state.push_back(length);
290  }
291  word->reject_map.initialise(word->best_choice->length());
292  word->tess_failed = false;
293  word->tess_accepted = true;
294  word->tess_would_adapt = false;
295  word->done = true;
296  word->tesseract = this;
297  float word_certainty = MIN(word->space_certainty,
298  word->best_choice->certainty());
299  word_certainty *= kCertaintyScale;
300  // Arbitrary ding factor for non-dictionary words.
301  if (!lstm_recognizer_->IsRecoding() &&
303  word_certainty -= kNonDictionaryPenalty;
304  if (getDict().stopper_debug_level >= 1) {
305  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
306  word->best_choice->certainty(), word->space_certainty,
307  MIN(word->space_certainty, word->best_choice->certainty()) *
308  kCertaintyScale,
309  word_certainty);
310  word->best_choice->print();
311  }
312  word->best_choice->set_certainty(word_certainty);
313  // Discard words that are impossibly bad, but allow a bit more for
314  // dictionary words, and keep bad words in non-space-delimited langs.
315  if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
316  any_nonspace_delimited ||
317  (word_certainty >= kWorstDictCertainty &&
319  word->tess_accepted = stopper_dict->AcceptableResult(word);
320  } else {
321  if (getDict().stopper_debug_level >= 1) {
322  tprintf("Deleting word with certainty %g\n", word_certainty);
323  word->best_choice->print();
324  }
325  // It is a dud.
326  word->SetupFake(lstm_recognizer_->GetUnicharset());
327  }
328  }
329  }
330 }
331 #endif // ANDROID_BUILD
332 
333 } // namespace tesseract.
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:314
void set_certainty(float new_val)
Definition: ratngs.h:370
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:512
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
const float kNonDictionaryPenalty
Definition: linerec.cpp:36
Definition: points.h:189
void print() const
Definition: ratngs.h:578
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, bool use_alternates, const UNICHARSET *target_unicharset, const TBOX &line_box, float score_ratio, bool one_word, PointerVector< WERD_RES > *words)
void init_to_size(int size, T t)
BOOL8 tess_failed
Definition: pageres.h:272
GenericVector< int > best_state
Definition: pageres.h:255
static const float kMinCertainty
Definition: recodebeam.h:213
WERD_CHOICE * best_choice
Definition: pageres.h:219
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
Treat the image as a single word.
Definition: publictypes.h:162
Dict & getDict()
Definition: classify.h:65
int length() const
Definition: ratngs.h:301
inT32 area() const
Definition: rect.h:118
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:427
int push_back(T object)
const UNICHARSET & GetUnicharset() const
#define tprintf(...)
Definition: tprintf.h:31
const int kImagePadding
Definition: imagedata.h:37
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:224
const char * string() const
Definition: strngs.cpp:198
bool empty() const
Definition: genericvector.h:90
float x_height() const
Definition: ocrrow.h:61
const float kWorstDictCertainty
Definition: linerec.cpp:40
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: werd.cpp:160
tesseract::Tesseract * tesseract
Definition: pageres.h:266
inT16 left() const
Definition: rect.h:68
uinT8 permuter() const
Definition: ratngs.h:344
void set_top(int y)
Definition: rect.h:57
bool IsText() const
Definition: polyblk.h:52
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:253
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:376
int stopper_debug_level
Definition: dict.h:622
Definition: strngs.h:45
Pix * BestPix() const
BOOL8 tess_would_adapt
Definition: pageres.h:281
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:344
BOOL8 tess_accepted
Definition: pageres.h:280
const float kCertaintyScale
Definition: linerec.cpp:38
const Dict * GetDict() const
bool null_box() const
Definition: rect.h:46
void pad(int xpad, int ypad)
Definition: rect.h:127
bool LoadDocument(const char *filename, int start_page, inT64 max_memory, FileReader reader)
Definition: imagedata.cpp:390
UNICHARSET unicharset
Definition: ccutil.h:68
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
float certainty() const
Definition: ratngs.h:328
inT16 top() const
Definition: rect.h:54
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
float ascenders() const
Definition: ocrrow.h:79
int state(int index) const
Definition: ratngs.h:317
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:409
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:50
inT16 height() const
Definition: rect.h:104
float y() const
Definition: points.h:212
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
void set_page_number(int num)
Definition: imagedata.h:133
inT16 width() const
Definition: rect.h:111
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:45
FCOORD re_rotation() const
Definition: ocrblock.h:138
inT16 bottom() const
Definition: rect.h:61
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:110
void set_bottom(int y)
Definition: rect.h:64
void move(const ICOORD vec)
Definition: rect.h:153
const ICOORD & botleft() const
Definition: rect.h:88
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:76
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:131
float base_line(float xpos) const
Definition: ocrrow.h:56
float space_certainty
Definition: pageres.h:300
void initialise(inT16 length)
Definition: rejctmap.cpp:318
REJMAP reject_map
Definition: pageres.h:271
float x() const
Definition: points.h:209
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
Definition: ocrrow.h:32
void rotate(const FCOORD &vec)
Definition: rect.h:189
void WordSearch(WERD_RES *word_res)
Definition: segsearch.cpp:130
Definition: ocrblock.h:30
BOOL8 done
Definition: pageres.h:282
integer coordinate
Definition: points.h:30
float descenders() const
Definition: ocrrow.h:82