tesseract  4.00.00dev
devanagari_processing.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: devanagari_processing.cpp
3  * Description: Methods to process images containing devanagari symbols,
4  * prior to classification.
5  * Author: Shobhit Saxena
6  * Created: Mon Nov 17 20:26:01 IST 2008
7  *
8  * (C) Copyright 2008, Google Inc.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "allheaders.h"
26 #include "debugpixa.h"
27 #include "devanagari_processing.h"
28 #include "statistc.h"
29 #include "tordmain.h"
30 
31 // Flags controlling the debugging information for shiro-rekha splitting
32 // strategies.
// NOTE(review): listing line 33 is absent from this extract, so the opening of
// the first flag definition is not visible. Presumably it is an INT_VAR for
// devanagari_split_debuglevel (both names appear in the cross-reference index
// at the end of this page); the default value cannot be seen here - confirm
// against the upstream source.
34  "Debug level for split shiro-rekha process.");
35 
// NOTE(review): listing line 36 is absent as well. Presumably the opening of a
// BOOL_VAR for devanagari_split_debugimage; default value not visible here -
// confirm against the upstream source.
37  "Whether to create a debug image for split shiro-rekha process.");
38 
39 namespace tesseract {
40 
// NOTE(review): the line that opens this definition (listing line 41) is
// absent from this extract. The body puts every member it touches into an
// "unset" state, so this is presumably the ShiroRekhaSplitter constructor -
// confirm against the upstream source.
// No image, no segmentation, no split/debug output, unspecified xheight, no
// close operation, and NO_SPLIT for both the pageseg and the OCR strategy.
42  orig_pix_ = NULL;
43  segmentation_block_list_ = NULL;
44  splitted_image_ = NULL;
45  global_xheight_ = kUnspecifiedXheight;
46  perform_close_ = false;
47  debug_image_ = NULL;
48  pageseg_split_strategy_ = NO_SPLIT;
49  ocr_split_strategy_ = NO_SPLIT;
50 }
51 
// NOTE(review): the line that opens this definition (listing line 52) is
// absent from this extract. Presumably the ShiroRekhaSplitter destructor -
// confirm. All the visible body does is call Clear().
53  Clear();
54 }
55 
// NOTE(review): the line that opens this definition (listing line 56) is
// absent from this extract. Presumably ShiroRekhaSplitter::Clear() - confirm.
// Destroys the three images held by the object (original, split output and
// debug image) and resets the remaining members to the same values the
// constructor body assigns.
57  pixDestroy(&orig_pix_);
58  pixDestroy(&splitted_image_);
59  pageseg_split_strategy_ = NO_SPLIT;
60  ocr_split_strategy_ = NO_SPLIT;
61  pixDestroy(&debug_image_);
// Only the pointer is reset here; nothing in this body frees the block list.
62  segmentation_block_list_ = NULL;
63  global_xheight_ = kUnspecifiedXheight;
64  perform_close_ = false;
65 }
66 
67 // On setting the input image, a clone of it is owned by this class.
// NOTE(review): the line that opens this definition (listing line 68) is
// absent from this extract. Presumably the setter for orig_pix_, taking a
// parameter named pix (the body reads `pix`) - confirm name and signature
// against the upstream source.
// Any previously held image is released before the new clone is stored.
69  if (orig_pix_) {
70  pixDestroy(&orig_pix_);
71  }
72  orig_pix_ = pixClone(pix);
73 }
74 
75 // Top-level method to perform splitting based on current settings.
76 // Returns false when the selected strategy is NO_SPLIT (nothing is done);
// otherwise runs the splitting pass and returns true, even if no region ended
// up being cleared.
77 // split_for_pageseg should be true if the splitting is being done prior to
78 // page segmentation. This mode uses the flag
79 // pageseg_devanagari_split_strategy to determine the splitting strategy.
// In the code below the choice is between the members pageseg_split_strategy_
// (split_for_pageseg == true) and ocr_split_strategy_ (false).
// pixa_debug may be null; when it is non-null and devanagari_split_debugimage
// is set, debug_image_ is added to it at the end of the pass.
// The result of the pass is left in splitted_image_; orig_pix_ is not modified
// by this method.
80 bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa* pixa_debug) {
81  SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
82  ocr_split_strategy_;
83  if (split_strategy == NO_SPLIT) {
84  return false; // Nothing to do.
85  }
86  ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
87  split_strategy == MAXIMAL_SPLIT);
// An input image must have been set before calling Split().
88  ASSERT_HOST(orig_pix_);
// NOTE(review): listing line 89 is absent from this extract; the closing brace
// at listing line 95 has no visible opener. Presumably the missing line is
// `if (devanagari_split_debuglevel > 0) {`, matching the other tprintf guards
// in this file - confirm against the upstream source.
90  tprintf("Splitting shiro-rekha ...\n");
91  tprintf("Split strategy = %s\n",
92  split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
93  tprintf("Initial pageseg available = %s\n",
94  segmentation_block_list_ ? "yes" : "no");
95  }
96  // Create a copy of original image to store the splitting output.
97  pixDestroy(&splitted_image_);
98  splitted_image_ = pixCopy(NULL, orig_pix_);
99 
100  // Initialize debug image if required.
// NOTE(review): listing line 101 is absent from this extract; the closing
// brace at listing line 104 has no visible opener. Presumably the missing line
// is `if (devanagari_split_debugimage) {` - confirm against upstream.
102  pixDestroy(&debug_image_);
103  debug_image_ = pixConvertTo32(orig_pix_);
104  }
105 
106  // Determine all connected components in the input image. A close operation
107  // may be required prior to this, depending on the current settings.
108  Pix* pix_for_ccs = pixClone(orig_pix_);
109  if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
110  !segmentation_block_list_) {
111  if (devanagari_split_debuglevel > 0) {
112  tprintf("Performing a global close operation..\n");
113  }
114  // A global measure is available for xheight, but no local information
115  // exists.
// Swap the clone for a private copy so that PerformClose() does not touch
// orig_pix_.
116  pixDestroy(&pix_for_ccs);
117  pix_for_ccs = pixCopy(NULL, orig_pix_);
118  PerformClose(pix_for_ccs, global_xheight_);
119  }
120  Pixa* ccs;
// 8-connected components. Only the Pixa is kept; the returned Boxa is
// destroyed immediately.
121  Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
122  boxaDestroy(&tmp_boxa);
123  pixDestroy(&pix_for_ccs);
124 
125  // Iterate over all connected components. Get their bounding boxes and clip
126  // out the image regions corresponding to these boxes from the original image.
127  // Conditionally run splitting on each of them.
// Boxes to erase are collected first and applied to splitted_image_ only
// after all components have been examined.
128  Boxa* regions_to_clear = boxaCreate(0);
129  int num_ccs = 0;
130  if (ccs != nullptr) num_ccs = pixaGetCount(ccs);
131  for (int i = 0; i < num_ccs; ++i) {
// Direct access into the Pixa's box array: no clone is taken, and accordingly
// this loop never calls boxDestroy() on `box`.
132  Box* box = ccs->boxa->box[i];
// The component is clipped from orig_pix_, i.e. from the image without the
// optional close operation applied.
133  Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
134  ASSERT_HOST(word_pix);
135  int xheight = GetXheightForCC(box);
136  if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
// NOTE(review): listing line 137 is absent from this extract, so the end of
// the condition above is not visible. Presumably it is
// `devanagari_split_debugimage) {`, since the guarded statement draws into
// debug_image_ - confirm against upstream.
// Outlines components for which a segmentation exists but no row matched.
138  pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
139  }
140  // If some xheight measure is available, attempt to pre-eliminate small
141  // blobs from the shiro-rekha process. This is primarily to save the CCs
142  // corresponding to punctuation marks/small dots etc which are part of
143  // larger graphemes.
// A component is processed when no xheight is known, or when it is wider than
// xheight / 3 and taller than xheight / 2 (integer division).
144  if (xheight == kUnspecifiedXheight ||
145  (box->w > xheight / 3 && box->h > xheight / 2)) {
146  SplitWordShiroRekha(split_strategy, word_pix, xheight,
147  box->x, box->y, regions_to_clear);
148  } else if (devanagari_split_debuglevel > 0) {
149  tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
150  box->x, box->y, box->w, box->h);
151  }
152  pixDestroy(&word_pix);
153  }
154  // Actually clear the boxes now.
155  for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
156  Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
157  pixClearInRect(splitted_image_, box);
158  boxDestroy(&box);
159  }
160  boxaDestroy(&regions_to_clear);
161  pixaDestroy(&ccs);
162  if (devanagari_split_debugimage && pixa_debug != nullptr) {
// The caption records which of the two modes produced the debug image.
163  pixa_debug->AddPix(debug_image_,
164  split_for_pageseg ? "pageseg_split" : "ocr_split");
165  }
166  return true;
167 }
168 
169 // Method to perform a close operation on the input image. The xheight
170 // estimate decides the size of sel used.
171 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
172  pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
173 }
174 
175 // This method resolves the cc bbox to a particular row and returns the row's
176 // xheight.
177 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
178  if (!segmentation_block_list_) {
179  return global_xheight_;
180  }
181  // Compute the box coordinates in Tesseract's coordinate system.
182  TBOX bbox(cc_bbox->x,
183  pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
184  cc_bbox->x + cc_bbox->w,
185  pixGetHeight(orig_pix_) - cc_bbox->y - 1);
186  // Iterate over all blocks.
187  BLOCK_IT block_it(segmentation_block_list_);
188  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
189  BLOCK* block = block_it.data();
190  // Iterate over all rows in the block.
191  ROW_IT row_it(block->row_list());
192  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
193  ROW* row = row_it.data();
194  if (!row->bounding_box().major_overlap(bbox)) {
195  continue;
196  }
197  // Row could be skewed, warped, etc. Use the position of the box to
198  // determine the baseline position of the row for that x-coordinate.
199  // Create a square TBOX whose baseline's mid-point lies at this point
200  // and side is row's xheight. Take the overlap of this box with the input
201  // box and check if it is a 'major overlap'. If so, this box lies in this
202  // row. In that case, return the xheight for this row.
203  float box_middle = 0.5 * (bbox.left() + bbox.right());
204  int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
205  TBOX test_box(box_middle - row->x_height() / 2,
206  baseline,
207  box_middle + row->x_height() / 2,
208  static_cast<int>(baseline + row->x_height()));
209  // Compute overlap. If it is is a major overlap, this is the right row.
210  if (bbox.major_overlap(test_box)) {
211  return row->x_height();
212  }
213  }
214  }
215  // No row found for this bbox.
216  return kUnspecifiedXheight;
217 }
218 
219 // Appends to regions_to_clear the regions (boxes) which should be cleared in
220 // the image so as to perform shiro-rekha splitting. Pix is assumed to carry one
221 // (or less) word only. Xheight measure could be the global estimate, the row
222 // estimate, or unspecified. If unspecified, over splitting may occur, since a
223 // conservative estimate of stroke width along with an associated multiplier
224 // is used in its place. It is advisable to have a specified xheight when
225 // splitting for classification/training.
226 // A vertical projection histogram of all the on-pixels in the input pix is
227 // computed. The maxima of this histogram is regarded as an approximate location
228 // of the shiro-rekha. By descending on the maxima's peak on both sides,
229 // stroke width of shiro-rekha is estimated.
230 // A horizontal projection histogram is computed for a sub-image of the input
231 // image, which extends from just below the shiro-rekha down to a certain
232 // leeway. The leeway depends on the input xheight, if provided, else a
233 // conservative multiplier on approximate stroke width is used (which may lead
234 // to over-splitting).
// Naming note: in the code the first histogram (one bin per row) is obtained
// through GetShiroRekhaYExtents(), and the second (one bin per column) through
// PixelHistogram::ConstructVerticalCountHist().
// word_left / word_top are added to the x / y of every emitted box, so the
// boxes are expressed relative to whatever origin the caller passes in.
// Nothing is returned; the function does nothing for NO_SPLIT.
235 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
236  Pix* pix,
237  int xheight,
238  int word_left,
239  int word_top,
240  Boxa* regions_to_clear) {
241  if (split_strategy == NO_SPLIT) {
242  return;
243  }
244  int width = pixGetWidth(pix);
245  int height = pixGetHeight(pix);
246  // Statistically determine the yextents of the shiro-rekha.
247  int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
248  GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
249  &shirorekha_ylevel);
250  // Since the shiro rekha is also a stroke, its width is equal to the stroke
251  // width.
252  int stroke_width = shirorekha_bottom - shirorekha_top + 1;
253 
254  // Some safeguards to protect CCs we do not want to be split.
255  // These are particularly useful when the word wasn't eliminated earlier
256  // because xheight information was unavailable.
257  if (shirorekha_ylevel > height / 2) {
258  // Shirorekha shouldn't be in the bottom half of the word.
259  if (devanagari_split_debuglevel > 0) {
260  tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
261  word_left, word_top);
262  }
263  return;
264  }
265  if (stroke_width > height / 3) {
266  // Even the boldest of fonts shouldn't do this.
267  if (devanagari_split_debuglevel > 0) {
268  tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
269  word_left, word_top);
270  }
271  return;
272  }
273 
274  // Clear the ascender and descender regions of the word.
275  // Obtain a vertical projection histogram for the resulting image.
// The first cleared band is the shiro-rekha itself with a margin: it starts
// stroke_width / 3 above shirorekha_top and is 5 * stroke_width / 3 tall.
276  Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
277  width, 5 * stroke_width / 3);
278  Pix* word_in_xheight = pixCopy(NULL, pix);
279  pixClearInRect(word_in_xheight, box_to_clear);
280  // Also clear any pixels which are below shirorekha_bottom + some leeway.
281  // The leeway is set to xheight if the information is available, else it is a
282  // multiplier applied to the stroke width.
283  int leeway_to_keep = stroke_width * 3;
284  if (xheight != kUnspecifiedXheight) {
285  // This is because the xheight-region typically includes the shiro-rekha
286  // inside it, i.e., the top of the xheight range corresponds to the top of
287  // shiro-rekha.
288  leeway_to_keep = xheight - stroke_width;
289  }
// The same Box object is reused for the second band by overwriting its y and
// h fields; x and w still span the full width.
290  box_to_clear->y = shirorekha_bottom + leeway_to_keep;
291  box_to_clear->h = height - box_to_clear->y;
292  pixClearInRect(word_in_xheight, box_to_clear);
293  boxDestroy(&box_to_clear);
294 
295  PixelHistogram vert_hist;
296  vert_hist.ConstructVerticalCountHist(word_in_xheight);
297  pixDestroy(&word_in_xheight);
298 
299  // If the number of black pixel in any column of the image is less than a
300  // fraction of the stroke width, treat it as noise / a stray mark. Perform
301  // these changes inside the vert_hist data itself, as that is used later on as
302  // a bit vector for the final split decision at every column.
303  for (int i = 0; i < width; ++i) {
304  if (vert_hist.hist()[i] <= stroke_width / 4)
305  vert_hist.hist()[i] = 0;
306  else
307  vert_hist.hist()[i] = 1;
308  }
309  // In order to split the line at any point, we make sure that the width of the
310  // gap is at least half the stroke width.
// The same minimum (stroke_width / 2) is also required of the run of non-empty
// columns seen since the last accepted split (cur_component_width).
311  int i = 0;
312  int cur_component_width = 0;
313  while (i < width) {
314  if (!vert_hist.hist()[i]) {
// j becomes the length of the run of empty columns that starts at i.
315  int j = 0;
316  while (i + j < width && !vert_hist.hist()[i+j])
317  ++j;
318  if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
319  // Perform a shiro-rekha split. The intervening region lies from i to
320  // i+j-1.
321  // A minimal single-pixel split makes the estimation of intra- and
322  // inter-word spacing easier during page layout analysis,
323  // whereas a maximal split may be needed for OCR, depending on
324  // how the engine was trained.
325  bool minimal_split = (split_strategy == MINIMAL_SPLIT);
326  int split_width = minimal_split ? 1 : j;
327  int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
// In minimal mode, gaps touching the left or right edge of the word image are
// not split.
328  if (!minimal_split || (i != 0 && i + j != width)) {
329  Box* box_to_clear =
330  boxCreate(word_left + split_left,
331  word_top + shirorekha_top - stroke_width / 3,
332  split_width,
333  5 * stroke_width / 3);
334  if (box_to_clear) {
// Added as a clone; the local handle is released by the boxDestroy() below.
335  boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
336  // Mark this in the debug image if needed.
// NOTE(review): listing line 337 is absent from this extract; the closing
// brace at listing line 339 has no visible opener. Presumably the missing line
// is `if (devanagari_split_debugimage) {` - confirm against upstream.
338  pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
339  }
340  boxDestroy(&box_to_clear);
341  cur_component_width = 0;
342  }
343  }
344  }
345  i += j;
346  } else {
347  ++i;
348  ++cur_component_width;
349  }
350  }
351 }
352 
353 // Refreshes the words in the segmentation block list by using blobs in the
354 // input blob list (new_blobs).
355 // The segmentation block list must be set.
// When devanagari_split_debugimage is set and a debug image exists, the blobs
// reported back as not found and the blobs in new_blobs are also drawn onto
// debug_image_.
// NOTE(review): the first line of the signature (listing line 356) is absent
// from this extract. The cross-reference index at the end of this page lists
// `void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)`; presumably
// this is the ShiroRekhaSplitter member of that name - confirm.
357  C_BLOB_LIST* new_blobs) {
358  // The segmentation block list must have been specified.
359  ASSERT_HOST(segmentation_block_list_);
360  if (devanagari_split_debuglevel > 0) {
361  tprintf("Before refreshing blobs:\n");
362  PrintSegmentationStats(segmentation_block_list_);
363  tprintf("New Blobs found: %d\n", new_blobs->length());
364  }
365 
366  C_BLOB_LIST not_found_blobs;
// The not-found list is only requested (non-NULL) when it will be plotted
// below.
367  RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
368  new_blobs,
369  ((devanagari_split_debugimage && debug_image_) ?
370  &not_found_blobs : NULL));
371 
372  if (devanagari_split_debuglevel > 0) {
373  tprintf("After refreshing blobs:\n");
374  PrintSegmentationStats(segmentation_block_list_);
375  }
376  if (devanagari_split_debugimage && debug_image_) {
377  // Plot out the original blobs for which no match was found in the new
378  // all_blobs list.
379  C_BLOB_IT not_found_it(&not_found_blobs);
380  for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
381  not_found_it.forward()) {
382  C_BLOB* not_found = not_found_it.data();
383  TBOX not_found_box = not_found->bounding_box();
// GetBoxForTBOX() returns a new Box, so it is destroyed after drawing.
384  Box* box_to_plot = GetBoxForTBOX(not_found_box);
385  pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
386  boxDestroy(&box_to_plot);
387  }
388 
389  // Plot out the blobs unused from all blobs.
// NOTE(review): this draws whatever is in new_blobs at this point; that these
// are only the "unused" blobs depends on RefreshWordBlobsFromNewBlobs(), which
// is not visible here - confirm.
390  C_BLOB_IT all_blobs_it(new_blobs);
391  for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
392  all_blobs_it.forward()) {
393  C_BLOB* a_blob = all_blobs_it.data();
394  Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
395  pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
396  boxDestroy(&box_to_plot);
397  }
398  }
399 }
400 
401 // Returns a new box object for the corresponding TBOX, based on the original
402 // image's coordinate system.
403 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
404  return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
405  tbox.width(), tbox.height());
406 }
407 
408 // This method returns the computed mode-height of blobs in the pix.
409 // It also prunes very small blobs from calculation.
// NOTE(review): the line that opens this definition (listing line 410) is
// absent from this extract. The body reads a parameter named pix and returns
// heights.mode(); name, class and exact signature are not visible here -
// confirm against the upstream source.
// 8-connected components; only their bounding boxes are requested.
411  Boxa* boxa = pixConnComp(pix, NULL, 8);
412  STATS heights(0, pixGetHeight(pix));
413  heights.clear();
414  for (int i = 0; i < boxaGetCount(boxa); ++i) {
415  Box* box = boxaGetBox(boxa, i, L_CLONE);
// A blob is counted when at least one of its dimensions is >= 3, i.e. it is
// pruned only when both its height and its width are below 3.
416  if (box->h >= 3 || box->w >= 3) {
417  heights.add(box->h, 1);
418  }
419  boxDestroy(&box);
420  }
421  boxaDestroy(&boxa);
422  return heights.mode();
423 }
424 
425 // This method returns y-extents of the shiro-rekha computed from the input
426 // word image.
427 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
428  int* shirorekha_top,
429  int* shirorekha_bottom,
430  int* shirorekha_ylevel) {
431  // Compute a histogram from projecting the word on a vertical line.
432  PixelHistogram hist_horiz;
433  hist_horiz.ConstructHorizontalCountHist(word_pix);
434  // Get the ylevel where the top-line exists. This is basically the global
435  // maxima in the horizontal histogram.
436  int topline_onpixel_count = 0;
437  int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
438 
439  // Get the upper and lower extents of the shiro rekha.
440  int thresh = (topline_onpixel_count * 70) / 100;
441  int ulimit = topline_ylevel;
442  int llimit = topline_ylevel;
443  while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
444  --ulimit;
445  while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh)
446  ++llimit;
447 
448  if (shirorekha_top) *shirorekha_top = ulimit;
449  if (shirorekha_bottom) *shirorekha_bottom = llimit;
450  if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
451 }
452 
453 // This method returns the global-maxima for the histogram. The frequency of
454 // the global maxima is returned in count, if specified.
// The return value is the index of the largest bin; on ties the lowest index
// wins because the comparison below is strict.
// NOTE(review): the line that opens this definition (listing line 455) is
// absent from this extract. The cross-reference index at the end of this page
// lists `int GetHistogramMaximum(int *count) const`, and the call site uses a
// PixelHistogram object, so this is presumably
// PixelHistogram::GetHistogramMaximum - confirm.
456  int best_value = 0;
457  for (int i = 0; i < length_; ++i) {
458  if (hist_[i] > hist_[best_value]) {
459  best_value = i;
460  }
461  }
// NOTE(review): when length_ is 0 the loop does not run and index 0 is still
// read here if count is non-null - confirm callers never pass an empty
// histogram.
462  if (count) {
463  *count = hist_[best_value];
464  }
465  return best_value;
466 }
467 
468 // Methods to construct histograms from images.
// NOTE(review): the line that opens this definition (listing line 469) is
// absent from this extract. The body builds one bin per image column from a
// parameter named pix, which matches how the result of
// PixelHistogram::ConstructVerticalCountHist is indexed by its caller -
// presumably this is that method; confirm against the upstream source.
// Any previous contents are discarded first; hist_ is then allocated with one
// int per column and length_ is set to the image width.
470  Clear();
471  int width = pixGetWidth(pix);
472  int height = pixGetHeight(pix);
473  hist_ = new int[width];
474  length_ = width;
475  int wpl = pixGetWpl(pix);
476  l_uint32 *data = pixGetData(pix);
477  for (int i = 0; i < width; ++i)
478  hist_[i] = 0;
// Walk the raster row by row and count the set bits of every column.
// NOTE(review): GET_DATA_BIT reads single bits, so this presumably expects a
// 1 bpp image - confirm.
479  for (int i = 0; i < height; ++i) {
480  l_uint32 *line = data + i * wpl;
481  for (int j = 0; j < width; ++j)
482  if (GET_DATA_BIT(line, j))
483  ++(hist_[j]);
484  }
485 }
486 
// NOTE(review): the line that opens this definition (listing line 487) is
// absent from this extract. The body builds one bin per image row from a
// parameter named pix; presumably this is
// PixelHistogram::ConstructHorizontalCountHist, the method called from
// GetShiroRekhaYExtents() - confirm against the upstream source.
// Any previous contents are discarded first. The per-row counts come from
// pixCountPixelsByRow() and are copied into a newly allocated hist_ of
// length_ entries; the temporary Numa is destroyed afterwards.
488  Clear();
489  Numa* counts = pixCountPixelsByRow(pix, NULL);
490  length_ = numaGetCount(counts);
491  hist_ = new int[length_];
492  for (int i = 0; i < length_; ++i) {
493  l_int32 val = 0;
494  numaGetIValue(counts, i, &val);
495  hist_[i] = val;
496  }
497  numaDestroy(&counts);
498 }
499 
500 } // namespace tesseract.
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs, C_BLOB_LIST *not_found_blobs)
Definition: ocrblock.cpp:478
bool devanagari_split_debugimage
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
#define tprintf(...)
Definition: tprintf.h:31
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
float x_height() const
Definition: ocrrow.h:61
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
#define INT_VAR(name, val, comment)
Definition: params.h:276
int GetHistogramMaximum(int *count) const
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
TBOX bounding_box() const
Definition: ocrrow.h:85
void PrintSegmentationStats(BLOCK_LIST *block_list)
Definition: ocrblock.cpp:410
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
inT16 top() const
Definition: rect.h:54
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
inT32 mode() const
Definition: statistc.cpp:115
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
inT16 height() const
Definition: rect.h:104
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111
Definition: statistc.h:33
int devanagari_split_debuglevel
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
float base_line(float xpos) const
Definition: ocrrow.h:56
Definition: ocrrow.h:32
int count(LIST var_list)
Definition: oldlist.cpp:103
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
Definition: ocrblock.h:30
void clear()
Definition: statistc.cpp:81