tesseract  4.00.00dev
edgblob.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: edgblob.c (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author: Ray Smith
5  * Created: Tue Mar 26 16:56:25 GMT 1991
6  *
7  *(C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0(the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "scanedg.h"
21 #include "drawedg.h"
22 #include "edgloop.h"
23 #include "edgblob.h"
24 
25 // Include automatically generated configuration file if running autoconf.
26 #ifdef HAVE_CONFIG_H
27 #include "config_auto.h"
28 #endif
29 
30 #define EXTERN
31 
32 // Control parameters used in outline_complexity(), which rejects an outline
33 // if any one of the 3 conditions is satisfied:
34 // - number of children exceeds edges_max_children_per_outline
35 // - number of nested layers exceeds edges_max_children_layers
36 // - joint complexity exceeds edges_children_count_limit(as in child_count())
38  "Use the new outline complexity module");
40  "Max number of children inside a character outline");
42  "Max layers of nested children inside a character outline");
44  "turn on debugging for this module");
45 
46 
48  "Importance ratio for chucking outlines");
50  "Max holes allowed in blob");
52  "Remove boxy parents of char-like children");
54  "Min pixels for potential char in box");
56  "Max lensq/area for acceptable child outline");
58  "Min area fraction of child outline");
60  "Min area fraction of grandchild for box");
61 
69 ICOORD bleft, // corners
70 ICOORD tright): bl(bleft), tr(tright) {
71  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
72  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
73  // make array
74  buckets = new C_OUTLINE_LIST[bxdim * bydim];
75  index = 0;
76 }
77 
78 
86 C_OUTLINE_LIST *
87 OL_BUCKETS::operator()( // array access
88 inT16 x, // image coords
89 inT16 y) {
90  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
91 }
92 
93 
115  C_OUTLINE *outline, // parent outline
116  inT32 max_count, // max output
117  inT16 depth // recurion depth
118  ) {
119  inT16 xmin, xmax; // coord limits
120  inT16 ymin, ymax;
121  inT16 xindex, yindex; // current bucket
122  C_OUTLINE *child; // current child
123  inT32 child_count; // no of children
124  inT32 grandchild_count; // no of grandchildren
125  C_OUTLINE_IT child_it; // search iterator
126 
127  TBOX olbox = outline->bounding_box();
128  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
129  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
130  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
131  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
132  child_count = 0;
133  grandchild_count = 0;
134  if (++depth > edges_max_children_layers) // nested loops are too deep
135  return max_count + depth;
136 
137  for (yindex = ymin; yindex <= ymax; yindex++) {
138  for (xindex = xmin; xindex <= xmax; xindex++) {
139  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
140  if (child_it.empty())
141  continue;
142  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
143  child_it.forward()) {
144  child = child_it.data();
145  if (child == outline || !(*child < *outline))
146  continue;
147  child_count++;
148 
149  if (child_count > edges_max_children_per_outline) { // too fragmented
150  if (edges_debug)
151  tprintf("Discard outline on child_count=%d > "
152  "max_children_per_outline=%d\n",
153  child_count,
154  static_cast<inT32>(edges_max_children_per_outline));
155  return max_count + child_count;
156  }
157 
158  // Compute the "complexity" of each child recursively
159  inT32 remaining_count = max_count - child_count - grandchild_count;
160  if (remaining_count > 0)
161  grandchild_count += edges_children_per_grandchild *
162  outline_complexity(child, remaining_count, depth);
163  if (child_count + grandchild_count > max_count) { // too complex
164  if (edges_debug)
165  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
166  "> max_count=%d\n",
167  child_count, grandchild_count, max_count);
168  return child_count + grandchild_count;
169  }
170  }
171  }
172  }
173  return child_count + grandchild_count;
174 }
175 
176 
182 // TODO(rays) Merge with outline_complexity.
184  C_OUTLINE *outline, // parent outline
185  inT32 max_count // max output
186  ) {
187  BOOL8 parent_box; // could it be boxy
188  inT16 xmin, xmax; // coord limits
189  inT16 ymin, ymax;
190  inT16 xindex, yindex; // current bucket
191  C_OUTLINE *child; // current child
192  inT32 child_count; // no of children
193  inT32 grandchild_count; // no of grandchildren
194  inT32 parent_area; // potential box
195  FLOAT32 max_parent_area; // potential box
196  inT32 child_area; // current child
197  inT32 child_length; // current child
198  TBOX olbox;
199  C_OUTLINE_IT child_it; // search iterator
200 
201  olbox = outline->bounding_box();
202  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
203  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
204  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
205  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
206  child_count = 0;
207  grandchild_count = 0;
208  parent_area = 0;
209  max_parent_area = 0;
210  parent_box = TRUE;
211  for (yindex = ymin; yindex <= ymax; yindex++) {
212  for (xindex = xmin; xindex <= xmax; xindex++) {
213  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
214  if (child_it.empty())
215  continue;
216  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
217  child_it.forward()) {
218  child = child_it.data();
219  if (child != outline && *child < *outline) {
220  child_count++;
221  if (child_count <= max_count) {
222  int max_grand =(max_count - child_count) /
224  if (max_grand > 0)
225  grandchild_count += count_children(child, max_grand) *
227  else
228  grandchild_count += count_children(child, 1);
229  }
230  if (child_count + grandchild_count > max_count) {
231  if (edges_debug)
232  tprintf("Discarding parent with child count=%d, gc=%d\n",
233  child_count,grandchild_count);
234  return child_count + grandchild_count;
235  }
236  if (parent_area == 0) {
237  parent_area = outline->outer_area();
238  if (parent_area < 0)
239  parent_area = -parent_area;
240  max_parent_area = outline->bounding_box().area() * edges_boxarea;
241  if (parent_area < max_parent_area)
242  parent_box = FALSE;
243  }
244  if (parent_box &&
245  (!edges_children_fix ||
246  child->bounding_box().height() > edges_min_nonhole)) {
247  child_area = child->outer_area();
248  if (child_area < 0)
249  child_area = -child_area;
250  if (edges_children_fix) {
251  if (parent_area - child_area < max_parent_area) {
252  parent_box = FALSE;
253  continue;
254  }
255  if (grandchild_count > 0) {
256  if (edges_debug)
257  tprintf("Discarding parent of area %d, child area=%d, max%g "
258  "with gc=%d\n",
259  parent_area, child_area, max_parent_area,
260  grandchild_count);
261  return max_count + 1;
262  }
263  child_length = child->pathlength();
264  if (child_length * child_length >
265  child_area * edges_patharea_ratio) {
266  if (edges_debug)
267  tprintf("Discarding parent of area %d, child area=%d, max%g "
268  "with child length=%d\n",
269  parent_area, child_area, max_parent_area,
270  child_length);
271  return max_count + 1;
272  }
273  }
274  if (child_area < child->bounding_box().area() * edges_childarea) {
275  if (edges_debug)
276  tprintf("Discarding parent of area %d, child area=%d, max%g "
277  "with child rect=%d\n",
278  parent_area, child_area, max_parent_area,
279  child->bounding_box().area());
280  return max_count + 1;
281  }
282  }
283  }
284  }
285  }
286  }
287  return child_count + grandchild_count;
288 }
289 
290 
291 
292 
299 void OL_BUCKETS::extract_children( // recursive count
300  C_OUTLINE *outline, // parent outline
301  C_OUTLINE_IT *it // destination iterator
302  ) {
303  inT16 xmin, xmax; // coord limits
304  inT16 ymin, ymax;
305  inT16 xindex, yindex; // current bucket
306  TBOX olbox;
307  C_OUTLINE_IT child_it; // search iterator
308 
309  olbox = outline->bounding_box();
310  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
311  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
312  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
313  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
314  for (yindex = ymin; yindex <= ymax; yindex++) {
315  for (xindex = xmin; xindex <= xmax; xindex++) {
316  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
317  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
318  child_it.forward()) {
319  if (*child_it.data() < *outline) {
320  it->add_after_then_move(child_it.extract());
321  }
322  }
323  }
324  }
325 }
326 
327 
334 void extract_edges(Pix* pix, // thresholded image
335  BLOCK *block) { // block to scan
336  C_OUTLINE_LIST outlines; // outlines in block
337  C_OUTLINE_IT out_it = &outlines;
338 
339  block_edges(pix, block, &out_it);
340  ICOORD bleft; // block box
341  ICOORD tright;
342  block->bounding_box(bleft, tright);
343  // make blobs
344  outlines_to_blobs(block, bleft, tright, &outlines);
345 }
346 
347 
354 void outlines_to_blobs( // find blobs
355  BLOCK *block, // block to scan
356  ICOORD bleft,
357  ICOORD tright,
358  C_OUTLINE_LIST *outlines) {
359  // make buckets
360  OL_BUCKETS buckets(bleft, tright);
361 
362  fill_buckets(outlines, &buckets);
363  empty_buckets(block, &buckets);
364 }
365 
366 
373 void fill_buckets( // find blobs
374  C_OUTLINE_LIST *outlines, // outlines in block
375  OL_BUCKETS *buckets // output buckets
376  ) {
377  TBOX ol_box; // outline box
378  C_OUTLINE_IT out_it = outlines; // iterator
379  C_OUTLINE_IT bucket_it; // iterator in bucket
380  C_OUTLINE *outline; // current outline
381 
382  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
383  outline = out_it.extract(); // take off list
384  // get box
385  ol_box = outline->bounding_box();
386  bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
387  bucket_it.add_to_end(outline);
388  }
389 }
390 
391 
398 void empty_buckets( // find blobs
399  BLOCK *block, // block to scan
400  OL_BUCKETS *buckets // output buckets
401  ) {
402  BOOL8 good_blob; // healthy blob
403  C_OUTLINE_LIST outlines; // outlines in block
404  // iterator
405  C_OUTLINE_IT out_it = &outlines;
406  C_OUTLINE_IT bucket_it = buckets->start_scan();
407  C_OUTLINE_IT parent_it; // parent outline
408  C_BLOB_IT good_blobs = block->blob_list();
409  C_BLOB_IT junk_blobs = block->reject_blobs();
410 
411  while (!bucket_it.empty()) {
412  out_it.set_to_list(&outlines);
413  do {
414  parent_it = bucket_it; // find outermost
415  do {
416  bucket_it.forward();
417  } while (!bucket_it.at_first() &&
418  !(*parent_it.data() < *bucket_it.data()));
419  } while (!bucket_it.at_first());
420 
421  // move to new list
422  out_it.add_after_then_move(parent_it.extract());
423  good_blob = capture_children(buckets, &junk_blobs, &out_it);
424  C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
425  &junk_blobs);
426 
427  bucket_it.set_to_list(buckets->scan_next());
428  }
429 }
430 
431 
440 BOOL8 capture_children( // find children
441  OL_BUCKETS *buckets, // bucket sort clanss
442  C_BLOB_IT *reject_it, // dead grandchildren
443  C_OUTLINE_IT *blob_it // output outlines
444  ) {
445  C_OUTLINE *outline; // master outline
446  inT32 child_count; // no of children
447 
448  outline = blob_it->data();
450  child_count = buckets->outline_complexity(outline,
452  0);
453  else
454  child_count = buckets->count_children(outline,
456  if (child_count > edges_children_count_limit)
457  return FALSE;
458 
459  if (child_count > 0)
460  buckets->extract_children(outline, blob_it);
461  return TRUE;
462 }
void empty_buckets(BLOCK *block, OL_BUCKETS *buckets)
Definition: edgblob.cpp:398
#define TRUE
Definition: capi.h:45
#define EXTERN
Definition: edgblob.cpp:30
int32_t inT32
Definition: host.h:38
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
inT32 area() const
Definition: rect.h:118
EXTERN int edges_children_count_limit
Definition: edgblob.cpp:50
EXTERN int edges_patharea_ratio
Definition: edgblob.cpp:56
BOOL8 capture_children(OL_BUCKETS *buckets, C_BLOB_IT *reject_it, C_OUTLINE_IT *blob_it)
Definition: edgblob.cpp:440
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:135
C_OUTLINE_LIST * scan_next()
Definition: edgblob.h:51
inT16 x() const
access function
Definition: points.h:52
EXTERN int edges_max_children_per_outline
Definition: edgblob.cpp:40
#define tprintf(...)
Definition: tprintf.h:31
EXTERN double edges_childarea
Definition: edgblob.cpp:58
void fill_buckets(C_OUTLINE_LIST *outlines, OL_BUCKETS *buckets)
Definition: edgblob.cpp:373
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
int16_t inT16
Definition: host.h:36
inT16 left() const
Definition: rect.h:68
EXTERN int edges_max_children_layers
Definition: edgblob.cpp:42
EXTERN bool edges_debug
Definition: edgblob.cpp:44
EXTERN bool edges_use_new_outline_complexity
Definition: edgblob.cpp:38
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:186
#define double_VAR(name, val, comment)
Definition: params.h:285
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
const TBOX & bounding_box() const
Definition: coutln.h:111
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
inT16 y() const
access_function
Definition: points.h:56
#define INT_VAR(name, val, comment)
Definition: params.h:276
inT32 outline_complexity(C_OUTLINE *outline, inT32 max_count, inT16 depth)
Definition: edgblob.cpp:114
EXTERN bool edges_children_fix
Definition: edgblob.cpp:52
EXTERN double edges_boxarea
Definition: edgblob.cpp:60
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:48
inT32 outer_area() const
Definition: coutln.cpp:308
inT16 top() const
Definition: rect.h:54
C_OUTLINE_LIST * operator()(inT16 x, inT16 y)
Definition: edgblob.cpp:87
float FLOAT32
Definition: host.h:42
Definition: rect.h:30
void block_edges(Pix *t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:38
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:299
inT16 height() const
Definition: rect.h:104
inT16 right() const
Definition: rect.h:75
EXTERN int edges_min_nonhole
Definition: edgblob.cpp:54
inT16 bottom() const
Definition: rect.h:61
C_OUTLINE_LIST * start_scan()
Definition: edgblob.h:45
#define BUCKETSIZE
Definition: edgblob.h:29
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
inT32 pathlength() const
Definition: coutln.h:133
Definition: ocrblock.h:30
integer coordinate
Definition: points.h:30
inT32 count_children(C_OUTLINE *outline, inT32 max_count)
Definition: edgblob.cpp:183
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:68
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:354