tesseract  4.00.00dev
tablerecog.cpp
Go to the documentation of this file.
1 // File: tablerecog.cpp
3 // Description: Helper class to help structure table areas. Given an bounding
4 // box from TableFinder, the TableRecognizer should give a
5 // StructuredTable (maybe a list in the future) of "good" tables
6 // in that area.
7 // Author: Nicholas Beato
8 // Created: Friday, Aug. 20, 2010
9 //
10 // (C) Copyright 2009, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "tablerecog.h"
28 
29 namespace tesseract {
30 
31 // The amount of space required between the ColPartitions in 2 columns
32 // of a non-lined table as a multiple of the median width.
33 const double kHorizontalSpacing = 0.30;
34 // The amount of space required between the ColPartitions in 2 rows
35 // of a non-lined table as multiples of the median height.
36 const double kVerticalSpacing = -0.2;
37 // The number of cells that the grid lines may intersect.
38 // See FindCellSplitLocations for explanation.
39 const int kCellSplitRowThreshold = 0;
41 // For "lined tables", the number of required lines. Currently a guess.
44 // Number of columns required, as a fraction of the most columns found.
45 // None of these are tweaked at all.
46 const double kRequiredColumns = 0.7;
47 // The tolerance for comparing margins of potential tables.
48 const double kMarginFactor = 1.1;
49 // The first and last row should be consistent cell height.
50 // This factor is the first and last row cell height max.
51 const double kMaxRowSize = 2.5;
52 // Number of filled columns required to form a strong table row.
53 // For small tables, this is an absolute number.
54 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
56  sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
57 // For large tables, it is a relative number
58 const double kGoodRowNumberOfColumnsLarge = 0.7;
59 // The amount of area that must be covered in a cell by ColPartitions to
60 // be considered "filled"
61 const double kMinFilledArea = 0.35;
62 
66 
68  : text_grid_(NULL),
69  line_grid_(NULL),
70  is_lined_(false),
71  space_above_(0),
72  space_below_(0),
73  space_left_(0),
74  space_right_(0),
75  median_cell_height_(0),
76  median_cell_width_(0),
77  max_text_height_(MAX_INT32) {
78 }
79 
81 }
82 
84 }
85 
87  text_grid_ = text_grid;
88 }
90  line_grid_ = line_grid;
91 }
93  max_text_height_ = height;
94 }
96  return is_lined_;
97 }
99  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
100 }
102  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
103 }
105  return row_count() * column_count();
106 }
108  bounding_box_ = box;
109 }
111  return bounding_box_;
112 }
114  return median_cell_height_;
115 }
117  return median_cell_width_;
118 }
119 int StructuredTable::row_height(int row) const {
120  ASSERT_HOST(0 <= row && row < row_count());
121  return cell_y_[row + 1] - cell_y_[row];
122 }
123 int StructuredTable::column_width(int column) const {
124  ASSERT_HOST(0 <= column && column < column_count());
125  return cell_x_[column + 1] - cell_x_[column];
126 }
128  return space_above_;
129 }
131  return space_below_;
132 }
133 
134 // At this point, we know that the lines are contained
135 // by the box (by FindLinesBoundingBox).
136 // So try to find the cell structure and make sure it works out.
137 // The assumption is that all lines span the table. If this
138 // assumption fails, the VerifyLinedTable method will
139 // abort the lined table. The TableRecognizer will fall
140 // back on FindWhitespacedStructure.
142  ClearStructure();
143 
144  // Search for all of the lines in the current box.
145  // Update the cellular structure with the exact lines.
147  box_search.SetUniqueMode(true);
148  box_search.StartRectSearch(bounding_box_);
149  ColPartition* line = NULL;
150 
151  while ((line = box_search.NextRectSearch()) != NULL) {
152  if (line->IsHorizontalLine())
153  cell_y_.push_back(line->MidY());
154  if (line->IsVerticalLine())
155  cell_x_.push_back(line->MidX());
156  }
157 
158  // HasSignificantLines should guarantee cells.
159  // Because that code is a different class, just gracefully
160  // return false. This could be an assert.
161  if (cell_x_.length() < 3 || cell_y_.length() < 3)
162  return false;
163 
164  cell_x_.sort();
165  cell_y_.sort();
166 
167  // Remove duplicates that may have occurred due to split lines.
170 
171  // The border should be the extents of line boxes, not middle.
172  cell_x_[0] = bounding_box_.left();
176 
177  // Remove duplicates that may have occurred due to moving the borders.
180 
182  CalculateStats();
184  return is_lined_;
185 }
186 
187 // Finds the cellular structure given a particular box.
189  ClearStructure();
192 
193  if (!VerifyWhitespacedTable()) {
194  return false;
195  } else {
202  CalculateStats();
203  return true;
204  }
205 }
206 
207 // Tests if a partition fits inside the table structure.
208 // Partitions must fully span a grid line in order to intersect it.
209 // This means that a partition does not intersect a line
210 // that it "just" touches. This is mainly because the assumption
211 // throughout the code is that "0" distance is a very very small space.
213  const TBOX& box = part.bounding_box();
214  for (int i = 0; i < cell_x_.length(); ++i)
215  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
216  return false;
217  for (int i = 0; i < cell_y_.length(); ++i)
218  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
219  return false;
220  return true;
221 }
222 
223 // Checks if a sub-table has multiple data cells filled.
225  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
226 }
228  return CountFilledCells(row, row, 0, column_count() - 1);
229 }
231  return CountFilledCells(0, row_count() - 1, column, column);
232 }
233 int StructuredTable::CountFilledCells(int row_start, int row_end,
234  int column_start, int column_end) {
235  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
236  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
237  column_end < column_count());
238  int cell_count = 0;
239  TBOX cell_box;
240  for (int row = row_start; row <= row_end; ++row) {
241  cell_box.set_bottom(cell_y_[row]);
242  cell_box.set_top(cell_y_[row + 1]);
243  for (int col = column_start; col <= column_end; ++col) {
244  cell_box.set_left(cell_x_[col]);
245  cell_box.set_right(cell_x_[col + 1]);
246  if (CountPartitions(cell_box) > 0)
247  ++cell_count;
248  }
249  }
250  return cell_count;
251 }
252 
253 // Makes sure that at least one cell in a row has substantial area filled.
254 // This can filter out large whitespace caused by growing tables too far
255 // and page numbers.
257  for (int i = 0; i < column_count(); ++i) {
258  double area_filled = CalculateCellFilledPercentage(row, i);
259  if (area_filled >= kMinFilledArea)
260  return true;
261  }
262  return false;
263 }
264 
265 // Finds the filled area in a cell.
266 // Assume ColPartitions do not overlap for simplicity (even though they do).
268  ASSERT_HOST(0 <= row && row <= row_count());
269  ASSERT_HOST(0 <= column && column <= column_count());
270  const TBOX kCellBox(cell_x_[column], cell_y_[row],
271  cell_x_[column + 1], cell_y_[row + 1]);
272  ASSERT_HOST(!kCellBox.null_box());
273 
275  gsearch.SetUniqueMode(true);
276  gsearch.StartRectSearch(kCellBox);
277  double area_covered = 0;
278  ColPartition* text = NULL;
279  while ((text = gsearch.NextRectSearch()) != NULL) {
280  if (text->IsTextType())
281  area_covered += text->bounding_box().intersection(kCellBox).area();
282  }
283  const inT32 current_area = kCellBox.area();
284  if (current_area == 0) {
285  return 1.0;
286  }
287  return MIN(1.0, area_covered / current_area);
288 }
289 
291 #ifndef GRAPHICS_DISABLED
292  window->Brush(ScrollView::NONE);
293  window->Pen(color);
296  for (int i = 0; i < cell_x_.length(); i++) {
297  window->Line(cell_x_[i], bounding_box_.bottom(),
298  cell_x_[i], bounding_box_.top());
299  }
300  for (int i = 0; i < cell_y_.length(); i++) {
301  window->Line(bounding_box_.left(), cell_y_[i],
302  bounding_box_.right(), cell_y_[i]);
303  }
304  window->UpdateWindow();
305 #endif
306 }
307 
308 // Clear structure information.
310  cell_x_.clear();
311  cell_y_.clear();
312  is_lined_ = false;
313  space_above_ = 0;
314  space_below_ = 0;
315  space_left_ = 0;
316  space_right_ = 0;
318  median_cell_width_ = 0;
319 }
320 
321 // When a table has lines, the lines should not intersect any partitions.
322 // The following function makes sure the previous assumption is met.
324  // Function only called when lines exist.
325  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
326  for (int i = 0; i < cell_y_.length(); ++i) {
328  return false;
329  }
330  for (int i = 0; i < cell_x_.length(); ++i) {
332  return false;
333  }
334  return true;
335 }
336 
337 // TODO(nbeato): Could be much better than this.
338 // Examples:
339 // - Caclulate the percentage of filled cells.
340 // - Calculate the average number of ColPartitions per cell.
341 // - Calculate the number of cells per row with partitions.
342 // - Check if ColPartitions in adjacent cells are similar.
343 // - Check that all columns are at least a certain width.
344 // - etc.
346  // criteria for a table, must be at least 2x3 or 3x2
347  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
348 }
349 
350 // Finds vertical splits in the ColPartitions of text_grid_ by considering
351 // all possible "good" guesses. A good guess is just the left/right sides of
352 // the partitions, since these locations will uniquely define where the
353 // extremal values where the splits can occur. The split happens
354 // in the middle of the two nearest partitions.
356  // Set of the extents of all partitions on the page.
357  GenericVectorEqEq<int> left_sides;
358  GenericVectorEqEq<int> right_sides;
359 
360  // Look at each text partition. We want to find the partitions
361  // that have extremal left/right sides. These will give us a basis
362  // for the table columns.
364  gsearch.SetUniqueMode(true);
366  ColPartition* text = NULL;
367  while ((text = gsearch.NextRectSearch()) != NULL) {
368  if (!text->IsTextType())
369  continue;
370 
371  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
372  int spacing = static_cast<int>(text->median_width() *
373  kHorizontalSpacing / 2.0 + 0.5);
374  left_sides.push_back(text->bounding_box().left() - spacing);
375  right_sides.push_back(text->bounding_box().right() + spacing);
376  }
377  // It causes disaster below, so avoid it!
378  if (left_sides.length() == 0 || right_sides.length() == 0)
379  return;
380 
381  // Since data may be inserted in grid order, we sort the left/right sides.
382  left_sides.sort();
383  right_sides.sort();
384 
385  // At this point, in the "merged list", we expect to have a left side,
386  // followed by either more left sides or a right side. The last number
387  // should be a right side. We find places where the splits occur by looking
388  // for "valleys". If we want to force gap sizes or allow overlap, change
389  // the spacing above. If you want to let lines "slice" partitions as long
390  // as it is infrequent, change the following function.
391  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
392  &cell_x_);
393 }
394 
395 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
396 // all possible "good" guesses. A good guess is just the bottom/top sides of
397 // the partitions, since these locations will uniquely define where the
398 // extremal values where the splits can occur. The split happens
399 // in the middle of the two nearest partitions.
401  // Set of the extents of all partitions on the page.
402  GenericVectorEqEq<int> bottom_sides;
403  GenericVectorEqEq<int> top_sides;
404  // We will be "shrinking" partitions, so keep the min/max around to
405  // make sure the bottom/top lines do not intersect text.
406  int min_bottom = MAX_INT32;
407  int max_top = MIN_INT32;
408 
409  // Look at each text partition. We want to find the partitions
410  // that have extremal bottom/top sides. These will give us a basis
411  // for the table rows. Because the textlines can be skewed and close due
412  // to warping, the height of the partitions is toned down a little bit.
414  gsearch.SetUniqueMode(true);
416  ColPartition* text = NULL;
417  while ((text = gsearch.NextRectSearch()) != NULL) {
418  if (!text->IsTextType())
419  continue;
420 
421  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
422  min_bottom = MIN(min_bottom, text->bounding_box().bottom());
423  max_top = MAX(max_top, text->bounding_box().top());
424 
425  // Ignore "tall" text partitions, as these are usually false positive
426  // vertical text or multiple lines pulled together.
427  if (text->bounding_box().height() > max_text_height_)
428  continue;
429 
430  int spacing = static_cast<int>(text->bounding_box().height() *
431  kVerticalSpacing / 2.0 + 0.5);
432  int bottom = text->bounding_box().bottom() - spacing;
433  int top = text->bounding_box().top() + spacing;
434  // For horizontal text, the factor can be negative. This should
435  // probably cause a warning or failure. I haven't actually checked if
436  // it happens.
437  if (bottom >= top)
438  continue;
439 
440  bottom_sides.push_back(bottom);
441  top_sides.push_back(top);
442  }
443  // It causes disaster below, so avoid it!
444  if (bottom_sides.length() == 0 || top_sides.length() == 0)
445  return;
446 
447  // Since data may be inserted in grid order, we sort the bottom/top sides.
448  bottom_sides.sort();
449  top_sides.sort();
450 
451  // At this point, in the "merged list", we expect to have a bottom side,
452  // followed by either more bottom sides or a top side. The last number
453  // should be a top side. We find places where the splits occur by looking
454  // for "valleys". If we want to force gap sizes or allow overlap, change
455  // the spacing above. If you want to let lines "slice" partitions as long
456  // as it is infrequent, change the following function.
457  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
458  &cell_y_);
459 
460  // Recover the min/max correctly since it was shifted.
461  cell_y_[0] = min_bottom;
462  cell_y_[cell_y_.length() - 1] = max_top;
463 }
464 
472 }
473 // Finds the nearest partition in grid to the table
474 // boundaries and updates the margin.
476  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
477  space_below_ = MIN(space_below_, below);
478  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
479  space_above_ = MIN(space_above_, above);
480  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
481  space_left_ = MIN(space_left_, left);
482  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
483  space_right_ = MIN(space_right_, right);
484 }
486  bool decrease) const {
487  ColPartitionGridSearch gsearch(grid);
488  gsearch.SetUniqueMode(true);
490  border);
491  ColPartition* part = NULL;
492  while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
493  if (!part->IsTextType() && !part->IsHorizontalLine())
494  continue;
495  int distance = decrease ? border - part->bounding_box().top()
496  : part->bounding_box().bottom() - border;
497  if (distance >= 0)
498  return distance;
499  }
500  return MAX_INT32;
501 }
503  bool decrease) const {
504  ColPartitionGridSearch gsearch(grid);
505  gsearch.SetUniqueMode(true);
506  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
507  ColPartition* part = NULL;
508  while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
509  if (!part->IsTextType() && !part->IsVerticalLine())
510  continue;
511  int distance = decrease ? border - part->bounding_box().right()
512  : part->bounding_box().left() - border;
513  if (distance >= 0)
514  return distance;
515  }
516  return MAX_INT32;
517 }
518 
520  const int kMaxCellHeight = 1000;
521  const int kMaxCellWidth = 1000;
522  STATS height_stats(0, kMaxCellHeight + 1);
523  STATS width_stats(0, kMaxCellWidth + 1);
524 
525  for (int i = 0; i < row_count(); ++i)
526  height_stats.add(row_height(i), column_count());
527  for (int i = 0; i < column_count(); ++i)
528  width_stats.add(column_width(i), row_count());
529 
530  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
531  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
532 }
533 
534 // Looks for grid lines near the current bounding box and
535 // grows the bounding box to include them if no intersections
536 // will occur as a result. This is necessary because the margins
537 // are calculated relative to the closest line/text. If the
538 // line isn't absorbed, the margin will be the distance to the line.
541  gsearch.SetUniqueMode(true);
542 
543  // Is the closest line above good? Loop multiple times for tables with
544  // multi-line (sometimes 2) borders. Limit the number of lines by
545  // making sure they stay within a table cell or so.
546  ColPartition* line = NULL;
548  bounding_box_.top());
549  while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
550  if (!line->IsHorizontalLine())
551  break;
552  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
553  bounding_box_.right(), line->MidY());
554  if (text_search.height() > median_cell_height_ * 2)
555  break;
556  if (CountPartitions(text_search) > 0)
557  break;
558  bounding_box_.set_top(line->MidY());
559  }
560  // As above, is the closest line below good?
561  line = NULL;
564  while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
565  if (!line->IsHorizontalLine())
566  break;
567  TBOX text_search(bounding_box_.left(), line->MidY(),
569  if (text_search.height() > median_cell_height_ * 2)
570  break;
571  if (CountPartitions(text_search) > 0)
572  break;
573  bounding_box_.set_bottom(line->MidY());
574  }
575  // TODO(nbeato): vertical lines
576 }
577 
578 
579 // This function will find all "0 valleys" (of any length) given two
580 // arrays. The arrays are the mins and maxes of partitions (either
581 // left and right or bottom and top). Since the min/max lists are generated
582 // with pairs of increasing integers, we can make some assumptions in
583 // the function about ordering of the overall list, which are shown in the
584 // asserts.
585 // The algorithm works as follows:
586 // While there are numbers to process, take the smallest number.
587 // If it is from the min_list, increment the "hill" counter.
588 // Otherwise, decrement the "hill" counter.
589 // In the process of doing this, keep track of "crossing" the
590 // desired height.
591 // The first/last items are extremal values of the list and known.
592 // NOTE: This function assumes the lists are sorted!
594  const GenericVector<int>& max_list,
595  int max_merged,
596  GenericVector<int>* locations) {
597  locations->clear();
598  ASSERT_HOST(min_list.length() == max_list.length());
599  if (min_list.length() == 0)
600  return;
601  ASSERT_HOST(min_list.get(0) < max_list.get(0));
602  ASSERT_HOST(min_list.get(min_list.length() - 1) <
603  max_list.get(max_list.length() - 1));
604 
605  locations->push_back(min_list.get(0));
606  int min_index = 0;
607  int max_index = 0;
608  int stacked_partitions = 0;
609  int last_cross_position = MAX_INT32;
610  // max_index will expire after min_index.
611  // However, we can't "increase" the hill size if min_index expired.
612  // So finish processing when min_index expires.
613  while (min_index < min_list.length()) {
614  // Increase the hill count.
615  if (min_list[min_index] < max_list[max_index]) {
616  ++stacked_partitions;
617  if (last_cross_position != MAX_INT32 &&
618  stacked_partitions > max_merged) {
619  int mid = (last_cross_position + min_list[min_index]) / 2;
620  locations->push_back(mid);
621  last_cross_position = MAX_INT32;
622  }
623  ++min_index;
624  } else {
625  // Decrease the hill count.
626  --stacked_partitions;
627  if (last_cross_position == MAX_INT32 &&
628  stacked_partitions <= max_merged) {
629  last_cross_position = max_list[max_index];
630  }
631  ++max_index;
632  }
633  }
634  locations->push_back(max_list.get(max_list.length() - 1));
635 }
636 
637 // Counts the number of partitions in the table
638 // box that intersection the given x value.
640  int count = 0;
641  // Make a small box to keep the search time down.
642  const int kGridSize = text_grid_->gridsize();
643  TBOX vertical_box = bounding_box_;
644  vertical_box.set_left(x - kGridSize);
645  vertical_box.set_right(x + kGridSize);
646 
648  gsearch.SetUniqueMode(true);
649  gsearch.StartRectSearch(vertical_box);
650  ColPartition* text = NULL;
651  while ((text = gsearch.NextRectSearch()) != NULL) {
652  if (!text->IsTextType())
653  continue;
654  const TBOX& box = text->bounding_box();
655  if (box.left() < x && x < box.right())
656  ++count;
657  }
658  return count;
659 }
660 
661 // Counts the number of partitions in the table
662 // box that intersection the given y value.
664  int count = 0;
665  // Make a small box to keep the search time down.
666  const int kGridSize = text_grid_->gridsize();
667  TBOX horizontal_box = bounding_box_;
668  horizontal_box.set_bottom(y - kGridSize);
669  horizontal_box.set_top(y + kGridSize);
670 
672  gsearch.SetUniqueMode(true);
673  gsearch.StartRectSearch(horizontal_box);
674  ColPartition* text = NULL;
675  while ((text = gsearch.NextRectSearch()) != NULL) {
676  if (!text->IsTextType())
677  continue;
678 
679  const TBOX& box = text->bounding_box();
680  if (box.bottom() < y && y < box.top())
681  ++count;
682  }
683  return count;
684 }
685 
686 // Counts how many text partitions are in this box.
687 // This is used to count partitons in cells, as that can indicate
688 // how "strong" a potential table row/column (or even full table) actually is.
691  gsearch.SetUniqueMode(true);
692  gsearch.StartRectSearch(box);
693  int count = 0;
694  ColPartition* text = NULL;
695  while ((text = gsearch.NextRectSearch()) != NULL) {
696  if (text->IsTextType())
697  ++count;
698  }
699  return count;
700 }
701 
705 
707  : text_grid_(NULL),
708  line_grid_(NULL),
709  min_height_(0),
710  min_width_(0),
712 }
713 
715 }
716 
718 }
719 
721  text_grid_ = text_grid;
722 }
724  line_grid_ = line_grid;
725 }
727  min_height_ = height;
728 }
730  min_width_ = width;
731 }
733  max_text_height_ = height;
734 }
735 
737  StructuredTable* table = new StructuredTable();
738  table->Init();
739  table->set_text_grid(text_grid_);
740  table->set_line_grid(line_grid_);
742 
743  // Try to solve this simple case, a table with *both*
744  // vertical and horizontal lines.
745  if (RecognizeLinedTable(guess, table))
746  return table;
747 
748  // Fallback to whitespace if that failed.
749  // TODO(nbeato): Break this apart to take advantage of horizontal
750  // lines or vertical lines when present.
751  if (RecognizeWhitespacedTable(guess, table))
752  return table;
753 
754  // No table found...
755  delete table;
756  return NULL;
757 }
758 
760  StructuredTable* table) {
761  if (!HasSignificantLines(guess_box))
762  return false;
763  TBOX line_bound = guess_box;
764  if (!FindLinesBoundingBox(&line_bound))
765  return false;
766  table->set_bounding_box(line_bound);
767  return table->FindLinedStructure();
768 }
769 
770 // Quick implementation. Just count the number of lines in the box.
771 // A better implementation would counter intersections and look for connected
772 // components. It could even go as far as finding similar length lines.
773 // To account for these possible issues, the VerifyLinedTableCells function
774 // will reject lined tables that cause intersections with text on the page.
775 // TODO(nbeato): look for "better" lines
778  box_search.SetUniqueMode(true);
779  box_search.StartRectSearch(guess);
780  ColPartition* line = NULL;
781  int vertical_count = 0;
782  int horizontal_count = 0;
783 
784  while ((line = box_search.NextRectSearch()) != NULL) {
785  if (line->IsHorizontalLine())
786  ++horizontal_count;
787  if (line->IsVerticalLine())
788  ++vertical_count;
789  }
790 
791  return vertical_count >= kLinedTableMinVerticalLines &&
792  horizontal_count >= kLinedTableMinHorizontalLines;
793 }
794 
795 // Given a bounding box with a bunch of horizontal / vertical lines,
796 // we just find the extents of all of these lines iteratively.
797 // The box will be at least as large as guess. This
798 // could possibly be a bad assumption.
799 // It is guaranteed to halt in at least O(n * gridarea) where n
800 // is the number of lines.
801 // The assumption is that growing the box iteratively will add lines
802 // several times, but eventually we'll find the extents.
803 //
804 // For tables, the approach is a bit aggressive, a single line (which could be
805 // noise or a column ruling) can destroy the table inside.
806 //
807 // TODO(nbeato): This is a quick first implementation.
808 // A better implementation would actually look for consistency
809 // in extents of the lines and find the extents using lines
810 // that clearly describe the table. This would allow the
811 // lines to "vote" for height/width. An approach like
812 // this would solve issues with page layout rulings.
813 // I haven't looked for these issues yet, so I can't even
814 // say they happen confidently.
816  // The first iteration will tell us if there are lines
817  // present and shrink the box to a minimal iterative size.
818  if (!FindLinesBoundingBoxIteration(bounding_box))
819  return false;
820 
821  // Keep growing until the area of the table stabilizes.
822  // The box can only get bigger, increasing area.
823  bool changed = true;
824  while (changed) {
825  changed = false;
826  int old_area = bounding_box->area();
827  bool check = FindLinesBoundingBoxIteration(bounding_box);
828  // At this point, the function will return true.
829  ASSERT_HOST(check);
830  ASSERT_HOST(bounding_box->area() >= old_area);
831  changed = (bounding_box->area() > old_area);
832  }
833 
834  return true;
835 }
836 
838  // Search for all of the lines in the current box, keeping track of extents.
840  box_search.SetUniqueMode(true);
841  box_search.StartRectSearch(*bounding_box);
842  ColPartition* line = NULL;
843  bool first_line = true;
844 
845  while ((line = box_search.NextRectSearch()) != NULL) {
846  if (line->IsLineType()) {
847  if (first_line) {
848  // The first iteration can shrink the box.
849  *bounding_box = line->bounding_box();
850  first_line = false;
851  } else {
852  *bounding_box += line->bounding_box();
853  }
854  }
855  }
856  return !first_line;
857 }
858 
859 // The goal of this function is to move the table boundaries around and find
860 // a table that maximizes the whitespace around the table while maximizing
861 // the cellular structure. As a result, it gets confused by headers, footers,
862 // and merged columns (text that crosses columns). There is a tolerance
863 // that allows a few partitions to count towards potential cell merges.
864 // It's the max_merged parameter to FindPartitionLocations.
865 // It can work, but it needs some false positive remove on boundaries.
866 // For now, the grid structure must not intersect any partitions.
867 // Also, small tolerance is added to the horizontal lines for tightly packed
868 // tables. The tolerance is added by adjusting the bounding boxes of the
869 // partitions (in FindHorizontalPartitions). The current implementation
870 // only adjusts the vertical extents of the table.
871 //
872 // Also note. This was hacked at a lot. It could probably use some
873 // more hacking at to find a good set of border conditions and then a
874 // nice clean up.
876  StructuredTable* table) {
877  TBOX best_box = guess_box; // Best borders known.
878  int best_below = 0; // Margin size above best table.
879  int best_above = 0; // Margin size below best table.
880  TBOX adjusted = guess_box; // The search box.
881 
882  // We assume that the guess box is somewhat accurate, so we don't allow
883  // the adjusted border to pass half of the guessed area. This prevents
884  // "negative" tables from forming.
885  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
886  // Keeps track of the most columns in an accepted table. The resulting table
887  // may be less than the max, but we don't want to stray too far.
888  int best_cols = 0;
889  // Make sure we find a good border.
890  bool found_good_border = false;
891 
892  // Find the bottom of the table by trying a few different locations. For
893  // each location, the top, left, and right are fixed. We start the search
894  // in a smaller table to favor best_cols getting a good estimate sooner.
895  int last_bottom = MAX_INT32;
896  int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
897  kMidGuessY - min_height_ / 2, true);
898  int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
899  kMidGuessY + min_height_ / 2, false);
900  adjusted.set_top(top);
901 
902  // Headers/footers can be spaced far from everything.
903  // Make sure that the space below is greater than the space above
904  // the lowest row.
905  int previous_below = 0;
906  const int kMaxChances = 10;
907  int chances = kMaxChances;
908  while (bottom != last_bottom) {
909  adjusted.set_bottom(bottom);
910 
911  if (adjusted.height() >= min_height_) {
912  // Try to fit the grid on the current box. We give it a chance
913  // if the number of columns didn't significantly drop.
914  table->set_bounding_box(adjusted);
915  if (table->FindWhitespacedStructure() &&
916  table->column_count() >= best_cols * kRequiredColumns) {
917  if (false && IsWeakTableRow(table, 0)) {
918  // Currently buggy, but was looking promising so disabled.
919  --chances;
920  } else {
921  // We favor 2 things,
922  // 1- Adding rows that have partitioned data.
923  // 2- Better margins (to find header/footer).
924  // For better tables, we just look for multiple cells in the
925  // bottom row with data in them.
926  // For margins, the space below the last row should
927  // be better than a table with the last row removed.
928  chances = kMaxChances;
929  double max_row_height = kMaxRowSize * table->median_cell_height();
930  if ((table->space_below() * kMarginFactor >= best_below &&
931  table->space_below() >= previous_below) ||
932  (table->CountFilledCellsInRow(0) > 1 &&
933  table->row_height(0) < max_row_height)) {
934  best_box.set_bottom(bottom);
935  best_below = table->space_below();
936  best_cols = MAX(table->column_count(), best_cols);
937  found_good_border = true;
938  }
939  }
940  previous_below = table->space_below();
941  } else {
942  --chances;
943  }
944  }
945  if (chances <= 0)
946  break;
947 
948  last_bottom = bottom;
949  bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
950  last_bottom, true);
951  }
952  if (!found_good_border)
953  return false;
954 
955  // TODO(nbeato) comments: follow modified code above... put it in a function!
956  found_good_border = false;
957  int last_top = MIN_INT32;
958  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
959  kMidGuessY + min_height_ / 2, false);
960  int previous_above = 0;
961  chances = kMaxChances;
962 
963  adjusted.set_bottom(best_box.bottom());
964  while (last_top != top) {
965  adjusted.set_top(top);
966  if (adjusted.height() >= min_height_) {
967  table->set_bounding_box(adjusted);
968  if (table->FindWhitespacedStructure() &&
969  table->column_count() >= best_cols * kRequiredColumns) {
970  int last_row = table->row_count() - 1;
971  if (false && IsWeakTableRow(table, last_row)) {
972  // Currently buggy, but was looking promising so disabled.
973  --chances;
974  } else {
975  chances = kMaxChances;
976  double max_row_height = kMaxRowSize * table->median_cell_height();
977  if ((table->space_above() * kMarginFactor >= best_above &&
978  table->space_above() >= previous_above) ||
979  (table->CountFilledCellsInRow(last_row) > 1 &&
980  table->row_height(last_row) < max_row_height)) {
981  best_box.set_top(top);
982  best_above = table->space_above();
983  best_cols = MAX(table->column_count(), best_cols);
984  found_good_border = true;
985  }
986  }
987  previous_above = table->space_above();
988  } else {
989  --chances;
990  }
991  }
992  if (chances <= 0)
993  break;
994 
995  last_top = top;
996  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
997  last_top, false);
998  }
999 
1000  if (!found_good_border)
1001  return false;
1002 
1003  // If we get here, this shouldn't happen. It can be an assert, but
1004  // I haven't tested it enough to make it crash things.
1005  if (best_box.null_box())
1006  return false;
1007 
1008  // Given the best locations, fit the box to those locations.
1009  table->set_bounding_box(best_box);
1010  return table->FindWhitespacedStructure();
1011 }
1012 
1013 // Finds the closest value to y that can safely cause a horizontal
1014 // split in the partitions.
1015 // This function has been buggy and not as reliable as I would've
1016 // liked. I suggest finding all of the splits using the
1017 // FindPartitionLocations once and then just keeping the results
1018 // of that function cached somewhere.
1019 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
1020  bool top_to_bottom) {
1022  gsearch.SetUniqueMode(true);
1023  gsearch.StartVerticalSearch(left, right, y);
1024  ColPartition* text = NULL;
1025  int last_y = y;
1026  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
1027  if (!text->IsTextType() || !text->IsHorizontalType())
1028  continue;
1029  if (text->bounding_box().height() > max_text_height_)
1030  continue;
1031 
1032  const TBOX& text_box = text->bounding_box();
1033  if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1034  last_y = MIN(last_y, text_box.bottom());
1035  continue;
1036  }
1037  if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1038  last_y = MAX(last_y, text_box.top());
1039  continue;
1040  }
1041 
1042  return last_y;
1043  }
1044  // If none is found, we at least want to preserve the min/max,
1045  // which defines the overlap of y with the last partition in the grid.
1046  return last_y;
1047 }
1048 
1049 // Code is buggy right now. It is disabled in the calling function.
1050 // It seems like sometimes the row that is passed in is not correct
1051 // sometimes (like a phantom row is introduced). There's something going
1052 // on in the cell_y_ data member before this is called... not certain.
1054  if (!table->VerifyRowFilled(row))
1055  return false;
1056 
1057  double threshold = 0.0;
1059  threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
1060  else
1061  threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
1062 
1063  return table->CountFilledCellsInRow(row) < threshold;
1064 }
1065 
1066 } // namespace tesseract
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:759
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:485
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
int CountFilledCellsInRow(int row)
Definition: tablerecog.cpp:227
const double kGoodRowNumberOfColumnsSmall[]
Definition: tablerecog.cpp:54
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:267
const double kRequiredColumns
Definition: tablerecog.cpp:46
int32_t inT32
Definition: host.h:38
const double kHorizontalSpacing
Definition: tablerecog.cpp:33
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:639
inT32 area() const
Definition: rect.h:118
void Brush(Color color)
Definition: scrollview.cpp:732
#define MAX_INT32
Definition: host.h:62
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int push_back(T object)
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:89
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:736
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:723
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:86
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:815
int gridsize() const
Definition: bbgrid.h:64
bool IsHorizontalLine() const
Definition: colpartition.h:453
const double kGoodRowNumberOfColumnsLarge
Definition: tablerecog.cpp:58
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)
bool DoesPartitionFit(const ColPartition &part) const
Definition: tablerecog.cpp:212
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:43
void SetUniqueMode(bool mode)
Definition: bbgrid.h:255
void set_top(int y)
Definition: rect.h:57
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:792
void UpdateWindow()
Definition: scrollview.cpp:710
void set_max_text_height(int height)
Definition: tablerecog.cpp:92
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:42
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:875
const double kMarginFactor
Definition: tablerecog.cpp:48
void set_min_height(int height)
Definition: tablerecog.cpp:726
void set_max_text_height(int height)
Definition: tablerecog.cpp:732
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:290
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
double median() const
Definition: statistc.cpp:239
const TBOX & bounding_box() const
Definition: colpartition.h:109
static bool IsWeakTableRow(StructuredTable *table, int row)
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:750
bool null_box() const
Definition: rect.h:46
const double kMinFilledArea
Definition: tablerecog.cpp:61
T & get(int index) const
int length() const
Definition: genericvector.h:85
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:806
inT16 top() const
Definition: rect.h:54
#define MIN_INT32
Definition: host.h:70
#define MAX(x, y)
Definition: ndminx.h:24
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:720
Definition: rect.h:30
void compact_sorted()
const double kMaxRowSize
Definition: tablerecog.cpp:51
#define MIN(x, y)
Definition: ndminx.h:28
void set_bounding_box(const TBOX &box)
Definition: tablerecog.cpp:107
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:39
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:765
inT16 height() const
Definition: rect.h:104
BBC * NextRectSearch()
Definition: bbgrid.h:846
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:593
int column_width(int column) const
Definition: tablerecog.cpp:123
inT16 right() const
Definition: rect.h:75
int row_height(int row) const
Definition: tablerecog.cpp:119
ColPartitionGrid * line_grid_
Definition: tablerecog.h:368
void set_right(int x)
Definition: rect.h:78
void set_left(int x)
Definition: rect.h:71
Definition: statistc.h:33
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
inT16 bottom() const
Definition: rect.h:61
const double kVerticalSpacing
Definition: tablerecog.cpp:36
const TBOX & bounding_box() const
Definition: tablerecog.cpp:110
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:834
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:40
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void set_bottom(int y)
Definition: rect.h:64
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:689
ColPartitionGrid * text_grid_
Definition: tablerecog.h:367
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:776
bool IsHorizontalType() const
Definition: colpartition.h:439
int count(LIST var_list)
Definition: oldlist.cpp:103
int CountFilledCellsInColumn(int column)
Definition: tablerecog.cpp:230
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:837
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:475
bool IsVerticalLine() const
Definition: colpartition.h:448
void Pen(Color color)
Definition: scrollview.cpp:726
bool VerifyRowFilled(int row)
Definition: tablerecog.cpp:256
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:502
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:663
const int kGoodRowNumberOfColumnsSmallSize
Definition: tablerecog.cpp:55
void set_min_width(int width)
Definition: tablerecog.cpp:729