tesseract  4.00.00dev
tablefind.cpp
Go to the documentation of this file.
1 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 // Created: Tue Jan 06 11:13:01 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 #include "tablefind.h"
29 #include <math.h>
30 
31 #include "allheaders.h"
32 
33 #include "colpartitionset.h"
34 #include "tablerecog.h"
35 
36 namespace tesseract {
37 
38 // These numbers are used to calculate the global median stats.
39 // They just set an upper bound on the stats objects.
40 // Maximum vertical spacing between neighbor partitions.
41 const int kMaxVerticalSpacing = 500;
42 // Maximum width of a blob in a partition.
43 const int kMaxBlobWidth = 500;
44 
45 // Minimum whitespace size to split a partition (measured as a multiple
46 // of a partition's median width).
47 const double kSplitPartitionSize = 2.0;
48 // To insert text, the partition must satisfy these size constraints
49 // in AllowTextPartition(). The idea is to filter noise partitions
50 // determined by the size compared to the global medians.
51 // TODO(nbeato): Need to find good numbers again.
52 const double kAllowTextHeight = 0.5;
53 const double kAllowTextWidth = 0.6;
54 const double kAllowTextArea = 0.8;
55 // The same thing applies to blobs (to filter noise).
56 // TODO(nbeato): These numbers are a shot in the dark...
57 // height and width are 0.5 * gridsize() in colfind.cpp
58 // area is a rough guess for the size of a period.
59 const double kAllowBlobHeight = 0.3;
60 const double kAllowBlobWidth = 0.4;
61 const double kAllowBlobArea = 0.05;
62 
63 // Minimum number of components in a text partition. A partition having fewer
64 // components than that is more likely a data partition and is a candidate
65 // table cell.
66 const int kMinBoxesInTextPartition = 10;
67 
68 // Maximum number of components that a data partition can have
69 const int kMaxBoxesInDataPartition = 20;
70 
71 // Maximum allowed gap in a text partitions as a multiple of its median size.
72 const double kMaxGapInTextPartition = 4.0;
73 
74 // Minimum value that the maximum gap in a text partition should have as a
75 // factor of its median size.
76 const double kMinMaxGapInTextPartition = 0.5;
77 
78 // The amount of overlap that is "normal" for adjacent blobs in a text
79 // partition. This is used to calculate gap between overlapping blobs.
80 const double kMaxBlobOverlapFactor = 4.0;
81 
82 // Maximum x-height a table partition can have as a multiple of global
83 // median x-height
84 const double kMaxTableCellXheight = 2.0;
85 
86 // Maximum line spacing between a table column header and column contents
87 // for merging the two (as a multiple of the partition's median_size).
89 
90 // Minimum ratio of num_table_partitions to num_text_partitions in a column
91 // block to be called it a table column
92 const double kTableColumnThreshold = 3.0;
93 
94 // Search for horizontal ruling lines within the vertical margin as a
95 // multiple of grid size
96 const int kRulingVerticalMargin = 3;
97 
98 // Minimum overlap that a colpartition must have with a table region
99 // to become part of that table
100 const double kMinOverlapWithTable = 0.6;
101 
102 // Maximum side space (distance from column boundary) that a typical
103 // text-line in flowing text should have as a multiple of its x-height
104 // (Median size).
105 const int kSideSpaceMargin = 10;
106 
107 // Fraction of the peak of x-projection of a table region to set the
108 // threshold for the x-projection histogram
109 const double kSmallTableProjectionThreshold = 0.35;
110 const double kLargeTableProjectionThreshold = 0.45;
111 // Minimum number of rows required to look for more rows in the projection.
112 const int kLargeTableRowCount = 6;
113 
114 // Minimum number of rows in a table
115 const int kMinRowsInTable = 3;
116 
117 // The amount of padding (multiplied by global_median_xheight_ during use)
118 // that is vertically added to the search adjacent leader search during
119 // ColPartition marking.
121 
122 // Used when filtering false positives. When finding the last line
123 // of a paragraph (typically left-aligned), the previous line should have
124 // its center to the right of the last line by this scaled amount.
126 
127 // The maximum amount of whitespace allowed left of a paragraph ending.
128 // Do not filter a ColPartition with more than this space left of it.
130 
131 // Used when filtering false positives. The last line of a paragraph
132 // should be preceded by a line that is predominantly text. This is the
133 // ratio of text to whitespace (to the right of the text) that is required
134 // for the previous line to be a text.
136 
137 // When counting table columns, this is the required gap between two columns
138 // (it is multiplied by global_median_xheight_).
139 const double kMaxXProjectionGapFactor = 2.0;
140 
141 // Used for similarity in partitions using stroke width. Values copied
142 // from ColFind.cpp in Ray's CL.
144 const double kStrokeWidthConstantTolerance = 2.0;
145 
146 BOOL_VAR(textord_show_tables, false, "Show table regions");
148  "Debug table marking steps in detail");
150  "Show page stats used in table finding");
152  "Enables the table recognizer for table layout and filtering.");
153 
156 
157 // Templated helper function used to create destructor callbacks for the
158 // BBGrid::ClearGridData() method.
159 template <typename T> void DeleteObject(T *object) {
160  delete object;
161 }
162 
164  : resolution_(0),
165  global_median_xheight_(0),
166  global_median_blob_width_(0),
167  global_median_ledding_(0),
168  left_to_right_language_(true) {
169 }
170 
172  // ColPartitions and ColSegments created by this class for storage in grids
173  // need to be deleted explicitly.
174  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
175  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
176  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
177  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
178  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
179 }
180 
182  left_to_right_language_ = order;
183 }
184 
185 void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
186  const ICOORD& top_right) {
187  // Initialize clean partitions list and grid
188  clean_part_grid_.Init(grid_size, bottom_left, top_right);
189  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
190  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
191  col_seg_grid_.Init(grid_size, bottom_left, top_right);
192  table_grid_.Init(grid_size, bottom_left, top_right);
193 }
194 
195 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
196 // insert leaders and rulers into the leader_and_ruling_grid_
198  TO_BLOCK* block) {
199  // Calculate stats. This lets us filter partitions in AllowTextPartition()
200  // and filter blobs in AllowBlob().
201  SetGlobalSpacings(grid);
202 
203  // Iterate the ColPartitions in the grid.
204  ColPartitionGridSearch gsearch(grid);
205  gsearch.SetUniqueMode(true);
206  gsearch.StartFullSearch();
207  ColPartition* part = NULL;
208  while ((part = gsearch.NextFullSearch()) != NULL) {
209  // Reject partitions with nothing useful inside of them.
210  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
211  continue;
212  ColPartition* clean_part = part->ShallowCopy();
213  ColPartition* leader_part = NULL;
214  if (part->IsLineType()) {
215  InsertRulingPartition(clean_part);
216  continue;
217  }
218  // Insert all non-text partitions to clean_parts
219  if (!part->IsTextType()) {
220  InsertImagePartition(clean_part);
221  continue;
222  }
223  // Insert text colpartitions after removing noisy components from them
224  // The leaders are split into a separate grid.
225  BLOBNBOX_CLIST* part_boxes = part->boxes();
226  BLOBNBOX_C_IT pit(part_boxes);
227  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
228  BLOBNBOX *pblob = pit.data();
229  // Bad blobs... happens in UNLV set.
230  // news.3G1, page 17 (around x=6)
231  if (!AllowBlob(*pblob))
232  continue;
233  if (pblob->flow() == BTFT_LEADER) {
234  if (leader_part == NULL) {
235  leader_part = part->ShallowCopy();
236  leader_part->set_flow(BTFT_LEADER);
237  }
238  leader_part->AddBox(pblob);
239  } else if (pblob->region_type() != BRT_NOISE) {
240  clean_part->AddBox(pblob);
241  }
242  }
243  clean_part->ComputeLimits();
244  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
245  InsertTextPartition(clean_part);
247  if (leader_part != NULL) {
248  // TODO(nbeato): Note that ComputeLimits does not update the column
249  // information. So the leader may appear to span more columns than it
250  // really does later on when IsInSameColumnAs gets called to test
251  // for adjacent leaders.
252  leader_part->ComputeLimits();
253  InsertLeaderPartition(leader_part);
254  }
255  }
256 
257  // Make the partition partners better for upper and lower neighbors.
260 }
261 
262 // High level function to perform table detection
264  ColPartitionSet** all_columns,
265  WidthCallback* width_cb,
266  const FCOORD& reskew) {
267  // initialize spacing, neighbors, and columns
268  InitializePartitions(all_columns);
269 
270 #ifndef GRAPHICS_DISABLED
271  if (textord_show_tables) {
272  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
278 
279  table_win = MakeWindow(100, 300, "Fragmented Text");
281  }
282 #endif // GRAPHICS_DISABLED
283 
284  // mark, filter, and smooth candidate table partitions
286 
287  // Make single-column blocks from good_columns_ partitions. col_segments are
288  // moved to a grid later which takes the ownership
289  ColSegment_LIST column_blocks;
290  GetColumnBlocks(all_columns, &column_blocks);
291  // Set the ratio of candidate table partitions in each column
292  SetColumnsType(&column_blocks);
293 
294  // Move column segments to col_seg_grid_
295  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
296 
297  // Detect split in column layout that might have occurred due to the
298  // presence of a table. In such a case, merge the corresponding columns.
300 
301  // Group horizontally overlapping table partitions into table columns.
302  // table_columns created here get deleted at the end of this method.
303  ColSegment_LIST table_columns;
304  GetTableColumns(&table_columns);
305 
306  // Within each column, mark the range table regions occupy based on the
307  // table columns detected. table_regions are moved to a grid later which
308  // takes the ownership
309  ColSegment_LIST table_regions;
310  GetTableRegions(&table_columns, &table_regions);
311 
312 #ifndef GRAPHICS_DISABLED
314  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
315  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
316  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
317  }
318 #endif // GRAPHICS_DISABLED
319 
320  // Merge table regions across columns for tables spanning multiple
321  // columns
322  MoveColSegmentsToGrid(&table_regions, &table_grid_);
324 
325  // Adjust table boundaries by including nearby horizontal lines and left
326  // out column headers
329 
331  // Remove false alarms consiting of a single column
333 
334 #ifndef GRAPHICS_DISABLED
335  if (textord_show_tables) {
336  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
338  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
339  table_grid_.DisplayBoxes(table_win);
340  }
341 #endif // GRAPHICS_DISABLED
342 
343  // Find table grid structure and reject tables that are malformed.
344  RecognizeTables();
346  RecognizeTables();
347 
348 #ifndef GRAPHICS_DISABLED
349  if (textord_show_tables) {
350  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
353  table_grid_.DisplayBoxes(table_win);
354  }
355 #endif // GRAPHICS_DISABLED
356  } else {
357  // Remove false alarms consiting of a single column
358  // TODO(nbeato): verify this is a NOP after structured table rejection.
359  // Right now it isn't. If the recognize function is doing what it is
360  // supposed to do, this function is obsolete.
362 
363 #ifndef GRAPHICS_DISABLED
364  if (textord_show_tables) {
365  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
368  table_grid_.DisplayBoxes(table_win);
369  }
370 #endif // GRAPHICS_DISABLED
371  }
372 
373  // Merge all colpartitions in table regions to make them a single
374  // colpartition and revert types of isolated table cells not
375  // assigned to any table to their original types.
376  MakeTableBlocks(grid, all_columns, width_cb);
377 }
378 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
379 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
380 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
381 // dimensions instead of duplicated memory.
383  return clean_part_grid_.gridsize();
384 }
386  return clean_part_grid_.gridwidth();
387 }
389  return clean_part_grid_.gridheight();
390 }
391 const ICOORD& TableFinder::bleft() const {
392  return clean_part_grid_.bleft();
393 }
394 const ICOORD& TableFinder::tright() const {
395  return clean_part_grid_.tright();
396 }
397 
399  ASSERT_HOST(part != NULL);
400  if (AllowTextPartition(*part)) {
401  clean_part_grid_.InsertBBox(true, true, part);
402  } else {
403  delete part;
404  }
405 }
407  ASSERT_HOST(part != NULL);
408  if (AllowTextPartition(*part)) {
409  fragmented_text_grid_.InsertBBox(true, true, part);
410  } else {
411  delete part;
412  }
413 }
415  ASSERT_HOST(part != NULL);
416  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
417  leader_and_ruling_grid_.InsertBBox(true, true, part);
418  } else {
419  delete part;
420  }
421 }
423  leader_and_ruling_grid_.InsertBBox(true, true, part);
424 }
426  // NOTE: If images are placed into a different grid in the future,
427  // the function SetPartitionSpacings needs to be updated. It should
428  // be the only thing that cares about image partitions.
429  clean_part_grid_.InsertBBox(true, true, part);
430 }
431 
432 // Splits a partition into its "words". The splits happen
433 // at locations with wide inter-blob spacing. This is useful
434 // because it allows the table recognize to "cut through" the
435 // text lines on the page. The assumption is that a table
436 // will have several lines with similar overlapping whitespace
437 // whereas text will not have this type of property.
438 // Note: The code Assumes that blobs are sorted by the left side x!
439 // This will not work (as well) if the blobs are sorted by center/right.
441  ASSERT_HOST(part != NULL);
442  // Bye bye empty partitions!
443  if (part->boxes()->empty()) {
444  delete part;
445  return;
446  }
447 
448  // The AllowBlob function prevents this.
449  ASSERT_HOST(part->median_width() > 0);
450  const double kThreshold = part->median_width() * kSplitPartitionSize;
451 
452  ColPartition* right_part = part;
453  bool found_split = true;
454  while (found_split) {
455  found_split = false;
456  BLOBNBOX_C_IT box_it(right_part->boxes());
457  // Blobs are sorted left side first. If blobs overlap,
458  // the previous blob may have a "more right" right side.
459  // Account for this by always keeping the largest "right"
460  // so far.
461  int previous_right = MIN_INT32;
462 
463  // Look for the next split in the partition.
464  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
465  const TBOX& box = box_it.data()->bounding_box();
466  if (previous_right != MIN_INT32 &&
467  box.left() - previous_right > kThreshold) {
468  // We have a split position. Split the partition in two pieces.
469  // Insert the left piece in the grid and keep processing the right.
470  int mid_x = (box.left() + previous_right) / 2;
471  ColPartition* left_part = right_part;
472  right_part = left_part->SplitAt(mid_x);
473 
475  found_split = true;
476  break;
477  }
478 
479  // The right side of the previous blobs.
480  previous_right = MAX(previous_right, box.right());
481  }
482  }
483  // When a split is not found, the right part is minimized
484  // as much as possible, so process it.
485  InsertFragmentedTextPartition(right_part);
486 }
487 
488 // Some simple criteria to filter out now. We want to make sure the
489 // average blob size in the partition is consistent with the
490 // global page stats.
491 // The area metric will almost always pass for multi-blob partitions.
492 // It is useful when filtering out noise caused by an isolated blob.
494  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
495  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
496  const int median_area = global_median_xheight_ * global_median_blob_width_;
497  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
498  // Keep comparisons strictly greater to disallow 0!
499  return part.median_size() > kHeightRequired &&
500  part.median_width() > kWidthRequired &&
501  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
502 }
503 
504 // Same as above, applied to blobs. Keep in mind that
505 // leaders, commas, and periods are important in tables.
506 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
507  const TBOX& box = blob.bounding_box();
508  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
509  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
510  const int median_area = global_median_xheight_ * global_median_blob_width_;
511  const double kAreaRequired = median_area * kAllowBlobArea;
512  // Keep comparisons strictly greater to disallow 0!
513  return box.height() > kHeightRequired &&
514  box.width() > kWidthRequired &&
515  box.area() > kAreaRequired;
516 }
517 
518 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
519 // The only downside is that window messages will be caught by
520 // clean_part_grid_ instead of a useful object. This is a temporary solution
521 // for the debug windows created by the TableFinder.
522 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
523  return clean_part_grid_.MakeWindow(x, y, window_name);
524 }
525 
526 // Make single-column blocks from good_columns_ partitions.
528  ColSegment_LIST* column_blocks) {
529  for (int i = 0; i < gridheight(); ++i) {
530  ColPartitionSet* columns = all_columns[i];
531  if (columns != NULL) {
532  ColSegment_LIST new_blocks;
533  // Get boxes from the current vertical position on the grid
534  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
535  // Merge the new_blocks boxes into column_blocks if they are well-aligned
536  GroupColumnBlocks(&new_blocks, column_blocks);
537  }
538  }
539 }
540 
541 // Merge column segments into the current list if they are well aligned.
542 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
543  ColSegment_LIST* column_blocks) {
544  ColSegment_IT src_it(new_blocks);
545  ColSegment_IT dest_it(column_blocks);
546  // iterate through the source list
547  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
548  ColSegment* src_seg = src_it.data();
549  const TBOX& src_box = src_seg->bounding_box();
550  bool match_found = false;
551  // iterate through the destination list to find a matching column block
552  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
553  ColSegment* dest_seg = dest_it.data();
554  TBOX dest_box = dest_seg->bounding_box();
555  if (ConsecutiveBoxes(src_box, dest_box)) {
556  // If matching block is found, insert the current block into it
557  // and delete the soure block
558  dest_seg->InsertBox(src_box);
559  match_found = true;
560  delete src_it.extract();
561  break;
562  }
563  }
564  // If no match is found, just append the source block to column_blocks
565  if (!match_found) {
566  dest_it.add_after_then_move(src_it.extract());
567  }
568  }
569 }
570 
571 // are the two boxes immediate neighbors along the vertical direction
572 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
573  int x_margin = 20;
574  int y_margin = 5;
575  return (abs(b1.left() - b2.left()) < x_margin) &&
576  (abs(b1.right() - b2.right()) < x_margin) &&
577  (abs(b1.top()-b2.bottom()) < y_margin ||
578  abs(b2.top()-b1.bottom()) < y_margin);
579 }
580 
581 // Set up info for clean_part_grid_ partitions to be valid during detection
582 // code.
584  FindNeighbors();
585  SetPartitionSpacings(&clean_part_grid_, all_columns);
587 }
588 
589 // Set left, right and top, bottom spacings of each colpartition.
591  ColPartitionSet** all_columns) {
592  // Iterate the ColPartitions in the grid.
593  ColPartitionGridSearch gsearch(grid);
594  gsearch.StartFullSearch();
595  ColPartition* part = NULL;
596  while ((part = gsearch.NextFullSearch()) != NULL) {
597  ColPartitionSet* columns = all_columns[gsearch.GridY()];
598  TBOX box = part->bounding_box();
599  int y = part->MidY();
600  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
601  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
602  // set distance from left column as space to the left
603  if (left_column) {
604  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
605  part->set_space_to_left(left_space);
606  }
607  // set distance from right column as space to the right
608  if (right_column) {
609  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
610  part->set_space_to_right(right_space);
611  }
612 
613  // Look for images that may be closer.
614  // NOTE: used to be part_grid_, might cause issues now
615  ColPartitionGridSearch hsearch(grid);
616  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
617  ColPartition* neighbor = NULL;
618  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
619  if (neighbor->type() == PT_PULLOUT_IMAGE ||
620  neighbor->type() == PT_FLOWING_IMAGE ||
621  neighbor->type() == PT_HEADING_IMAGE) {
622  int right = neighbor->bounding_box().right();
623  if (right < box.left()) {
624  int space = MIN(box.left() - right, part->space_to_left());
625  part->set_space_to_left(space);
626  }
627  }
628  }
629  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
630  neighbor = NULL;
631  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
632  if (neighbor->type() == PT_PULLOUT_IMAGE ||
633  neighbor->type() == PT_FLOWING_IMAGE ||
634  neighbor->type() == PT_HEADING_IMAGE) {
635  int left = neighbor->bounding_box().left();
636  if (left > box.right()) {
637  int space = MIN(left - box.right(), part->space_to_right());
638  part->set_space_to_right(space);
639  }
640  }
641  }
642 
643  ColPartition* upper_part = part->SingletonPartner(true);
644  if (upper_part) {
645  int space = MAX(0, upper_part->bounding_box().bottom() -
646  part->bounding_box().bottom());
647  part->set_space_above(space);
648  } else {
649  // TODO(nbeato): What constitutes a good value?
650  // 0 is the default value when not set, explicitly noting it needs to
651  // be something else.
652  part->set_space_above(MAX_INT32);
653  }
654 
655  ColPartition* lower_part = part->SingletonPartner(false);
656  if (lower_part) {
657  int space = MAX(0, part->bounding_box().bottom() -
658  lower_part->bounding_box().bottom());
659  part->set_space_below(space);
660  } else {
661  // TODO(nbeato): What constitutes a good value?
662  // 0 is the default value when not set, explicitly noting it needs to
663  // be something else.
664  part->set_space_below(MAX_INT32);
665  }
666  }
667 }
668 
669 // Set spacing and closest neighbors above and below a given colpartition.
671  TBOX box = part->bounding_box();
672  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
673  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
674  box.set_top(top_range);
675  box.set_bottom(bottom_range);
676 
677  TBOX part_box = part->bounding_box();
678  // Start a rect search
680  rectsearch(&clean_part_grid_);
681  rectsearch.StartRectSearch(box);
682  ColPartition* neighbor;
683  int min_space_above = kMaxVerticalSpacing;
684  int min_space_below = kMaxVerticalSpacing;
685  ColPartition* above_neighbor = NULL;
686  ColPartition* below_neighbor = NULL;
687  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
688  if (neighbor == part)
689  continue;
690  TBOX neighbor_box = neighbor->bounding_box();
691  if (neighbor_box.major_x_overlap(part_box)) {
692  int gap = abs(part->median_bottom() - neighbor->median_bottom());
693  // If neighbor is below current partition
694  if (neighbor_box.top() < part_box.bottom() &&
695  gap < min_space_below) {
696  min_space_below = gap;
697  below_neighbor = neighbor;
698  } // If neighbor is above current partition
699  else if (part_box.top() < neighbor_box.bottom() &&
700  gap < min_space_above) {
701  min_space_above = gap;
702  above_neighbor = neighbor;
703  }
704  }
705  }
706  part->set_space_above(min_space_above);
707  part->set_space_below(min_space_below);
708  part->set_nearest_neighbor_above(above_neighbor);
709  part->set_nearest_neighbor_below(below_neighbor);
710 }
711 
712 // Set global spacing and x-height estimates
714  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
715  STATS width_stats(0, kMaxBlobWidth + 1);
716  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
717  // Iterate the ColPartitions in the grid.
718  ColPartitionGridSearch gsearch(grid);
719  gsearch.SetUniqueMode(true);
720  gsearch.StartFullSearch();
721  ColPartition* part = NULL;
722  while ((part = gsearch.NextFullSearch()) != NULL) {
723  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
724  // ComputeLimits needs to get called somewhere outside of TableFinder
725  // to make sure the partitions are properly initialized.
726  // When this is called, SmoothPartitionPartners dies in an assert after
727  // table find runs. Alternative solution.
728  // part->ComputeLimits();
729  if (part->IsTextType()) {
730  // xheight_stats.add(part->median_size(), part->boxes_count());
731  // width_stats.add(part->median_width(), part->boxes_count());
732 
733  // This loop can be removed when above issues are fixed.
734  // Replace it with the 2 lines commented out above.
735  BLOBNBOX_C_IT it(part->boxes());
736  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
737  xheight_stats.add(it.data()->bounding_box().height(), 1);
738  width_stats.add(it.data()->bounding_box().width(), 1);
739  }
740 
741  ledding_stats.add(part->space_above(), 1);
742  ledding_stats.add(part->space_below(), 1);
743  }
744  }
745  // Set estimates based on median of statistics obtained
746  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
747  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
748  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
749  #ifndef GRAPHICS_DISABLED
751  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
752  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
753  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
754  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
755  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
756  }
757  #endif // GRAPHICS_DISABLED
758 }
759 
761  global_median_xheight_ = xheight;
762 }
765 }
767  global_median_ledding_ = ledding;
768 }
769 
772  gsearch.StartFullSearch();
773  ColPartition* part = NULL;
774  while ((part = gsearch.NextFullSearch()) != NULL) {
775  // TODO(nbeato): Rename this function, meaning is different now.
776  // IT is finding nearest neighbors its own way
777  //SetVerticalSpacing(part);
778 
779  ColPartition* upper = part->SingletonPartner(true);
780  if (upper)
781  part->set_nearest_neighbor_above(upper);
782 
783  ColPartition* lower = part->SingletonPartner(false);
784  if (lower)
785  part->set_nearest_neighbor_below(lower);
786  }
787 }
788 
789 // High level interface. Input is an unmarked ColPartitionGrid
790 // (namely, clean_part_grid_). Partitions are identified using local
791 // information and filter/smoothed. The function exit should contain
792 // a good sampling of the table partitions.
796  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
800  }
803  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
807  }
810  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
814  }
817  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
821  }
822 }
823 
824 // These types of partitions are marked as table partitions:
825 // 1- Partitions that have at lease one large gap between words
826 // 2- Partitions that consist of only one word (no significant gap
827 // between components)
828 // 3- Partitions that vertically overlap with other partitions within the
829 // same column.
830 // 4- Partitions with leaders before/after them.
832  // Iterate the ColPartitions in the grid.
834  gsearch(&clean_part_grid_);
835  gsearch.StartFullSearch();
836  ColPartition* part = NULL;
837  while ((part = gsearch.NextFullSearch()) != NULL) {
838  if (!part->IsTextType()) // Only consider text partitions
839  continue;
840  // Only consider partitions in dominant font size or smaller
841  if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
842  continue;
843  // Mark partitions with a large gap, or no significant gap as
844  // table partitions.
845  // Comments: It produces several false alarms at:
846  // - last line of a paragraph (fixed)
847  // - single word section headings
848  // - page headers and footers
849  // - numbered equations
850  // - line drawing regions
851  // TODO(faisal): detect and fix above-mentioned cases
852  if (HasWideOrNoInterWordGap(part) ||
853  HasLeaderAdjacent(*part)) {
854  part->set_table_type();
855  }
856  }
857 }
858 
859 // Check if the partition has at least one large gap between words or no
860 // significant gap at all
862  // Should only get text partitions.
863  ASSERT_HOST(part->IsTextType());
864  // Blob access
865  BLOBNBOX_CLIST* part_boxes = part->boxes();
866  BLOBNBOX_C_IT it(part_boxes);
867  // Check if this is a relatively small partition (such as a single word)
868  if (part->bounding_box().width() <
869  kMinBoxesInTextPartition * part->median_size() &&
870  part_boxes->length() < kMinBoxesInTextPartition)
871  return true;
872 
873  // Variables used to compute inter-blob spacing.
874  int current_x0 = -1;
875  int current_x1 = -1;
876  int previous_x1 = -1;
877  // Stores the maximum gap detected.
878  int largest_partition_gap_found = -1;
879  // Text partition gap limits. If this is text (and not a table),
880  // there should be at least one gap larger than min_gap and no gap
881  // larger than max_gap.
882  const double max_gap = kMaxGapInTextPartition * part->median_size();
883  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
884 
885  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
886  BLOBNBOX* blob = it.data();
887  current_x0 = blob->bounding_box().left();
888  current_x1 = blob->bounding_box().right();
889  if (previous_x1 != -1) {
890  int gap = current_x0 - previous_x1;
891 
892  // TODO(nbeato): Boxes may overlap? Huh?
893  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
894  // on the top right of the page are filtered out with this line.
895  // Note 2: Iterating over blobs in a partition, so we are looking for
896  // spacing between the words.
897  if (gap < 0) {
898  // More likely case, the blobs slightly overlap. This can happen
899  // with diacritics (accents) or broken alphabet symbols (characters).
900  // Merge boxes together by taking max of right sides.
901  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
902  previous_x1 = MAX(previous_x1, current_x1);
903  continue;
904  }
905  // Extreme case, blobs overlap significantly in the same partition...
906  // This should not happen often (if at all), but it does.
907  // TODO(nbeato): investigate cases when this happens.
908  else {
909  // The behavior before was to completely ignore this case.
910  }
911  }
912 
913  // If a large enough gap is found, mark it as a table cell (return true)
914  if (gap > max_gap)
915  return true;
916  if (gap > largest_partition_gap_found)
917  largest_partition_gap_found = gap;
918  }
919  previous_x1 = current_x1;
920  }
921  // Since no large gap was found, return false if the partition is too
922  // long to be a data cell
923  if (part->bounding_box().width() >
924  kMaxBoxesInDataPartition * part->median_size() ||
925  part_boxes->length() > kMaxBoxesInDataPartition)
926  return false;
927 
928  // A partition may be a single blob. In this case, it's an isolated symbol
929  // or non-text (such as a ruling or image).
930  // Detect these as table partitions? Shouldn't this be case by case?
931  // The behavior before was to ignore this, making max_partition_gap < 0
932  // and implicitly return true. Just making it explicit.
933  if (largest_partition_gap_found == -1)
934  return true;
935 
936  // return true if the maximum gap found is smaller than the minimum allowed
937  // max_gap in a text partition. This indicates that there is no significant
938  // space in the partition, hence it is likely a single word.
939  return largest_partition_gap_found < min_gap;
940 }
941 
942 // A criteria for possible tables is that a table may have leaders
943 // between data cells. An aggressive solution to find such tables is to
944 // explicitly mark partitions that have adjacent leaders.
945 // Note that this includes overlapping leaders. However, it does not
946 // include leaders in different columns on the page.
947 // Possible false-positive will include lists, such as a table of contents.
948 // As these arise, the aggressive nature of this search may need to be
949 // trimmed down.
951  if (part.flow() == BTFT_LEADER)
952  return true;
953  // Search range is left and right bounded by an offset of the
954  // median xheight. This offset is to allow some tolerance to the
955  // the leaders on the page in the event that the alignment is still
956  // a bit off.
957  const TBOX& box = part.bounding_box();
958  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
959  const int top = box.top() + search_size;
960  const int bottom = box.bottom() - search_size;
962  for (int direction = 0; direction < 2; ++direction) {
963  bool right_to_left = (direction == 0);
964  int x = right_to_left ? box.right() : box.left();
965  hsearch.StartSideSearch(x, bottom, top);
966  ColPartition* leader = NULL;
967  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
968  // The leader could be a horizontal ruling in the grid.
969  // Make sure it is actually a leader.
970  if (leader->flow() != BTFT_LEADER)
971  continue;
972  // This should not happen, they are in different grids.
973  ASSERT_HOST(&part != leader);
974  // Make sure the leader shares a page column with the partition,
975  // otherwise we are spreading across columns.
976  if (!part.IsInSameColumnAs(*leader))
977  break;
978  // There should be a significant vertical overlap
979  if (!leader->VSignificantCoreOverlap(part))
980  continue;
981  // Leader passed all tests, so it is adjacent.
982  return true;
983  }
984  }
985  // No leaders are adjacent to the given partition.
986  return false;
987 }
988 
989 // Filter individual text partitions marked as table partitions
990 // consisting of paragraph endings, small section headings, and
991 // headers and footers.
995  // TODO(nbeato): Fully justified text as non-table?
996 }
997 
999  // Detect last line of paragraph
1000  // Iterate the ColPartitions in the grid.
1002  gsearch.StartFullSearch();
1003  ColPartition* part = NULL;
1004  while ((part = gsearch.NextFullSearch()) != NULL) {
1005  if (part->type() != PT_TABLE)
1006  continue; // Consider only table partitions
1007 
1008  // Paragraph ending should have flowing text above it.
1009  ColPartition* upper_part = part->nearest_neighbor_above();
1010  if (!upper_part)
1011  continue;
1012  if (upper_part->type() != PT_FLOWING_TEXT)
1013  continue;
1014  if (upper_part->bounding_box().width() <
1015  2 * part->bounding_box().width())
1016  continue;
1017  // Check if its the last line of a paragraph.
1018  // In most cases, a paragraph ending should be left-aligned to text line
1019  // above it. Sometimes, it could be a 2 line paragraph, in which case
1020  // the line above it is indented.
1021  // To account for that, check if the partition center is to
1022  // the left of the one above it.
1023  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1024  int upper_mid = (upper_part->bounding_box().left() +
1025  upper_part->bounding_box().right()) / 2;
1026  int current_spacing = 0; // spacing of the current line to margin
1027  int upper_spacing = 0; // spacing of the previous line to the margin
1029  // Left to right languages, use mid - left to figure out the distance
1030  // the middle is from the left margin.
1031  int left = MIN(part->bounding_box().left(),
1032  upper_part->bounding_box().left());
1033  current_spacing = mid - left;
1034  upper_spacing = upper_mid - left;
1035  } else {
1036  // Right to left languages, use right - mid to figure out the distance
1037  // the middle is from the right margin.
1038  int right = MAX(part->bounding_box().right(),
1039  upper_part->bounding_box().right());
1040  current_spacing = right - mid;
1041  upper_spacing = right - upper_mid;
1042  }
1043  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1044  continue;
1045 
1046  // Paragraphs should have similar fonts.
1047  if (!part->MatchingSizes(*upper_part) ||
1048  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1049  kStrokeWidthConstantTolerance)) {
1050  continue;
1051  }
1052 
1053  // The last line of a paragraph should be left aligned.
1054  // TODO(nbeato): This would be untrue if the text was right aligned.
1055  // How often is that?
1056  if (part->space_to_left() >
1057  kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
1058  continue;
1059  // The line above it should be right aligned (assuming justified format).
1060  // Since we can't assume justified text, we compare whitespace to text.
1061  // The above line should have majority spanning text (or the current
1062  // line could have fit on the previous line). So compare
1063  // whitespace to text.
1064  if (upper_part->bounding_box().width() <
1065  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1066  continue;
1067 
1068  // Ledding above the line should be less than ledding below
1069  if (part->space_above() >= part->space_below() ||
1070  part->space_above() > 2 * global_median_ledding_)
1071  continue;
1072 
1073  // If all checks failed, it is probably text.
1074  part->clear_table_type();
1075  }
1076 }
1077 
1079  // Consider top-most text colpartition as header and bottom most as footer
1080  ColPartition* header = NULL;
1081  ColPartition* footer = NULL;
1082  int max_top = MIN_INT32;
1083  int min_bottom = MAX_INT32;
1085  gsearch.StartFullSearch();
1086  ColPartition* part = NULL;
1087  while ((part = gsearch.NextFullSearch()) != NULL) {
1088  if (!part->IsTextType())
1089  continue; // Consider only text partitions
1090  int top = part->bounding_box().top();
1091  int bottom = part->bounding_box().bottom();
1092  if (top > max_top) {
1093  max_top = top;
1094  header = part;
1095  }
1096  if (bottom < min_bottom) {
1097  min_bottom = bottom;
1098  footer = part;
1099  }
1100  }
1101  if (header)
1102  header->clear_table_type();
1103  if (footer)
1104  footer->clear_table_type();
1105 }
1106 
1107 // Mark all ColPartitions as table cells that have a table cell above
1108 // and below them
1109 // TODO(faisal): This is too aggressive at the moment. The method needs to
1110 // consider spacing and alignment as well. Detection of false alarm table cells
1111 // should also be done as part of it.
1113  // Iterate the ColPartitions in the grid.
1115  gsearch.StartFullSearch();
1116  ColPartition* part = NULL;
1117  while ((part = gsearch.NextFullSearch()) != NULL) {
1118  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1119  continue; // Consider only text partitions
1120  ColPartition* upper_part = part->nearest_neighbor_above();
1121  ColPartition* lower_part = part->nearest_neighbor_below();
1122  if (!upper_part || !lower_part)
1123  continue;
1124  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1125  part->set_table_type();
1126  }
1127 
1128  // Pass 2, do the opposite. If both the upper and lower neighbors
1129  // exist and are not tables, this probably shouldn't be a table.
1130  gsearch.StartFullSearch();
1131  part = NULL;
1132  while ((part = gsearch.NextFullSearch()) != NULL) {
1133  if (part->type() != PT_TABLE)
1134  continue; // Consider only text partitions
1135  ColPartition* upper_part = part->nearest_neighbor_above();
1136  ColPartition* lower_part = part->nearest_neighbor_below();
1137 
1138  // table can't be by itself
1139  if ((upper_part && upper_part->type() != PT_TABLE) &&
1140  (lower_part && lower_part->type() != PT_TABLE)) {
1141  part->clear_table_type();
1142  }
1143  }
1144 }
1145 
1146 // Set the type of a column segment based on the ratio of table to text cells
1147 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
1148  ColSegment_IT it(column_blocks);
1149  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1150  ColSegment* seg = it.data();
1151  TBOX box = seg->bounding_box();
1152  int num_table_cells = 0;
1153  int num_text_cells = 0;
1155  rsearch(&clean_part_grid_);
1156  rsearch.SetUniqueMode(true);
1157  rsearch.StartRectSearch(box);
1158  ColPartition* part = NULL;
1159  while ((part = rsearch.NextRectSearch()) != NULL) {
1160  if (part->type() == PT_TABLE) {
1161  num_table_cells++;
1162  } else if (part->type() == PT_FLOWING_TEXT) {
1163  num_text_cells++;
1164  }
1165  }
1166  // If a column block has no text or table partition in it, it is not needed
1167  // for table detection.
1168  if (!num_table_cells && !num_text_cells) {
1169  delete it.extract();
1170  } else {
1171  seg->set_num_table_cells(num_table_cells);
1172  seg->set_num_text_cells(num_text_cells);
1173  // set column type based on the ratio of table to text cells
1174  seg->set_type();
1175  }
1176  }
1177 }
1178 
1179 // Move column blocks to grid
1180 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1181  ColSegmentGrid *col_seg_grid) {
1182  ColSegment_IT it(segments);
1183  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1184  ColSegment* seg = it.extract();
1185  col_seg_grid->InsertBBox(true, true, seg);
1186  }
1187 }
1188 
1189 // Merge column blocks if a split is detected due to the presence of a
1190 // table. A text block is considered split if it has multiple
1191 // neighboring blocks above/below it, and at least one of the
1192 // neighboring blocks is of table type (has a high density of table
1193 // partitions). In this case neighboring blocks in the direction
1194 // (above/below) of the table block are merged with the text block.
1195 
1196 // Comment: This method does not handle split due to a full page table
1197 // since table columns in this case do not have a text column on which
1198 // split decision can be based.
1200  int margin = gridsize();
1201 
1202  // Iterate the Column Blocks in the grid.
1204  gsearch(&col_seg_grid_);
1205  gsearch.StartFullSearch();
1206  ColSegment* seg;
1207  while ((seg = gsearch.NextFullSearch()) != NULL) {
1208  if (seg->type() != COL_TEXT)
1209  continue; // only consider text blocks for split detection
1210  bool neighbor_found = false;
1211  bool modified = false; // Modified at least once
1212  // keep expanding current box as long as neighboring table columns
1213  // are found above or below it.
1214  do {
1215  TBOX box = seg->bounding_box();
1216  // slightly expand the search region vertically
1217  int top_range = MIN(box.top() + margin, tright().y());
1218  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1219  box.set_top(top_range);
1220  box.set_bottom(bottom_range);
1221  neighbor_found = false;
1223  rectsearch(&col_seg_grid_);
1224  rectsearch.StartRectSearch(box);
1225  ColSegment* neighbor = NULL;
1226  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1227  if (neighbor == seg)
1228  continue;
1229  const TBOX& neighbor_box = neighbor->bounding_box();
1230  // If the neighbor box significantly overlaps with the current
1231  // box (due to the expansion of the current box in the
1232  // previous iteration of this loop), remove the neighbor box
1233  // and expand the current box to include it.
1234  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1235  seg->InsertBox(neighbor_box);
1236  modified = true;
1237  rectsearch.RemoveBBox();
1238  gsearch.RepositionIterator();
1239  delete neighbor;
1240  continue;
1241  }
1242  // Only expand if the neighbor box is of table type
1243  if (neighbor->type() != COL_TABLE)
1244  continue;
1245  // Insert the neighbor box into the current column block
1246  if (neighbor_box.major_x_overlap(box) &&
1247  !box.contains(neighbor_box)) {
1248  seg->InsertBox(neighbor_box);
1249  neighbor_found = true;
1250  modified = true;
1251  rectsearch.RemoveBBox();
1252  gsearch.RepositionIterator();
1253  delete neighbor;
1254  }
1255  }
1256  } while (neighbor_found);
1257  if (modified) {
1258  // Because the box has changed, it has to be removed first.
1259  gsearch.RemoveBBox();
1260  col_seg_grid_.InsertBBox(true, true, seg);
1261  gsearch.RepositionIterator();
1262  }
1263  }
1264 }
1265 
1266 // Group horizontally overlapping table partitions into table columns.
1267 // TODO(faisal): This is too aggressive at the moment. The method should
1268 // consider more attributes to group table partitions together. Some common
1269 // errors are:
1270 // 1- page number is merged with a table column above it even
1271 // if there is a large vertical gap between them.
1272 // 2- column headers go on to catch one of the columns arbitrarily
1273 // 3- an isolated noise blob near page top or bottom merges with the table
1274 // column below/above it
1275 // 4- cells from two vertically adjacent tables merge together to make a
1276 // single column resulting in merging of the two tables
1277 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1278  ColSegment_IT it(table_columns);
1279  // Iterate the ColPartitions in the grid.
1281  gsearch(&clean_part_grid_);
1282  gsearch.StartFullSearch();
1283  ColPartition* part;
1284  while ((part = gsearch.NextFullSearch()) != NULL) {
1285  if (part->inside_table_column() || part->type() != PT_TABLE)
1286  continue; // prevent a partition to be assigned to multiple columns
1287  const TBOX& box = part->bounding_box();
1288  ColSegment* col = new ColSegment();
1289  col->InsertBox(box);
1290  part->set_inside_table_column(true);
1291  // Start a search below the current cell to find bottom neighbours
1292  // Note: a full search will always process things above it first, so
1293  // this should be starting at the highest cell and working its way down.
1295  vsearch(&clean_part_grid_);
1296  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1297  ColPartition* neighbor = NULL;
1298  bool found_neighbours = false;
1299  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1300  // only consider neighbors not assigned to any column yet
1301  if (neighbor->inside_table_column())
1302  continue;
1303  // Horizontal lines should not break the flow
1304  if (neighbor->IsHorizontalLine())
1305  continue;
1306  // presence of a non-table neighbor marks the end of current
1307  // table column
1308  if (neighbor->type() != PT_TABLE)
1309  break;
1310  // add the neighbor partition to the table column
1311  const TBOX& neighbor_box = neighbor->bounding_box();
1312  col->InsertBox(neighbor_box);
1313  neighbor->set_inside_table_column(true);
1314  found_neighbours = true;
1315  }
1316  if (found_neighbours) {
1317  it.add_after_then_move(col);
1318  } else {
1319  part->set_inside_table_column(false);
1320  delete col;
1321  }
1322  }
1323 }
1324 
1325 // Mark regions in a column that are x-bounded by the column boundaries and
1326 // y-bounded by the table columns' projection on the y-axis as table regions
1327 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
1328  ColSegment_LIST* table_regions) {
1329  ColSegment_IT cit(table_columns);
1330  ColSegment_IT rit(table_regions);
1331  // Iterate through column blocks
1333  gsearch(&col_seg_grid_);
1334  gsearch.StartFullSearch();
1335  ColSegment* part;
1336  int page_height = tright().y() - bleft().y();
1337  ASSERT_HOST(page_height > 0);
1338  // create a bool array to hold projection on y-axis
1339  bool* table_region = new bool[page_height];
1340  while ((part = gsearch.NextFullSearch()) != NULL) {
1341  const TBOX& part_box = part->bounding_box();
1342  // reset the projection array
1343  for (int i = 0; i < page_height; i++) {
1344  table_region[i] = false;
1345  }
1346  // iterate through all table columns to find regions in the current
1347  // page column block
1348  cit.move_to_first();
1349  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1350  TBOX col_box = cit.data()->bounding_box();
1351  // find intersection region of table column and page column
1352  TBOX intersection_box = col_box.intersection(part_box);
1353  // project table column on the y-axis
1354  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1355  table_region[i - bleft().y()] = true;
1356  }
1357  }
1358  // set x-limits of table regions to page column width
1359  TBOX current_table_box;
1360  current_table_box.set_left(part_box.left());
1361  current_table_box.set_right(part_box.right());
1362  // go through the y-axis projection to find runs of table
1363  // regions. Each run makes one table region.
1364  for (int i = 1; i < page_height; i++) {
1365  // detect start of a table region
1366  if (!table_region[i - 1] && table_region[i]) {
1367  current_table_box.set_bottom(i + bleft().y());
1368  }
1369  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1370  // detect end of a table region
1371  if (table_region[i - 1] && !table_region[i]) {
1372  current_table_box.set_top(i + bleft().y());
1373  if (!current_table_box.null_box()) {
1374  ColSegment* seg = new ColSegment();
1375  seg->InsertBox(current_table_box);
1376  rit.add_after_then_move(seg);
1377  }
1378  }
1379  }
1380  }
1381  delete[] table_region;
1382 }
1383 
1384 // Merge table regions corresponding to tables spanning multiple columns if
1385 // there is a colpartition (horizontal ruling line or normal text) that
1386 // touches both regions.
1387 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1388 // tables with aligned ruling lines. In this case, line finder returns a
1389 // single line and hence the tables get merged together
1391  // Iterate the table regions in the grid.
1393  gsearch(&table_grid_);
1394  gsearch.StartFullSearch();
1395  ColSegment* seg = NULL;
1396  while ((seg = gsearch.NextFullSearch()) != NULL) {
1397  bool neighbor_found = false;
1398  bool modified = false; // Modified at least once
1399  do {
1400  // Start a rectangle search x-bounded by the image and y by the table
1401  const TBOX& box = seg->bounding_box();
1402  TBOX search_region(box);
1403  search_region.set_left(bleft().x());
1404  search_region.set_right(tright().x());
1405  neighbor_found = false;
1407  rectsearch(&table_grid_);
1408  rectsearch.StartRectSearch(search_region);
1409  ColSegment* neighbor = NULL;
1410  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1411  if (neighbor == seg)
1412  continue;
1413  const TBOX& neighbor_box = neighbor->bounding_box();
1414  // Check if a neighbor box has a large overlap with the table
1415  // region. This may happen as a result of merging two table
1416  // regions in the previous iteration.
1417  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1418  seg->InsertBox(neighbor_box);
1419  rectsearch.RemoveBBox();
1420  gsearch.RepositionIterator();
1421  delete neighbor;
1422  modified = true;
1423  continue;
1424  }
1425  // Check if two table regions belong together based on a common
1426  // horizontal ruling line
1427  if (BelongToOneTable(box, neighbor_box)) {
1428  seg->InsertBox(neighbor_box);
1429  neighbor_found = true;
1430  modified = true;
1431  rectsearch.RemoveBBox();
1432  gsearch.RepositionIterator();
1433  delete neighbor;
1434  }
1435  }
1436  } while (neighbor_found);
1437  if (modified) {
1438  // Because the box has changed, it has to be removed first.
1439  gsearch.RemoveBBox();
1440  table_grid_.InsertBBox(true, true, seg);
1441  gsearch.RepositionIterator();
1442  }
1443  }
1444 }
1445 
1446 // Decide if two table regions belong to one table based on a common
1447 // horizontal ruling line or another colpartition
1448 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1449  // Check the obvious case. Most likely not true because overlapping boxes
1450  // should already be merged, but seems like a good thing to do in case things
1451  // change.
1452  if (box1.overlap(box2))
1453  return true;
1454  // Check for ColPartitions spanning both table regions
1455  TBOX bbox = box1.bounding_union(box2);
1456  // Start a rect search on bbox
1458  rectsearch(&clean_part_grid_);
1459  rectsearch.StartRectSearch(bbox);
1460  ColPartition* part = NULL;
1461  while ((part = rectsearch.NextRectSearch()) != NULL) {
1462  const TBOX& part_box = part->bounding_box();
1463  // return true if a colpartition spanning both table regions is found
1464  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1465  !part->IsImageType())
1466  return true;
1467  }
1468  return false;
1469 }
1470 
1471 // Adjust table boundaries by:
1472 // - building a tight bounding box around all ColPartitions contained in it.
1473 // - expanding table boundaries to include all colpartitions that overlap the
1474 // table by more than half of their area
1475 // - expanding table boundaries to include nearby horizontal rule lines
1476 // - expanding table vertically to include left out column headers
1477 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1478 // makes following errors:
1479 // 1- horizontal lines consisting of underlines are included in the table if
1480 // they are close enough
1481 // 2- horizontal lines originating from noise tend to get merged with a table
1482 // near the top of the page
1483 // 3- the criteria for including horizontal lines is very generous. Many times
1484 // horizontal lines separating headers and footers get merged with a
1485 // single-column table in a multi-column page thereby including text
1486 // from the neighboring column inside the table
1487 // 4- the criteria for including left out column headers also tends to
1488 // occasionally include text-lines above the tables, typically from
1489 // table caption
1491  // Iterate the table regions in the grid
1492  ColSegment_CLIST adjusted_tables;
1493  ColSegment_C_IT it(&adjusted_tables);
1495  gsearch.StartFullSearch();
1496  ColSegment* table = NULL;
1497  while ((table = gsearch.NextFullSearch()) != NULL) {
1498  const TBOX& table_box = table->bounding_box();
1499  TBOX grown_box = table_box;
1500  GrowTableBox(table_box, &grown_box);
1501  // To prevent a table from expanding again, do not insert the
1502  // modified box back to the grid. Instead move it to a list and
1503  // and remove it from the grid. The list is moved later back to the grid.
1504  if (!grown_box.null_box()) {
1505  ColSegment* col = new ColSegment();
1506  col->InsertBox(grown_box);
1507  it.add_after_then_move(col);
1508  }
1509  gsearch.RemoveBBox();
1510  delete table;
1511  }
1512  // clear table grid to move final tables in it
1513  // TODO(nbeato): table_grid_ should already be empty. The above loop
1514  // removed everything. Maybe just assert it is empty?
1515  table_grid_.Clear();
1516  it.move_to_first();
1517  // move back final tables to table_grid_
1518  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1519  ColSegment* seg = it.extract();
1520  table_grid_.InsertBBox(true, true, seg);
1521  }
1522 }
1523 
1524 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
1525  // TODO(nbeato): The growing code is a bit excessive right now.
1526  // By removing these lines, the partitions considered need
1527  // to have some overlap or be special cases. These lines could
1528  // be added again once a check is put in place to make sure that
1529  // growing tables don't stomp on a lot of non-table partitions.
1530 
1531  // search for horizontal ruling lines within the vertical margin
1532  // int vertical_margin = kRulingVerticalMargin * gridsize();
1533  TBOX search_box = table_box;
1534  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1535  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1536  // search_box.set_top(top);
1537  // search_box.set_bottom(bottom);
1538 
1539  GrowTableToIncludePartials(table_box, search_box, result_box);
1540  GrowTableToIncludeLines(table_box, search_box, result_box);
1541  IncludeLeftOutColumnHeaders(result_box);
1542 }
1543 
1544 // Grow a table by increasing the size of the box to include
1545 // partitions with significant overlap with the table.
1547  const TBOX& search_range,
1548  TBOX* result_box) {
1549  // Rulings are in a different grid, so search 2 grids for rulings, text,
1550  // and table partitions that are not entirely within the new box.
1551  for (int i = 0; i < 2; ++i) {
1552  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1554  ColPartitionGridSearch rectsearch(grid);
1555  rectsearch.StartRectSearch(search_range);
1556  ColPartition* part = NULL;
1557  while ((part = rectsearch.NextRectSearch()) != NULL) {
1558  // Only include text and table types.
1559  if (part->IsImageType())
1560  continue;
1561  const TBOX& part_box = part->bounding_box();
1562  // Include partition in the table if more than half of it
1563  // is covered by the table
1564  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1565  *result_box = result_box->bounding_union(part_box);
1566  continue;
1567  }
1568  }
1569  }
1570 }
1571 
1572 // Grow a table by expanding to the extents of significantly
1573 // overlapping lines.
1575  const TBOX& search_range,
1576  TBOX* result_box) {
1578  rsearch.SetUniqueMode(true);
1579  rsearch.StartRectSearch(search_range);
1580  ColPartition* part = NULL;
1581  while ((part = rsearch.NextRectSearch()) != NULL) {
1582  // TODO(nbeato) This should also do vertical, but column
1583  // boundaries are breaking things. This function needs to be
1584  // updated to allow vertical lines as well.
1585  if (!part->IsLineType())
1586  continue;
1587  // Avoid the following function call if the result of the
1588  // function is irrelevant.
1589  const TBOX& part_box = part->bounding_box();
1590  if (result_box->contains(part_box))
1591  continue;
1592  // Include a partially overlapping horizontal line only if the
1593  // extra ColPartitions that will be included due to expansion
1594  // have large side spacing w.r.t. columns containing them.
1595  if (HLineBelongsToTable(*part, table_box))
1596  *result_box = result_box->bounding_union(part_box);
1597  // TODO(nbeato): Vertical
1598  }
1599 }
1600 
1601 // Checks whether the horizontal line belong to the table by looking at the
1602 // side spacing of extra ColParitions that will be included in the table
1603 // due to expansion
1605  const TBOX& table_box) {
1606  if (!part.IsHorizontalLine())
1607  return false;
1608  const TBOX& part_box = part.bounding_box();
1609  if (!part_box.major_x_overlap(table_box))
1610  return false;
1611  // Do not consider top-most horizontal line since it usually
1612  // originates from noise.
1613  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1614  // have neighbors solved.
1615  // if (!part.nearest_neighbor_above())
1616  // return false;
1617  const TBOX bbox = part_box.bounding_union(table_box);
1618  // In the "unioned table" box (the table extents expanded by the line),
1619  // keep track of how many partitions have significant padding to the left
1620  // and right. If more than half of the partitions covered by the new table
1621  // have significant spacing, the line belongs to the table and the table
1622  // grows to include all of the partitions.
1623  int num_extra_partitions = 0;
1624  int extra_space_to_right = 0;
1625  int extra_space_to_left = 0;
1626  // Rulings are in a different grid, so search 2 grids for rulings, text,
1627  // and table partitions that are introduced by the new box.
1628  for (int i = 0; i < 2; ++i) {
1629  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1631  // Start a rect search on bbox
1632  ColPartitionGridSearch rectsearch(grid);
1633  rectsearch.SetUniqueMode(true);
1634  rectsearch.StartRectSearch(bbox);
1635  ColPartition* extra_part = NULL;
1636  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1637  // ColPartition already in table
1638  const TBOX& extra_part_box = extra_part->bounding_box();
1639  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1640  continue;
1641  // Non-text ColPartitions do not contribute
1642  if (extra_part->IsImageType())
1643  continue;
1644  // Consider this partition.
1645  num_extra_partitions++;
1646  // presence of a table cell is a strong hint, so just increment the scores
1647  // without looking at the spacing.
1648  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1649  extra_space_to_right++;
1650  extra_space_to_left++;
1651  continue;
1652  }
1653  int space_threshold = kSideSpaceMargin * part.median_size();
1654  if (extra_part->space_to_right() > space_threshold)
1655  extra_space_to_right++;
1656  if (extra_part->space_to_left() > space_threshold)
1657  extra_space_to_left++;
1658  }
1659  }
1660  // tprintf("%d %d %d\n",
1661  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1662  return (extra_space_to_right > num_extra_partitions / 2) ||
1663  (extra_space_to_left > num_extra_partitions / 2);
1664 }
1665 
1666 // Look for isolated column headers above the given table box and
1667 // include them in the table
1669  // Start a search above the current table to look for column headers
1671  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1672  table_box->top());
1673  ColPartition* neighbor = NULL;
1674  ColPartition* previous_neighbor = NULL;
1675  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1676  // Max distance to find a table heading.
1677  const int max_distance = kMaxColumnHeaderDistance *
1678  neighbor->median_size();
1679  int table_top = table_box->top();
1680  const TBOX& box = neighbor->bounding_box();
1681  // Do not continue if the next box is way above
1682  if (box.bottom() - table_top > max_distance)
1683  break;
1684  // Unconditionally include partitions of type TABLE or LINE
1685  // TODO(faisal): add some reasonable conditions here
1686  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1687  table_box->set_top(box.top());
1688  previous_neighbor = NULL;
1689  continue;
1690  }
1691  // If there are two text partitions, one above the other, without a table
1692  // cell on their left or right side, consider them a barrier and quit
1693  if (previous_neighbor == NULL) {
1694  previous_neighbor = neighbor;
1695  } else {
1696  const TBOX& previous_box = previous_neighbor->bounding_box();
1697  if (!box.major_y_overlap(previous_box))
1698  break;
1699  }
1700  }
1701 }
1702 
1703 // Remove false alarms consiting of a single column based on their
1704 // projection on the x-axis. Projection of a real table on the x-axis
1705 // should have at least one zero-valley larger than the global median
1706 // x-height of the page.
1708  int page_width = tright().x() - bleft().x();
1709  ASSERT_HOST(page_width > 0);
1710  // create an integer array to hold projection on x-axis
1711  int* table_xprojection = new int[page_width];
1712  // Iterate through all tables in the table grid
1714  table_search(&table_grid_);
1715  table_search.StartFullSearch();
1716  ColSegment* table;
1717  while ((table = table_search.NextFullSearch()) != NULL) {
1718  TBOX table_box = table->bounding_box();
1719  // reset the projection array
1720  for (int i = 0; i < page_width; i++) {
1721  table_xprojection[i] = 0;
1722  }
1723  // Start a rect search on table_box
1725  rectsearch(&clean_part_grid_);
1726  rectsearch.SetUniqueMode(true);
1727  rectsearch.StartRectSearch(table_box);
1728  ColPartition* part;
1729  while ((part = rectsearch.NextRectSearch()) != NULL) {
1730  if (!part->IsTextType())
1731  continue; // Do not consider non-text partitions
1732  if (part->flow() == BTFT_LEADER)
1733  continue; // Assume leaders are in tables
1734  TBOX part_box = part->bounding_box();
1735  // Do not consider partitions partially covered by the table
1736  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1737  continue;
1738  BLOBNBOX_CLIST* part_boxes = part->boxes();
1739  BLOBNBOX_C_IT pit(part_boxes);
1740 
1741  // Make sure overlapping blobs don't artificially inflate the number
1742  // of rows in the table. This happens frequently with things such as
1743  // decimals and split characters. Do this by assuming the column
1744  // partition is sorted mostly left to right and just clip
1745  // bounding boxes by the previous box's extent.
1746  int next_position_to_write = 0;
1747 
1748  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1749  BLOBNBOX *pblob = pit.data();
1750  // ignore blob height for the purpose of projection since we
1751  // are only interested in finding valleys
1752  int xstart = pblob->bounding_box().left();
1753  int xend = pblob->bounding_box().right();
1754 
1755  xstart = MAX(xstart, next_position_to_write);
1756  for (int i = xstart; i < xend; i++)
1757  table_xprojection[i - bleft().x()]++;
1758  next_position_to_write = xend;
1759  }
1760  }
1761  // Find largest valley between two reasonable peaks in the table
1762  if (!GapInXProjection(table_xprojection, page_width)) {
1763  table_search.RemoveBBox();
1764  delete table;
1765  }
1766  }
1767  delete[] table_xprojection;
1768 }
1769 
1770 // Return true if at least one gap larger than the global x-height
1771 // exists in the horizontal projection
1772 bool TableFinder::GapInXProjection(int* xprojection, int length) {
1773  // Find peak value of the histogram
1774  int peak_value = 0;
1775  for (int i = 0; i < length; i++) {
1776  if (xprojection[i] > peak_value) {
1777  peak_value = xprojection[i];
1778  }
1779  }
1780  // Peak value represents the maximum number of horizontally
1781  // overlapping colpartitions, so this can be considered as the
1782  // number of rows in the table
1783  if (peak_value < kMinRowsInTable)
1784  return false;
1785  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1786  if (peak_value >= kLargeTableRowCount)
1787  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1788  // Threshold the histogram
1789  for (int i = 0; i < length; i++) {
1790  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1791  }
1792  // Find the largest run of zeros between two ones
1793  int largest_gap = 0;
1794  int run_start = -1;
1795  for (int i = 1; i < length; i++) {
1796  // detect start of a run of zeros
1797  if (xprojection[i - 1] && !xprojection[i]) {
1798  run_start = i;
1799  }
1800  // detect end of a run of zeros and update the value of largest gap
1801  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1802  int gap = i - run_start;
1803  if (gap > largest_gap)
1804  largest_gap = gap;
1805  run_start = -1;
1806  }
1807  }
1808  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1809 }
1810 
1811 // Given the location of a table "guess", try to overlay a cellular
1812 // grid in the location, adjusting the boundaries.
1813 // TODO(nbeato): Falsely introduces:
1814 // -headers/footers (not any worse, too much overlap destroys cells)
1815 // -page numbers (not worse, included because maximize margins)
1816 // -equations (nicely fit into a celluar grid, but more sparsely)
1817 // -figures (random text box, also sparse)
1818 // -small left-aligned text areas with overlapping positioned whitespace
1819 // (rejected before)
1820 // Overall, this just needs some more work.
1822  ScrollView* table_win = NULL;
1823  if (textord_show_tables) {
1824  table_win = MakeWindow(0, 0, "Table Structure");
1827  // table_grid_.DisplayBoxes(table_win);
1828  }
1829 
1830 
1831  TableRecognizer recognizer;
1832  recognizer.Init();
1834  recognizer.set_text_grid(&fragmented_text_grid_);
1835  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1836  recognizer.set_min_height(1.5 * gridheight());
1837  // Loop over all of the tables and try to fit them.
1838  // Store the good tables here.
1839  ColSegment_CLIST good_tables;
1840  ColSegment_C_IT good_it(&good_tables);
1841 
1843  gsearch.StartFullSearch();
1844  ColSegment* found_table = NULL;
1845  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1846  gsearch.RemoveBBox();
1847 
1848  // The goal is to make the tables persistent in a list.
1849  // When that happens, this will move into the search loop.
1850  const TBOX& found_box = found_table->bounding_box();
1851  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1852 
1853  // Process a table. Good tables are inserted into the grid again later on
1854  // We can't change boxes in the grid while it is running a search.
1855  if (table_structure != NULL) {
1856  if (textord_show_tables) {
1857  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1858  }
1859  found_table->set_bounding_box(table_structure->bounding_box());
1860  delete table_structure;
1861  good_it.add_after_then_move(found_table);
1862  } else {
1863  delete found_table;
1864  }
1865  }
1866  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1867 
1868  // At this point, the grid is empty. We can safely insert the good tables
1869  // back into grid.
1870  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1871  table_grid_.InsertBBox(true, true, good_it.extract());
1872 }
1873 
1874 // Displays the column segments in some window.
1876  ColSegment_LIST *segments,
1877  ScrollView::Color color) {
1878 #ifndef GRAPHICS_DISABLED
1879  win->Pen(color);
1880  win->Brush(ScrollView::NONE);
1881  ColSegment_IT it(segments);
1882  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1883  ColSegment* col = it.data();
1884  const TBOX& box = col->bounding_box();
1885  int left_x = box.left();
1886  int right_x = box.right();
1887  int top_y = box.top();
1888  int bottom_y = box.bottom();
1889  win->Rectangle(left_x, bottom_y, right_x, top_y);
1890  }
1891  win->UpdateWindow();
1892 #endif
1893 }
1894 
1896  ScrollView::Color color) {
1897 #ifndef GRAPHICS_DISABLED
1898  // Iterate the ColPartitions in the grid.
1900  gsearch(grid);
1901  gsearch.StartFullSearch();
1902  ColSegment* seg = NULL;
1903  while ((seg = gsearch.NextFullSearch()) != NULL) {
1904  const TBOX& box = seg->bounding_box();
1905  int left_x = box.left();
1906  int right_x = box.right();
1907  int top_y = box.top();
1908  int bottom_y = box.bottom();
1909  win->Brush(ScrollView::NONE);
1910  win->Pen(color);
1911  win->Rectangle(left_x, bottom_y, right_x, top_y);
1912  }
1913  win->UpdateWindow();
1914 #endif
1915 }
1916 
1917 // Displays the colpartitions using a new coloring on an existing window.
1918 // Note: This method is only for debug purpose during development and
1919 // would not be part of checked in code
1921  ColPartitionGrid* grid,
1922  ScrollView::Color default_color,
1923  ScrollView::Color table_color) {
1924 #ifndef GRAPHICS_DISABLED
1925  ScrollView::Color color = default_color;
1926  // Iterate the ColPartitions in the grid.
1928  gsearch(grid);
1929  gsearch.StartFullSearch();
1930  ColPartition* part = NULL;
1931  while ((part = gsearch.NextFullSearch()) != NULL) {
1932  color = default_color;
1933  if (part->type() == PT_TABLE)
1934  color = table_color;
1935 
1936  const TBOX& box = part->bounding_box();
1937  int left_x = box.left();
1938  int right_x = box.right();
1939  int top_y = box.top();
1940  int bottom_y = box.bottom();
1941  win->Brush(ScrollView::NONE);
1942  win->Pen(color);
1943  win->Rectangle(left_x, bottom_y, right_x, top_y);
1944  }
1945  win->UpdateWindow();
1946 #endif
1947 }
1949  ColPartitionGrid* grid,
1950  ScrollView::Color default_color) {
1951  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1952 }
1953 
1955  ScrollView* win,
1956  ColPartitionGrid* grid,
1957  ScrollView::Color color) {
1958 #ifndef GRAPHICS_DISABLED
1959  // Iterate the ColPartitions in the grid.
1961  gsearch(grid);
1962  gsearch.StartFullSearch();
1963  ColPartition* part = NULL;
1964  while ((part = gsearch.NextFullSearch()) != NULL) {
1965  const TBOX& box = part->bounding_box();
1966  int left_x = box.left();
1967  int right_x = box.right();
1968  int top_y = box.top();
1969  int bottom_y = box.bottom();
1970 
1971  ColPartition* upper_part = part->nearest_neighbor_above();
1972  if (upper_part) {
1973  const TBOX& upper_box = upper_part->bounding_box();
1974  int mid_x = (left_x + right_x) / 2;
1975  int mid_y = (top_y + bottom_y) / 2;
1976  int other_x = (upper_box.left() + upper_box.right()) / 2;
1977  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1978  win->Brush(ScrollView::NONE);
1979  win->Pen(color);
1980  win->Line(mid_x, mid_y, other_x, other_y);
1981  }
1982  ColPartition* lower_part = part->nearest_neighbor_below();
1983  if (lower_part) {
1984  const TBOX& lower_box = lower_part->bounding_box();
1985  int mid_x = (left_x + right_x) / 2;
1986  int mid_y = (top_y + bottom_y) / 2;
1987  int other_x = (lower_box.left() + lower_box.right()) / 2;
1988  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1989  win->Brush(ScrollView::NONE);
1990  win->Pen(color);
1991  win->Line(mid_x, mid_y, other_x, other_y);
1992  }
1993  }
1994  win->UpdateWindow();
1995 #endif
1996 }
1997 
1998 // Merge all colpartitions in table regions to make them a single
1999 // colpartition and revert types of isolated table cells not
2000 // assigned to any table to their original types.
2002  ColPartitionSet** all_columns,
2003  WidthCallback* width_cb) {
2004  // Since we have table blocks already, remove table tags from all
2005  // colpartitions
2007  gsearch(grid);
2008  gsearch.StartFullSearch();
2009  ColPartition* part = NULL;
2010 
2011  while ((part = gsearch.NextFullSearch()) != NULL) {
2012  if (part->type() == PT_TABLE) {
2013  part->clear_table_type();
2014  }
2015  }
2016  // Now make a single colpartition out of each table block and remove
2017  // all colpartitions contained within a table
2019  table_search(&table_grid_);
2020  table_search.StartFullSearch();
2021  ColSegment* table;
2022  while ((table = table_search.NextFullSearch()) != NULL) {
2023  const TBOX& table_box = table->bounding_box();
2024  // Start a rect search on table_box
2026  rectsearch(grid);
2027  rectsearch.StartRectSearch(table_box);
2028  ColPartition* part;
2029  ColPartition* table_partition = NULL;
2030  while ((part = rectsearch.NextRectSearch()) != NULL) {
2031  // Do not consider image partitions
2032  if (!part->IsTextType())
2033  continue;
2034  TBOX part_box = part->bounding_box();
2035  // Include partition in the table if more than half of it
2036  // is covered by the table
2037  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2038  rectsearch.RemoveBBox();
2039  if (table_partition) {
2040  table_partition->Absorb(part, width_cb);
2041  } else {
2042  table_partition = part;
2043  }
2044  }
2045  }
2046  // Insert table colpartition back to part_grid_
2047  if (table_partition) {
2048  // To match the columns used when transforming to blocks, the new table
2049  // partition must have its first and last column set at the grid y that
2050  // corresponds to its bottom.
2051  const TBOX& table_box = table_partition->bounding_box();
2052  int grid_x, grid_y;
2053  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2054  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2055  table_partition->set_table_type();
2056  table_partition->set_blob_type(BRT_TEXT);
2057  table_partition->set_flow(BTFT_CHAIN);
2058  table_partition->SetBlobTypes();
2059  grid->InsertBBox(true, true, table_partition);
2060  }
2061  }
2062 }
2063 
2067  : ELIST_LINK(),
2068  num_table_cells_(0),
2069  num_text_cells_(0),
2070  type_(COL_UNKNOWN) {
2071 }
2073 }
2074 
2075 // Provides a color for BBGrid to draw the rectangle.
2077  const ScrollView::Color kBoxColors[PT_COUNT] = {
2082  };
2083  return kBoxColors[type_];
2084 }
2085 
2086 // Insert a box into this column segment
2087 void ColSegment::InsertBox(const TBOX& other) {
2088  bounding_box_ = bounding_box_.bounding_union(other);
2089 }
2090 
2091 // Set column segment type based on the ratio of text and table partitions
2092 // in it.
2094  if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
2095  type_ = COL_TABLE;
2096  else if (num_text_cells_ > num_table_cells_)
2097  type_ = COL_TEXT;
2098  else
2099  type_ = COL_MIXED;
2100 }
2101 
2102 } // namespace tesseract.
ColPartition * ColumnContaining(int x, int y)
const double kTableColumnThreshold
Definition: tablefind.cpp:92
const int kSideSpaceMargin
Definition: tablefind.cpp:105
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
const double kMaxGapInTextPartition
Definition: tablefind.cpp:72
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1327
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:88
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1668
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void set_space_below(int space)
Definition: colpartition.h:270
ColPartition * CopyButDontOwnBlobs()
ColPartition * nearest_neighbor_below() const
Definition: colpartition.h:255
bool overlap(const TBOX &box) const
Definition: rect.h:345
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1180
int GridY() const
Definition: bbgrid.h:247
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:493
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:617
Definition: points.h:189
const double kAllowBlobWidth
Definition: tablefind.cpp:60
void set_nearest_neighbor_below(ColPartition *part)
Definition: colpartition.h:258
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:387
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:763
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1772
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1954
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:135
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:80
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:713
inT32 area() const
Definition: rect.h:118
void Brush(Color color)
Definition: scrollview.cpp:732
const ICOORD & bleft() const
Definition: tablefind.cpp:391
#define MAX_INT32
Definition: host.h:62
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
Definition: tablefind.cpp:263
int LeftAtY(int y) const
Definition: colpartition.h:340
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:585
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:448
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:593
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:527
const ICOORD & bleft() const
Definition: bbgrid.h:73
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:110
inT16 x() const
access function
Definition: points.h:52
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:736
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1604
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:723
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
const double kAllowBlobHeight
Definition: tablefind.cpp:59
int gridsize() const
Definition: bbgrid.h:64
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:583
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:139
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:950
bool textord_show_tables
Definition: tablefind.cpp:146
int gridheight() const
Definition: bbgrid.h:70
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
int gridheight() const
Definition: tablefind.cpp:388
const int kRulingVerticalMargin
Definition: tablefind.cpp:96
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void Clear()
Definition: bbgrid.h:459
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:109
Definition: capi.h:94
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
bool IsHorizontalLine() const
Definition: colpartition.h:453
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:120
int space_to_right() const
Definition: colpartition.h:279
void AddBox(BLOBNBOX *box)
int gridwidth() const
Definition: bbgrid.h:67
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:66
int RightAtY(int y) const
Definition: colpartition.h:344
const double kAllowTextArea
Definition: tablefind.cpp:54
BlobRegionType region_type() const
Definition: blobbox.h:268
const int kMinRowsInTable
Definition: tablefind.cpp:115
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:398
void RepositionIterator()
Definition: bbgrid.h:896
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
#define ASSERT_HOST(x)
Definition: errcode.h:84
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:542
inT16 left() const
Definition: rect.h:68
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:422
void set_space_above(int space)
Definition: colpartition.h:264
ColPartition * nearest_neighbor_above() const
Definition: colpartition.h:249
void SetUniqueMode(bool mode)
Definition: bbgrid.h:255
void set_top(int y)
Definition: rect.h:57
const int kMaxBlobWidth
Definition: tablefind.cpp:43
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:792
void UpdateWindow()
Definition: scrollview.cpp:710
void SetPartitionType(int resolution, ColPartitionSet *columns)
bool textord_tablefind_show_stats
Definition: tablefind.cpp:150
BBC * NextFullSearch()
Definition: bbgrid.h:679
bool IsImageType() const
Definition: colpartition.h:423
const double kAllowBlobArea
Definition: tablefind.cpp:61
ColPartition * SplitAt(int split_x)
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1920
const double kSplitPartitionSize
Definition: tablefind.cpp:47
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:506
void set_min_height(int height)
Definition: tablerecog.cpp:726
Definition: capi.h:95
ColPartition * ShallowCopy() const
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void set_max_text_height(int height)
Definition: tablerecog.cpp:732
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:290
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
const double kAllowTextHeight
Definition: tablefind.cpp:52
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
double median() const
Definition: statistc.cpp:239
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:69
const TBOX & bounding_box() const
Definition: colpartition.h:109
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:144
inT16 y() const
access_function
Definition: points.h:56
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1448
const ICOORD & tright() const
Definition: bbgrid.h:76
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:831
void DeleteObject(T *object)
Definition: tablefind.cpp:159
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
Definition: tablefind.cpp:197
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1277
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:750
ScrollView::Color BoxColor() const
Definition: tablefind.cpp:2076
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1546
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
const TBOX & bounding_box() const
Definition: tablefind.h:52
bool null_box() const
Definition: rect.h:46
bool contains(const FCOORD pt) const
Definition: rect.h:323
void set_space_to_right(int space)
Definition: colpartition.h:282
PolyBlockType type() const
Definition: colpartition.h:181
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:2001
void RefinePartitionPartners(bool get_desperate)
void InsertBox(const TBOX &other)
Definition: tablefind.cpp:2087
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:125
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:806
inT16 top() const
Definition: rect.h:54
#define MIN_INT32
Definition: host.h:70
#define MAX(x, y)
Definition: ndminx.h:24
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:522
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:720
void set_space_to_left(int space)
Definition: colpartition.h:276
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:440
Definition: rect.h:30
void set_num_text_cells(int n)
Definition: tablefind.h:90
#define MIN(x, y)
Definition: ndminx.h:28
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:572
const double kMaxTableCellXheight
Definition: tablefind.cpp:84
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:765
inT16 height() const
Definition: rect.h:104
const double kAllowTextWidth
Definition: tablefind.cpp:53
BBC * NextRectSearch()
Definition: bbgrid.h:846
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
Definition: tablefind.cpp:185
inT16 right() const
Definition: rect.h:75
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:143
bool IsInSameColumnAs(const ColPartition &part) const
ColPartition * SingletonPartner(bool upper)
const int kLargeTableRowCount
Definition: tablefind.cpp:112
inT16 width() const
Definition: rect.h:111
void set_right(int x)
Definition: rect.h:78
void set_left(int x)
Definition: rect.h:71
Definition: statistc.h:33
bool textord_tablefind_show_mark
Definition: tablefind.cpp:148
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:766
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1875
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:590
void StartFullSearch()
Definition: bbgrid.h:669
void set_num_table_cells(int n)
Definition: tablefind.h:81
void Absorb(ColPartition *other, WidthCallback *cb)
inT16 bottom() const
Definition: rect.h:61
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
bool MatchingSizes(const ColPartition &other) const
const TBOX & bounding_box() const
Definition: tablerecog.cpp:110
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1524
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:760
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:834
ColSegType type() const
Definition: tablefind.h:94
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1147
void set_bounding_box(const TBOX &other)
Definition: tablefind.h:72
void set_bottom(int y)
Definition: rect.h:64
void set_left_to_right_language(bool order)
Definition: tablefind.cpp:181
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:76
void DisplayColSegmentGrid(ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
Definition: tablefind.cpp:1895
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:414
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:129
ELISTIZE(AmbigSpec)
void set_inside_table_column(bool val)
Definition: colpartition.h:246
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:406
void SetVerticalSpacing(ColPartition *part)
Definition: tablefind.cpp:670
const ICOORD & tright() const
Definition: tablefind.cpp:394
void set_nearest_neighbor_above(ColPartition *part)
Definition: colpartition.h:252
const TBOX & bounding_box() const
Definition: blobbox.h:215
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:425
#define CLISTIZE(CLASSNAME)
Definition: clst.h:913
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:152
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
BlobTextFlowType flow() const
Definition: blobbox.h:280
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1574
BlobTextFlowType flow() const
Definition: colpartition.h:154
BlobRegionType blob_type() const
Definition: colpartition.h:148
void Pen(Color color)
Definition: scrollview.cpp:726
integer coordinate
Definition: points.h:30
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:861
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:468
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:151