tesseract  4.00.00dev
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORD & bleft () const
 
const ICOORD & tright () const
 
ScrollView * MakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

Definition at line 163 of file tablefind.cpp.

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 171 of file tablefind.cpp.

171  {
172  // ColPartitions and ColSegments created by this class for storage in grids
173  // need to be deleted explicitly.
174  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
175  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
176  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
177  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
178  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
179 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:468

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1490 of file tablefind.cpp.

1490  {
1491  // Iterate the table regions in the grid
1492  ColSegment_CLIST adjusted_tables;
1493  ColSegment_C_IT it(&adjusted_tables);
1494  ColSegmentGridSearch gsearch(&table_grid_);
1495  gsearch.StartFullSearch();
1496  ColSegment* table = NULL;
1497  while ((table = gsearch.NextFullSearch()) != NULL) {
1498  const TBOX& table_box = table->bounding_box();
1499  TBOX grown_box = table_box;
1500  GrowTableBox(table_box, &grown_box);
1501  // To prevent a table from expanding again, do not insert the
1502  // modified box back to the grid. Instead move it to a list and
1503  // and remove it from the grid. The list is moved later back to the grid.
1504  if (!grown_box.null_box()) {
1505  ColSegment* col = new ColSegment();
1506  col->InsertBox(grown_box);
1507  it.add_after_then_move(col);
1508  }
1509  gsearch.RemoveBBox();
1510  delete table;
1511  }
1512  // clear table grid to move final tables in it
1513  // TODO(nbeato): table_grid_ should already be empty. The above loop
1514  // removed everything. Maybe just assert it is empty?
1515  table_grid_.Clear();
1516  it.move_to_first();
1517  // move back final tables to table_grid_
1518  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1519  ColSegment* seg = it.extract();
1520  table_grid_.InsertBBox(true, true, seg);
1521  }
1522 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void Clear()
Definition: bbgrid.h:459
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
bool null_box() const
Definition: rect.h:46
Definition: rect.h:30
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1524

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX & blob) const
protected

Definition at line 506 of file tablefind.cpp.

506  {
507  const TBOX& box = blob.bounding_box();
508  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
509  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
510  const int median_area = global_median_xheight_ * global_median_blob_width_;
511  const double kAreaRequired = median_area * kAllowBlobArea;
512  // Keep comparisons strictly greater to disallow 0!
513  return box.height() > kHeightRequired &&
514  box.width() > kWidthRequired &&
515  box.area() > kAreaRequired;
516 }
const double kAllowBlobWidth
Definition: tablefind.cpp:60
inT32 area() const
Definition: rect.h:118
const double kAllowBlobHeight
Definition: tablefind.cpp:59
const double kAllowBlobArea
Definition: tablefind.cpp:61
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
const TBOX & bounding_box() const
Definition: blobbox.h:215

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition & part) const
protected

Definition at line 493 of file tablefind.cpp.

493  {
494  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
495  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
496  const int median_area = global_median_xheight_ * global_median_blob_width_;
497  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
498  // Keep comparisons strictly greater to disallow 0!
499  return part.median_size() > kHeightRequired &&
500  part.median_width() > kWidthRequired &&
501  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
502 }
const double kAllowTextArea
Definition: tablefind.cpp:54
const double kAllowTextHeight
Definition: tablefind.cpp:52
const double kAllowTextWidth
Definition: tablefind.cpp:53

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX & box1,
const TBOX & box2 
)
protected

Definition at line 1448 of file tablefind.cpp.

1448  {
1449  // Check the obvious case. Most likely not true because overlapping boxes
1450  // should already be merged, but seems like a good thing to do in case things
1451  // change.
1452  if (box1.overlap(box2))
1453  return true;
1454  // Check for ColPartitions spanning both table regions
1455  TBOX bbox = box1.bounding_union(box2);
1456  // Start a rect search on bbox
1457  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1458  rectsearch(&clean_part_grid_);
1459  rectsearch.StartRectSearch(bbox);
1460  ColPartition* part = NULL;
1461  while ((part = rectsearch.NextRectSearch()) != NULL) {
1462  const TBOX& part_box = part->bounding_box();
1463  // return true if a colpartition spanning both table regions is found
1464  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1465  !part->IsImageType())
1466  return true;
1467  }
1468  return false;
1469 }
bool overlap(const TBOX &box) const
Definition: rect.h:345
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
Definition: rect.h:30
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 391 of file tablefind.cpp.

391  {
392  return clean_part_grid_.bleft();
393 }
const ICOORD & bleft() const
Definition: bbgrid.h:73
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX & b1,
const TBOX & b2 
)
protected

Definition at line 572 of file tablefind.cpp.

572  {
573  int x_margin = 20;
574  int y_margin = 5;
575  return (abs(b1.left() - b2.left()) < x_margin) &&
576  (abs(b1.right() - b2.right()) < x_margin) &&
577  (abs(b1.top()-b2.bottom()) < y_margin ||
578  abs(b2.top()-b1.bottom()) < y_margin);
579 }
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1707 of file tablefind.cpp.

1707  {
1708  int page_width = tright().x() - bleft().x();
1709  ASSERT_HOST(page_width > 0);
1710  // create an integer array to hold projection on x-axis
1711  int* table_xprojection = new int[page_width];
1712  // Iterate through all tables in the table grid
1713  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1714  table_search(&table_grid_);
1715  table_search.StartFullSearch();
1716  ColSegment* table;
1717  while ((table = table_search.NextFullSearch()) != NULL) {
1718  TBOX table_box = table->bounding_box();
1719  // reset the projection array
1720  for (int i = 0; i < page_width; i++) {
1721  table_xprojection[i] = 0;
1722  }
1723  // Start a rect search on table_box
1724  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1725  rectsearch(&clean_part_grid_);
1726  rectsearch.SetUniqueMode(true);
1727  rectsearch.StartRectSearch(table_box);
1728  ColPartition* part;
1729  while ((part = rectsearch.NextRectSearch()) != NULL) {
1730  if (!part->IsTextType())
1731  continue; // Do not consider non-text partitions
1732  if (part->flow() == BTFT_LEADER)
1733  continue; // Assume leaders are in tables
1734  TBOX part_box = part->bounding_box();
1735  // Do not consider partitions partially covered by the table
1736  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1737  continue;
1738  BLOBNBOX_CLIST* part_boxes = part->boxes();
1739  BLOBNBOX_C_IT pit(part_boxes);
1740 
1741  // Make sure overlapping blobs don't artificially inflate the number
1742  // of rows in the table. This happens frequently with things such as
1743  // decimals and split characters. Do this by assuming the column
1744  // partition is sorted mostly left to right and just clip
1745  // bounding boxes by the previous box's extent.
1746  int next_position_to_write = 0;
1747 
1748  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1749  BLOBNBOX *pblob = pit.data();
1750  // ignore blob height for the purpose of projection since we
1751  // are only interested in finding valleys
1752  int xstart = pblob->bounding_box().left();
1753  int xend = pblob->bounding_box().right();
1754 
1755  xstart = MAX(xstart, next_position_to_write);
1756  for (int i = xstart; i < xend; i++)
1757  table_xprojection[i - bleft().x()]++;
1758  next_position_to_write = xend;
1759  }
1760  }
1761  // Find largest valley between two reasonable peaks in the table
1762  if (!GapInXProjection(table_xprojection, page_width)) {
1763  table_search.RemoveBBox();
1764  delete table;
1765  }
1766  }
1767  delete[] table_xprojection;
1768 }
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1772
ColSegmentGrid table_grid_
Definition: tablefind.h:423
const ICOORD & bleft() const
Definition: tablefind.cpp:391
inT16 x() const
access function
Definition: points.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
const ICOORD & tright() const
Definition: tablefind.cpp:394
const TBOX & bounding_box() const
Definition: blobbox.h:215

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1954 of file tablefind.cpp.

1957  {
1958 #ifndef GRAPHICS_DISABLED
1959  // Iterate the ColPartitions in the grid.
1960  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1961  gsearch(grid);
1962  gsearch.StartFullSearch();
1963  ColPartition* part = NULL;
1964  while ((part = gsearch.NextFullSearch()) != NULL) {
1965  const TBOX& box = part->bounding_box();
1966  int left_x = box.left();
1967  int right_x = box.right();
1968  int top_y = box.top();
1969  int bottom_y = box.bottom();
1970 
1971  ColPartition* upper_part = part->nearest_neighbor_above();
1972  if (upper_part) {
1973  const TBOX& upper_box = upper_part->bounding_box();
1974  int mid_x = (left_x + right_x) / 2;
1975  int mid_y = (top_y + bottom_y) / 2;
1976  int other_x = (upper_box.left() + upper_box.right()) / 2;
1977  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1978  win->Brush(ScrollView::NONE);
1979  win->Pen(color);
1980  win->Line(mid_x, mid_y, other_x, other_y);
1981  }
1982  ColPartition* lower_part = part->nearest_neighbor_below();
1983  if (lower_part) {
1984  const TBOX& lower_box = lower_part->bounding_box();
1985  int mid_x = (left_x + right_x) / 2;
1986  int mid_y = (top_y + bottom_y) / 2;
1987  int other_x = (lower_box.left() + lower_box.right()) / 2;
1988  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1989  win->Brush(ScrollView::NONE);
1990  win->Pen(color);
1991  win->Line(mid_x, mid_y, other_x, other_y);
1992  }
1993  }
1994  win->UpdateWindow();
1995 #endif
1996 }
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 left() const
Definition: rect.h:68
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
void Pen(Color color)
Definition: scrollview.cpp:726

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1920 of file tablefind.cpp.

1923  {
1924 #ifndef GRAPHICS_DISABLED
1925  ScrollView::Color color = default_color;
1926  // Iterate the ColPartitions in the grid.
1927  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1928  gsearch(grid);
1929  gsearch.StartFullSearch();
1930  ColPartition* part = NULL;
1931  while ((part = gsearch.NextFullSearch()) != NULL) {
1932  color = default_color;
1933  if (part->type() == PT_TABLE)
1934  color = table_color;
1935 
1936  const TBOX& box = part->bounding_box();
1937  int left_x = box.left();
1938  int right_x = box.right();
1939  int top_y = box.top();
1940  int bottom_y = box.bottom();
1941  win->Brush(ScrollView::NONE);
1942  win->Pen(color);
1943  win->Rectangle(left_x, bottom_y, right_x, top_y);
1944  }
1945  win->UpdateWindow();
1946 #endif
1947 }
void Brush(Color color)
Definition: scrollview.cpp:732
Definition: capi.h:94
inT16 left() const
Definition: rect.h:68
void UpdateWindow()
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
void Pen(Color color)
Definition: scrollview.cpp:726

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1948 of file tablefind.cpp.

1950  {
1951  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1952 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1920

◆ DisplayColSegmentGrid()

void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView * win,
ColSegmentGrid * grid,
ScrollView::Color  color 
)
protected

Definition at line 1895 of file tablefind.cpp.

1896  {
1897 #ifndef GRAPHICS_DISABLED
1898  // Iterate the ColPartitions in the grid.
1899  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1900  gsearch(grid);
1901  gsearch.StartFullSearch();
1902  ColSegment* seg = NULL;
1903  while ((seg = gsearch.NextFullSearch()) != NULL) {
1904  const TBOX& box = seg->bounding_box();
1905  int left_x = box.left();
1906  int right_x = box.right();
1907  int top_y = box.top();
1908  int bottom_y = box.bottom();
1909  win->Brush(ScrollView::NONE);
1910  win->Pen(color);
1911  win->Rectangle(left_x, bottom_y, right_x, top_y);
1912  }
1913  win->UpdateWindow();
1914 #endif
1915 }
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 left() const
Definition: rect.h:68
void UpdateWindow()
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
void Pen(Color color)
Definition: scrollview.cpp:726

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView * win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1875 of file tablefind.cpp.

1877  {
1878 #ifndef GRAPHICS_DISABLED
1879  win->Pen(color);
1880  win->Brush(ScrollView::NONE);
1881  ColSegment_IT it(segments);
1882  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1883  ColSegment* col = it.data();
1884  const TBOX& box = col->bounding_box();
1885  int left_x = box.left();
1886  int right_x = box.right();
1887  int top_y = box.top();
1888  int bottom_y = box.bottom();
1889  win->Rectangle(left_x, bottom_y, right_x, top_y);
1890  }
1891  win->UpdateWindow();
1892 #endif
1893 }
void Brush(Color color)
Definition: scrollview.cpp:732
inT16 left() const
Definition: rect.h:68
void UpdateWindow()
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
void Pen(Color color)
Definition: scrollview.cpp:726

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 992 of file tablefind.cpp.

992  {
993  FilterParagraphEndings();
994  FilterHeaderAndFooter();
995  // TODO(nbeato): Fully justified text as non-table?
996 }

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1078 of file tablefind.cpp.

1078  {
1079  // Consider top-most text colpartition as header and bottom most as footer
1080  ColPartition* header = NULL;
1081  ColPartition* footer = NULL;
1082  int max_top = MIN_INT32;
1083  int min_bottom = MAX_INT32;
1084  ColPartitionGridSearch gsearch(&clean_part_grid_);
1085  gsearch.StartFullSearch();
1086  ColPartition* part = NULL;
1087  while ((part = gsearch.NextFullSearch()) != NULL) {
1088  if (!part->IsTextType())
1089  continue; // Consider only text partitions
1090  int top = part->bounding_box().top();
1091  int bottom = part->bounding_box().bottom();
1092  if (top > max_top) {
1093  max_top = top;
1094  header = part;
1095  }
1096  if (bottom < min_bottom) {
1097  min_bottom = bottom;
1098  footer = part;
1099  }
1100  }
1101  if (header)
1102  header->clear_table_type();
1103  if (footer)
1104  footer->clear_table_type();
1105 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define MAX_INT32
Definition: host.h:62
#define MIN_INT32
Definition: host.h:70
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 998 of file tablefind.cpp.

998  {
999  // Detect last line of paragraph
1000  // Iterate the ColPartitions in the grid.
1001  ColPartitionGridSearch gsearch(&clean_part_grid_);
1002  gsearch.StartFullSearch();
1003  ColPartition* part = NULL;
1004  while ((part = gsearch.NextFullSearch()) != NULL) {
1005  if (part->type() != PT_TABLE)
1006  continue; // Consider only table partitions
1007 
1008  // Paragraph ending should have flowing text above it.
1009  ColPartition* upper_part = part->nearest_neighbor_above();
1010  if (!upper_part)
1011  continue;
1012  if (upper_part->type() != PT_FLOWING_TEXT)
1013  continue;
1014  if (upper_part->bounding_box().width() <
1015  2 * part->bounding_box().width())
1016  continue;
1017  // Check if its the last line of a paragraph.
1018  // In most cases, a paragraph ending should be left-aligned to text line
1019  // above it. Sometimes, it could be a 2 line paragraph, in which case
1020  // the line above it is indented.
1021  // To account for that, check if the partition center is to
1022  // the left of the one above it.
1023  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1024  int upper_mid = (upper_part->bounding_box().left() +
1025  upper_part->bounding_box().right()) / 2;
1026  int current_spacing = 0; // spacing of the current line to margin
1027  int upper_spacing = 0; // spacing of the previous line to the margin
1028  if (left_to_right_language_) {
1029  // Left to right languages, use mid - left to figure out the distance
1030  // the middle is from the left margin.
1031  int left = MIN(part->bounding_box().left(),
1032  upper_part->bounding_box().left());
1033  current_spacing = mid - left;
1034  upper_spacing = upper_mid - left;
1035  } else {
1036  // Right to left languages, use right - mid to figure out the distance
1037  // the middle is from the right margin.
1038  int right = MAX(part->bounding_box().right(),
1039  upper_part->bounding_box().right());
1040  current_spacing = right - mid;
1041  upper_spacing = right - upper_mid;
1042  }
1043  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1044  continue;
1045 
1046  // Paragraphs should have similar fonts.
1047  if (!part->MatchingSizes(*upper_part) ||
1048  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1049  kStrokeWidthConstantTolerance)) {
1050  continue;
1051  }
1052 
1053  // The last line of a paragraph should be left aligned.
1054  // TODO(nbeato): This would be untrue if the text was right aligned.
1055  // How often is that?
1056  if (part->space_to_left() >
1057  kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
1058  continue;
1059  // The line above it should be right aligned (assuming justified format).
1060  // Since we can't assume justified text, we compare whitespace to text.
1061  // The above line should have majority spanning text (or the current
1062  // line could have fit on the previous line). So compare
1063  // whitespace to text.
1064  if (upper_part->bounding_box().width() <
1065  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1066  continue;
1067 
1068  // Ledding above the line should be less than ledding below
1069  if (part->space_above() >= part->space_below() ||
1070  part->space_above() > 2 * global_median_ledding_)
1071  continue;
1072 
1073  // If all checks failed, it is probably text.
1074  part->clear_table_type();
1075  }
1076 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:135
Definition: capi.h:94
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:144
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:125
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:143
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:129

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 770 of file tablefind.cpp.

770  {
771  ColPartitionGridSearch gsearch(&clean_part_grid_);
772  gsearch.StartFullSearch();
773  ColPartition* part = NULL;
774  while ((part = gsearch.NextFullSearch()) != NULL) {
775  // TODO(nbeato): Rename this function, meaning is different now.
776  // IT is finding nearest neighbors its own way
777  //SetVerticalSpacing(part);
778 
779  ColPartition* upper = part->SingletonPartner(true);
780  if (upper)
781  part->set_nearest_neighbor_above(upper);
782 
783  ColPartition* lower = part->SingletonPartner(false);
784  if (lower)
785  part->set_nearest_neighbor_below(lower);
786  }
787 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int * xprojection,
int  length 
)
protected

Definition at line 1772 of file tablefind.cpp.

1772  {
1773  // Find peak value of the histogram
1774  int peak_value = 0;
1775  for (int i = 0; i < length; i++) {
1776  if (xprojection[i] > peak_value) {
1777  peak_value = xprojection[i];
1778  }
1779  }
1780  // Peak value represents the maximum number of horizontally
1781  // overlapping colpartitions, so this can be considered as the
1782  // number of rows in the table
1783  if (peak_value < kMinRowsInTable)
1784  return false;
1785  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1786  if (peak_value >= kLargeTableRowCount)
1787  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1788  // Threshold the histogram
1789  for (int i = 0; i < length; i++) {
1790  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1791  }
1792  // Find the largest run of zeros between two ones
1793  int largest_gap = 0;
1794  int run_start = -1;
1795  for (int i = 1; i < length; i++) {
1796  // detect start of a run of zeros
1797  if (xprojection[i - 1] && !xprojection[i]) {
1798  run_start = i;
1799  }
1800  // detect end of a run of zeros and update the value of largest gap
1801  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1802  int gap = i - run_start;
1803  if (gap > largest_gap)
1804  largest_gap = gap;
1805  run_start = -1;
1806  }
1807  }
1808  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1809 }
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:110
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:139
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:109
const int kMinRowsInTable
Definition: tablefind.cpp:115
const int kLargeTableRowCount
Definition: tablefind.cpp:112

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 527 of file tablefind.cpp.

528  {
529  for (int i = 0; i < gridheight(); ++i) {
530  ColPartitionSet* columns = all_columns[i];
531  if (columns != NULL) {
532  ColSegment_LIST new_blocks;
533  // Get boxes from the current vertical position on the grid
534  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
535  // Merge the new_blocks boxes into column_blocks if they are well-aligned
536  GroupColumnBlocks(&new_blocks, column_blocks);
537  }
538  }
539 }
int gridheight() const
Definition: tablefind.cpp:388
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:542

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1277 of file tablefind.cpp.

1277  {
1278  ColSegment_IT it(table_columns);
1279  // Iterate the ColPartitions in the grid.
1280  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1281  gsearch(&clean_part_grid_);
1282  gsearch.StartFullSearch();
1283  ColPartition* part;
1284  while ((part = gsearch.NextFullSearch()) != NULL) {
1285  if (part->inside_table_column() || part->type() != PT_TABLE)
1286  continue; // prevent a partition to be assigned to multiple columns
1287  const TBOX& box = part->bounding_box();
1288  ColSegment* col = new ColSegment();
1289  col->InsertBox(box);
1290  part->set_inside_table_column(true);
1291  // Start a search below the current cell to find bottom neighbours
1292  // Note: a full search will always process things above it first, so
1293  // this should be starting at the highest cell and working its way down.
1294  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1295  vsearch(&clean_part_grid_);
1296  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1297  ColPartition* neighbor = NULL;
1298  bool found_neighbours = false;
1299  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1300  // only consider neighbors not assigned to any column yet
1301  if (neighbor->inside_table_column())
1302  continue;
1303  // Horizontal lines should not break the flow
1304  if (neighbor->IsHorizontalLine())
1305  continue;
1306  // presence of a non-table neighbor marks the end of current
1307  // table column
1308  if (neighbor->type() != PT_TABLE)
1309  break;
1310  // add the neighbor partition to the table column
1311  const TBOX& neighbor_box = neighbor->bounding_box();
1312  col->InsertBox(neighbor_box);
1313  neighbor->set_inside_table_column(true);
1314  found_neighbours = true;
1315  }
1316  if (found_neighbours) {
1317  it.add_after_then_move(col);
1318  } else {
1319  part->set_inside_table_column(false);
1320  delete col;
1321  }
1322  }
1323 }
Definition: capi.h:94
inT16 left() const
Definition: rect.h:68
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1327 of file tablefind.cpp.

1328  {
1329  ColSegment_IT cit(table_columns);
1330  ColSegment_IT rit(table_regions);
1331  // Iterate through column blocks
1332  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1333  gsearch(&col_seg_grid_);
1334  gsearch.StartFullSearch();
1335  ColSegment* part;
1336  int page_height = tright().y() - bleft().y();
1337  ASSERT_HOST(page_height > 0);
1338  // create a bool array to hold projection on y-axis
1339  bool* table_region = new bool[page_height];
1340  while ((part = gsearch.NextFullSearch()) != NULL) {
1341  const TBOX& part_box = part->bounding_box();
1342  // reset the projection array
1343  for (int i = 0; i < page_height; i++) {
1344  table_region[i] = false;
1345  }
1346  // iterate through all table columns to find regions in the current
1347  // page column block
1348  cit.move_to_first();
1349  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1350  TBOX col_box = cit.data()->bounding_box();
1351  // find intersection region of table column and page column
1352  TBOX intersection_box = col_box.intersection(part_box);
1353  // project table column on the y-axis
1354  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1355  table_region[i - bleft().y()] = true;
1356  }
1357  }
1358  // set x-limits of table regions to page column width
1359  TBOX current_table_box;
1360  current_table_box.set_left(part_box.left());
1361  current_table_box.set_right(part_box.right());
1362  // go through the y-axis projection to find runs of table
1363  // regions. Each run makes one table region.
1364  for (int i = 1; i < page_height; i++) {
1365  // detect start of a table region
1366  if (!table_region[i - 1] && table_region[i]) {
1367  current_table_box.set_bottom(i + bleft().y());
1368  }
1369  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1370  // detect end of a table region
1371  if (table_region[i - 1] && !table_region[i]) {
1372  current_table_box.set_top(i + bleft().y());
1373  if (!current_table_box.null_box()) {
1374  ColSegment* seg = new ColSegment();
1375  seg->InsertBox(current_table_box);
1376  rit.add_after_then_move(seg);
1377  }
1378  }
1379  }
1380  }
1381  delete[] table_region;
1382 }
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
const ICOORD & bleft() const
Definition: tablefind.cpp:391
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
void set_top(int y)
Definition: rect.h:57
inT16 y() const
access_function
Definition: points.h:56
bool null_box() const
Definition: rect.h:46
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void set_right(int x)
Definition: rect.h:78
void set_left(int x)
Definition: rect.h:71
inT16 bottom() const
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:64
const ICOORD & tright() const
Definition: tablefind.cpp:394
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 388 of file tablefind.cpp.

388  {
389  return clean_part_grid_.gridheight();
390 }
int gridheight() const
Definition: bbgrid.h:70
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1199 of file tablefind.cpp.

1199  {
1200  int margin = gridsize();
1201 
1202  // Iterate the Column Blocks in the grid.
1203  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1204  gsearch(&col_seg_grid_);
1205  gsearch.StartFullSearch();
1206  ColSegment* seg;
1207  while ((seg = gsearch.NextFullSearch()) != NULL) {
1208  if (seg->type() != COL_TEXT)
1209  continue; // only consider text blocks for split detection
1210  bool neighbor_found = false;
1211  bool modified = false; // Modified at least once
1212  // keep expanding current box as long as neighboring table columns
1213  // are found above or below it.
1214  do {
1215  TBOX box = seg->bounding_box();
1216  // slightly expand the search region vertically
1217  int top_range = MIN(box.top() + margin, tright().y());
1218  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1219  box.set_top(top_range);
1220  box.set_bottom(bottom_range);
1221  neighbor_found = false;
1222  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1223  rectsearch(&col_seg_grid_);
1224  rectsearch.StartRectSearch(box);
1225  ColSegment* neighbor = NULL;
1226  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1227  if (neighbor == seg)
1228  continue;
1229  const TBOX& neighbor_box = neighbor->bounding_box();
1230  // If the neighbor box significantly overlaps with the current
1231  // box (due to the expansion of the current box in the
1232  // previous iteration of this loop), remove the neighbor box
1233  // and expand the current box to include it.
1234  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1235  seg->InsertBox(neighbor_box);
1236  modified = true;
1237  rectsearch.RemoveBBox();
1238  gsearch.RepositionIterator();
1239  delete neighbor;
1240  continue;
1241  }
1242  // Only expand if the neighbor box is of table type
1243  if (neighbor->type() != COL_TABLE)
1244  continue;
1245  // Insert the neighbor box into the current column block
1246  if (neighbor_box.major_x_overlap(box) &&
1247  !box.contains(neighbor_box)) {
1248  seg->InsertBox(neighbor_box);
1249  neighbor_found = true;
1250  modified = true;
1251  rectsearch.RemoveBBox();
1252  gsearch.RepositionIterator();
1253  delete neighbor;
1254  }
1255  }
1256  } while (neighbor_found);
1257  if (modified) {
1258  // Because the box has changed, it has to be removed first.
1259  gsearch.RemoveBBox();
1260  col_seg_grid_.InsertBBox(true, true, seg);
1261  gsearch.RepositionIterator();
1262  }
1263  }
1264 }
const ICOORD & bleft() const
Definition: tablefind.cpp:391
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
void set_top(int y)
Definition: rect.h:57
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
inT16 y() const
access_function
Definition: points.h:56
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
bool contains(const FCOORD pt) const
Definition: rect.h:323
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
inT16 bottom() const
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:64
const ICOORD & tright() const
Definition: tablefind.cpp:394
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1390 of file tablefind.cpp.

1390  {
1391  // Iterate the table regions in the grid.
1392  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1393  gsearch(&table_grid_);
1394  gsearch.StartFullSearch();
1395  ColSegment* seg = NULL;
1396  while ((seg = gsearch.NextFullSearch()) != NULL) {
1397  bool neighbor_found = false;
1398  bool modified = false; // Modified at least once
1399  do {
1400  // Start a rectangle search x-bounded by the image and y by the table
1401  const TBOX& box = seg->bounding_box();
1402  TBOX search_region(box);
1403  search_region.set_left(bleft().x());
1404  search_region.set_right(tright().x());
1405  neighbor_found = false;
1406  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1407  rectsearch(&table_grid_);
1408  rectsearch.StartRectSearch(search_region);
1409  ColSegment* neighbor = NULL;
1410  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1411  if (neighbor == seg)
1412  continue;
1413  const TBOX& neighbor_box = neighbor->bounding_box();
1414  // Check if a neighbor box has a large overlap with the table
1415  // region. This may happen as a result of merging two table
1416  // regions in the previous iteration.
1417  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1418  seg->InsertBox(neighbor_box);
1419  rectsearch.RemoveBBox();
1420  gsearch.RepositionIterator();
1421  delete neighbor;
1422  modified = true;
1423  continue;
1424  }
1425  // Check if two table regions belong together based on a common
1426  // horizontal ruling line
1427  if (BelongToOneTable(box, neighbor_box)) {
1428  seg->InsertBox(neighbor_box);
1429  neighbor_found = true;
1430  modified = true;
1431  rectsearch.RemoveBBox();
1432  gsearch.RepositionIterator();
1433  delete neighbor;
1434  }
1435  }
1436  } while (neighbor_found);
1437  if (modified) {
1438  // Because the box has changed, it has to be removed first.
1439  gsearch.RemoveBBox();
1440  table_grid_.InsertBBox(true, true, seg);
1441  gsearch.RepositionIterator();
1442  }
1443  }
1444 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
const ICOORD & bleft() const
Definition: tablefind.cpp:391
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1448
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
const ICOORD & tright() const
Definition: tablefind.cpp:394

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 382 of file tablefind.cpp.

382  {
383  return clean_part_grid_.gridsize();
384 }
int gridsize() const
Definition: bbgrid.h:64
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 385 of file tablefind.cpp.

385  {
386  return clean_part_grid_.gridwidth();
387 }
int gridwidth() const
Definition: bbgrid.h:67
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 542 of file tablefind.cpp.

543  {
544  ColSegment_IT src_it(new_blocks);
545  ColSegment_IT dest_it(column_blocks);
546  // iterate through the source list
547  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
548  ColSegment* src_seg = src_it.data();
549  const TBOX& src_box = src_seg->bounding_box();
550  bool match_found = false;
551  // iterate through the destination list to find a matching column block
552  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
553  ColSegment* dest_seg = dest_it.data();
554  TBOX dest_box = dest_seg->bounding_box();
555  if (ConsecutiveBoxes(src_box, dest_box)) {
556  // If matching block is found, insert the current block into it
557  // and delete the source block
558  dest_seg->InsertBox(src_box);
559  match_found = true;
560  delete src_it.extract();
561  break;
562  }
563  }
564  // If no match is found, just append the source block to column_blocks
565  if (!match_found) {
566  dest_it.add_after_then_move(src_it.extract());
567  }
568  }
569 }
Definition: rect.h:30
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:572

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX &  table_box,
TBOX *  result_box 
)
protected

Definition at line 1524 of file tablefind.cpp.

1524  {
1525  // TODO(nbeato): The growing code is a bit excessive right now.
1526  // By removing these lines, the partitions considered need
1527  // to have some overlap or be special cases. These lines could
1528  // be added again once a check is put in place to make sure that
1529  // growing tables don't stomp on a lot of non-table partitions.
1530 
1531  // search for horizontal ruling lines within the vertical margin
1532  // int vertical_margin = kRulingVerticalMargin * gridsize();
1533  TBOX search_box = table_box;
1534  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1535  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1536  // search_box.set_top(top);
1537  // search_box.set_bottom(bottom);
1538 
1539  GrowTableToIncludePartials(table_box, search_box, result_box);
1540  GrowTableToIncludeLines(table_box, search_box, result_box);
1541  IncludeLeftOutColumnHeaders(result_box);
1542 }
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1668
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1546
Definition: rect.h:30
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1574

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX &  table_box,
const TBOX &  search_range,
TBOX *  result_box 
)
protected

Definition at line 1574 of file tablefind.cpp.

1576  {
1577  ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1578  rsearch.SetUniqueMode(true);
1579  rsearch.StartRectSearch(search_range);
1580  ColPartition* part = NULL;
1581  while ((part = rsearch.NextRectSearch()) != NULL) {
1582  // TODO(nbeato) This should also do vertical, but column
1583  // boundaries are breaking things. This function needs to be
1584  // updated to allow vertical lines as well.
1585  if (!part->IsLineType())
1586  continue;
1587  // Avoid the following function call if the result of the
1588  // function is irrelevant.
1589  const TBOX& part_box = part->bounding_box();
1590  if (result_box->contains(part_box))
1591  continue;
1592  // Include a partially overlapping horizontal line only if the
1593  // extra ColPartitions that will be included due to expansion
1594  // have large side spacing w.r.t. columns containing them.
1595  if (HLineBelongsToTable(*part, table_box))
1596  *result_box = result_box->bounding_union(part_box);
1597  // TODO(nbeato): Vertical
1598  }
1599 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1604
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
bool contains(const FCOORD pt) const
Definition: rect.h:323
Definition: rect.h:30

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX &  table_box,
const TBOX &  search_range,
TBOX *  result_box 
)
protected

Definition at line 1546 of file tablefind.cpp.

1548  {
1549  // Rulings are in a different grid, so search 2 grids for rulings, text,
1550  // and table partitions that are not entirely within the new box.
1551  for (int i = 0; i < 2; ++i) {
1552  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1553  &leader_and_ruling_grid_;
1554  ColPartitionGridSearch rectsearch(grid);
1555  rectsearch.StartRectSearch(search_range);
1556  ColPartition* part = NULL;
1557  while ((part = rectsearch.NextRectSearch()) != NULL) {
1558  // Only include text and table types.
1559  if (part->IsImageType())
1560  continue;
1561  const TBOX& part_box = part->bounding_box();
1562  // Include partition in the table if more than half of it
1563  // is covered by the table
1564  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1565  *result_box = result_box->bounding_union(part_box);
1566  continue;
1567  }
1568  }
1569  }
1570 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
Definition: rect.h:30
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition &  part)
protected

Definition at line 950 of file tablefind.cpp.

950  {
951  if (part.flow() == BTFT_LEADER)
952  return true;
953  // Search range is left and right bounded by an offset of the
954  // median xheight. This offset is to allow some tolerance to
955  // the leaders on the page in the event that the alignment is still
956  // a bit off.
957  const TBOX& box = part.bounding_box();
958  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
959  const int top = box.top() + search_size;
960  const int bottom = box.bottom() - search_size;
961  ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
962  for (int direction = 0; direction < 2; ++direction) {
963  bool right_to_left = (direction == 0);
964  int x = right_to_left ? box.right() : box.left();
965  hsearch.StartSideSearch(x, bottom, top);
966  ColPartition* leader = NULL;
967  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
968  // The leader could be a horizontal ruling in the grid.
969  // Make sure it is actually a leader.
970  if (leader->flow() != BTFT_LEADER)
971  continue;
972  // This should not happen, they are in different grids.
973  ASSERT_HOST(&part != leader);
974  // Make sure the leader shares a page column with the partition,
975  // otherwise we are spreading across columns.
976  if (!part.IsInSameColumnAs(*leader))
977  break;
978  // There should be a significant vertical overlap
979  if (!leader->VSignificantCoreOverlap(part))
980  continue;
981  // Leader passed all tests, so it is adjacent.
982  return true;
983  }
984  }
985  // No leaders are adjacent to the given partition.
986  return false;
987 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:120
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition *  part) const
protected

Definition at line 861 of file tablefind.cpp.

861  {
862  // Should only get text partitions.
863  ASSERT_HOST(part->IsTextType());
864  // Blob access
865  BLOBNBOX_CLIST* part_boxes = part->boxes();
866  BLOBNBOX_C_IT it(part_boxes);
867  // Check if this is a relatively small partition (such as a single word)
868  if (part->bounding_box().width() <
869  kMinBoxesInTextPartition * part->median_size() &&
870  part_boxes->length() < kMinBoxesInTextPartition)
871  return true;
872 
873  // Variables used to compute inter-blob spacing.
874  int current_x0 = -1;
875  int current_x1 = -1;
876  int previous_x1 = -1;
877  // Stores the maximum gap detected.
878  int largest_partition_gap_found = -1;
879  // Text partition gap limits. If this is text (and not a table),
880  // there should be at least one gap larger than min_gap and no gap
881  // larger than max_gap.
882  const double max_gap = kMaxGapInTextPartition * part->median_size();
883  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
884 
885  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
886  BLOBNBOX* blob = it.data();
887  current_x0 = blob->bounding_box().left();
888  current_x1 = blob->bounding_box().right();
889  if (previous_x1 != -1) {
890  int gap = current_x0 - previous_x1;
891 
892  // TODO(nbeato): Boxes may overlap? Huh?
893  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
894  // on the top right of the page are filtered out with this line.
895  // Note 2: Iterating over blobs in a partition, so we are looking for
896  // spacing between the words.
897  if (gap < 0) {
898  // More likely case, the blobs slightly overlap. This can happen
899  // with diacritics (accents) or broken alphabet symbols (characters).
900  // Merge boxes together by taking max of right sides.
901  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
902  previous_x1 = MAX(previous_x1, current_x1);
903  continue;
904  }
905  // Extreme case, blobs overlap significantly in the same partition...
906  // This should not happen often (if at all), but it does.
907  // TODO(nbeato): investigate cases when this happens.
908  else {
909  // The behavior before was to completely ignore this case.
910  }
911  }
912 
913  // If a large enough gap is found, mark it as a table cell (return true)
914  if (gap > max_gap)
915  return true;
916  if (gap > largest_partition_gap_found)
917  largest_partition_gap_found = gap;
918  }
919  previous_x1 = current_x1;
920  }
921  // Since no large gap was found, return false if the partition is too
922  // long to be a data cell
923  if (part->bounding_box().width() >
924  kMaxBoxesInDataPartition * part->median_size() ||
925  part_boxes->length() > kMaxBoxesInDataPartition)
926  return false;
927 
928  // A partition may be a single blob. In this case, it's an isolated symbol
929  // or non-text (such as a ruling or image).
930  // Detect these as table partitions? Shouldn't this be case by case?
931  // The behavior before was to ignore this, making max_partition_gap < 0
932  // and implicitly return true. Just making it explicit.
933  if (largest_partition_gap_found == -1)
934  return true;
935 
936  // return true if the maximum gap found is smaller than the minimum allowed
937  // max_gap in a text partition. This indicates that there is no significant
938  // space in the partition, hence it is likely a single word.
939  return largest_partition_gap_found < min_gap;
940 }
const double kMaxGapInTextPartition
Definition: tablefind.cpp:72
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:80
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:66
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:69
#define MAX(x, y)
Definition: ndminx.h:24
inT16 right() const
Definition: rect.h:75
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:76
const TBOX & bounding_box() const
Definition: blobbox.h:215

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition &  part,
const TBOX &  table_box 
)
protected

Definition at line 1604 of file tablefind.cpp.

1605  {
1606  if (!part.IsHorizontalLine())
1607  return false;
1608  const TBOX& part_box = part.bounding_box();
1609  if (!part_box.major_x_overlap(table_box))
1610  return false;
1611  // Do not consider top-most horizontal line since it usually
1612  // originates from noise.
1613  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1614  // have neighbors solved.
1615  // if (!part.nearest_neighbor_above())
1616  // return false;
1617  const TBOX bbox = part_box.bounding_union(table_box);
1618  // In the "unioned table" box (the table extents expanded by the line),
1619  // keep track of how many partitions have significant padding to the left
1620  // and right. If more than half of the partitions covered by the new table
1621  // have significant spacing, the line belongs to the table and the table
1622  // grows to include all of the partitions.
1623  int num_extra_partitions = 0;
1624  int extra_space_to_right = 0;
1625  int extra_space_to_left = 0;
1626  // Rulings are in a different grid, so search 2 grids for rulings, text,
1627  // and table partitions that are introduced by the new box.
1628  for (int i = 0; i < 2; ++i) {
1629  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1630  &leader_and_ruling_grid_;
1631  // Start a rect search on bbox
1632  ColPartitionGridSearch rectsearch(grid);
1633  rectsearch.SetUniqueMode(true);
1634  rectsearch.StartRectSearch(bbox);
1635  ColPartition* extra_part = NULL;
1636  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1637  // ColPartition already in table
1638  const TBOX& extra_part_box = extra_part->bounding_box();
1639  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1640  continue;
1641  // Non-text ColPartitions do not contribute
1642  if (extra_part->IsImageType())
1643  continue;
1644  // Consider this partition.
1645  num_extra_partitions++;
1646  // presence of a table cell is a strong hint, so just increment the scores
1647  // without looking at the spacing.
1648  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1649  extra_space_to_right++;
1650  extra_space_to_left++;
1651  continue;
1652  }
1653  int space_threshold = kSideSpaceMargin * part.median_size();
1654  if (extra_part->space_to_right() > space_threshold)
1655  extra_space_to_right++;
1656  if (extra_part->space_to_left() > space_threshold)
1657  extra_space_to_left++;
1658  }
1659  }
1660  // tprintf("%d %d %d\n",
1661  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1662  return (extra_space_to_right > num_extra_partitions / 2) ||
1663  (extra_space_to_left > num_extra_partitions / 2);
1664 }
const int kSideSpaceMargin
Definition: tablefind.cpp:105
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: capi.h:94
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
Definition: rect.h:30
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX *  table_box)
protected

Definition at line 1668 of file tablefind.cpp.

1668  {
1669  // Start a search above the current table to look for column headers
1670  ColPartitionGridSearch vsearch(&clean_part_grid_);
1671  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1672  table_box->top());
1673  ColPartition* neighbor = NULL;
1674  ColPartition* previous_neighbor = NULL;
1675  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1676  // Max distance to find a table heading.
1677  const int max_distance = kMaxColumnHeaderDistance *
1678  neighbor->median_size();
1679  int table_top = table_box->top();
1680  const TBOX& box = neighbor->bounding_box();
1681  // Do not continue if the next box is way above
1682  if (box.bottom() - table_top > max_distance)
1683  break;
1684  // Unconditionally include partitions of type TABLE or LINE
1685  // TODO(faisal): add some reasonable conditions here
1686  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1687  table_box->set_top(box.top());
1688  previous_neighbor = NULL;
1689  continue;
1690  }
1691  // If there are two text partitions, one above the other, without a table
1692  // cell on their left or right side, consider them a barrier and quit
1693  if (previous_neighbor == NULL) {
1694  previous_neighbor = neighbor;
1695  } else {
1696  const TBOX& previous_box = previous_neighbor->bounding_box();
1697  if (!box.major_y_overlap(previous_box))
1698  break;
1699  }
1700  }
1701 }
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:88
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
Definition: capi.h:94
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
inT16 left() const
Definition: rect.h:68
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD &  bottom_left,
const ICOORD &  top_right 
)

Definition at line 185 of file tablefind.cpp.

186  {
187  // Initialize clean partitions list and grid
188  clean_part_grid_.Init(grid_size, bottom_left, top_right);
189  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
190  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
191  col_seg_grid_.Init(grid_size, bottom_left, top_right);
192  table_grid_.Init(grid_size, bottom_left, top_right);
193 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:448
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 583 of file tablefind.cpp.

583  {
584  FindNeighbors();
585  SetPartitionSpacings(&clean_part_grid_, all_columns);
586  SetGlobalSpacings(&clean_part_grid_);
587 }
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:713
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:590
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid *  grid,
TO_BLOCK *  block 
)

Definition at line 197 of file tablefind.cpp.

198  {
199  // Calculate stats. This lets us filter partitions in AllowTextPartition()
200  // and filter blobs in AllowBlob().
201  SetGlobalSpacings(grid);
202 
203  // Iterate the ColPartitions in the grid.
204  ColPartitionGridSearch gsearch(grid);
205  gsearch.SetUniqueMode(true);
206  gsearch.StartFullSearch();
207  ColPartition* part = NULL;
208  while ((part = gsearch.NextFullSearch()) != NULL) {
209  // Reject partitions with nothing useful inside of them.
210  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
211  continue;
212  ColPartition* clean_part = part->ShallowCopy();
213  ColPartition* leader_part = NULL;
214  if (part->IsLineType()) {
215  InsertRulingPartition(clean_part);
216  continue;
217  }
218  // Insert all non-text partitions to clean_parts
219  if (!part->IsTextType()) {
220  InsertImagePartition(clean_part);
221  continue;
222  }
223  // Insert text colpartitions after removing noisy components from them
224  // The leaders are split into a separate grid.
225  BLOBNBOX_CLIST* part_boxes = part->boxes();
226  BLOBNBOX_C_IT pit(part_boxes);
227  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
228  BLOBNBOX *pblob = pit.data();
229  // Bad blobs... happens in UNLV set.
230  // news.3G1, page 17 (around x=6)
231  if (!AllowBlob(*pblob))
232  continue;
233  if (pblob->flow() == BTFT_LEADER) {
234  if (leader_part == NULL) {
235  leader_part = part->ShallowCopy();
236  leader_part->set_flow(BTFT_LEADER);
237  }
238  leader_part->AddBox(pblob);
239  } else if (pblob->region_type() != BRT_NOISE) {
240  clean_part->AddBox(pblob);
241  }
242  }
243  clean_part->ComputeLimits();
244  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
245  InsertTextPartition(clean_part);
246  SplitAndInsertFragmentedTextPartition(fragmented);
247  if (leader_part != NULL) {
248  // TODO(nbeato): Note that ComputeLimits does not update the column
249  // information. So the leader may appear to span more columns than it
250  // really does later on when IsInSameColumnAs gets called to test
251  // for adjacent leaders.
252  leader_part->ComputeLimits();
253  InsertLeaderPartition(leader_part);
254  }
255  }
256 
257  // Make the partition partners better for upper and lower neighbors.
260 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:713
BlobRegionType region_type() const
Definition: blobbox.h:268
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:398
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:422
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:506
void RefinePartitionPartners(bool get_desperate)
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:440
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:414
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:425
BlobTextFlowType flow() const
Definition: blobbox.h:280

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition *  part)
protected

Definition at line 406 of file tablefind.cpp.

406  {
407  ASSERT_HOST(part != NULL);
408  if (AllowTextPartition(*part)) {
409  fragmented_text_grid_.InsertBBox(true, true, part);
410  } else {
411  delete part;
412  }
413 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:493
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition *  part)
protected

Definition at line 425 of file tablefind.cpp.

425  {
426  // NOTE: If images are placed into a different grid in the future,
427  // the function SetPartitionSpacings needs to be updated. It should
428  // be the only thing that cares about image partitions.
429  clean_part_grid_.InsertBBox(true, true, part);
430 }
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition *  part)
protected

Definition at line 414 of file tablefind.cpp.

414  {
415  ASSERT_HOST(part != NULL);
416  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
417  leader_and_ruling_grid_.InsertBBox(true, true, part);
418  } else {
419  delete part;
420  }
421 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition *  part)
protected

Definition at line 422 of file tablefind.cpp.

422  {
423  leader_and_ruling_grid_.InsertBBox(true, true, part);
424 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition *  part)
protected

Definition at line 398 of file tablefind.cpp.

398  {
399  ASSERT_HOST(part != NULL);
400  if (AllowTextPartition(*part)) {
401  clean_part_grid_.InsertBBox(true, true, part);
402  } else {
403  delete part;
404  }
405 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:493
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid *  grid,
ColPartitionSet **  columns,
WidthCallback *  width_cb,
const FCOORD &  reskew 
)

Definition at line 263 of file tablefind.cpp.

266  {
267  // initialize spacing, neighbors, and columns
268  InitializePartitions(all_columns);
269 
270 #ifndef GRAPHICS_DISABLED
271  if (textord_show_tables) {
272  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
278 
279  table_win = MakeWindow(100, 300, "Fragmented Text");
281  }
282 #endif // GRAPHICS_DISABLED
283 
284  // mark, filter, and smooth candidate table partitions
286 
287  // Make single-column blocks from good_columns_ partitions. col_segments are
288  // moved to a grid later which takes the ownership
289  ColSegment_LIST column_blocks;
290  GetColumnBlocks(all_columns, &column_blocks);
291  // Set the ratio of candidate table partitions in each column
292  SetColumnsType(&column_blocks);
293 
294  // Move column segments to col_seg_grid_
295  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
296 
297  // Detect split in column layout that might have occurred due to the
298  // presence of a table. In such a case, merge the corresponding columns.
300 
301  // Group horizontally overlapping table partitions into table columns.
302  // table_columns created here get deleted at the end of this method.
303  ColSegment_LIST table_columns;
304  GetTableColumns(&table_columns);
305 
306  // Within each column, mark the range table regions occupy based on the
307  // table columns detected. table_regions are moved to a grid later which
308  // takes the ownership
309  ColSegment_LIST table_regions;
310  GetTableRegions(&table_columns, &table_regions);
311 
312 #ifndef GRAPHICS_DISABLED
314  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
315  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
316  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
317  }
318 #endif // GRAPHICS_DISABLED
319 
320  // Merge table regions across columns for tables spanning multiple
321  // columns
322  MoveColSegmentsToGrid(&table_regions, &table_grid_);
324 
325  // Adjust table boundaries by including nearby horizontal lines and left
326  // out column headers
329 
331  // Remove false alarms consisting of a single column
333 
334 #ifndef GRAPHICS_DISABLED
335  if (textord_show_tables) {
336  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
338  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
339  table_grid_.DisplayBoxes(table_win);
340  }
341 #endif // GRAPHICS_DISABLED
342 
343  // Find table grid structure and reject tables that are malformed.
344  RecognizeTables();
346  RecognizeTables();
347 
348 #ifndef GRAPHICS_DISABLED
349  if (textord_show_tables) {
350  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
353  table_grid_.DisplayBoxes(table_win);
354  }
355 #endif // GRAPHICS_DISABLED
356  } else {
357  // Remove false alarms consisting of a single column
358  // TODO(nbeato): verify this is a NOP after structured table rejection.
359  // Right now it isn't. If the recognize function is doing what it is
360  // supposed to do, this function is obsolete.
362 
363 #ifndef GRAPHICS_DISABLED
364  if (textord_show_tables) {
365  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
368  table_grid_.DisplayBoxes(table_win);
369  }
370 #endif // GRAPHICS_DISABLED
371  }
372 
373  // Merge all colpartitions in table regions to make them a single
374  // colpartition and revert types of isolated table cells not
375  // assigned to any table to their original types.
376  MakeTableBlocks(grid, all_columns, width_cb);
377 }
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1327
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1180
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:617
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1954
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:527
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:583
bool textord_show_tables
Definition: tablefind.cpp:146
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1920
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1277
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:2001
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:522
bool textord_tablefind_show_mark
Definition: tablefind.cpp:148
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1875
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1147
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:152
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid *  grid,
ColPartitionSet **  columns,
WidthCallback *  width_cb 
)
protected

Definition at line 2001 of file tablefind.cpp.

2003  {
2004  // Since we have table blocks already, remove table tags from all
2005  // colpartitions
2006  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2007  gsearch(grid);
2008  gsearch.StartFullSearch();
2009  ColPartition* part = NULL;
2010 
2011  while ((part = gsearch.NextFullSearch()) != NULL) {
2012  if (part->type() == PT_TABLE) {
2013  part->clear_table_type();
2014  }
2015  }
2016  // Now make a single colpartition out of each table block and remove
2017  // all colpartitions contained within a table
2018  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2019  table_search(&table_grid_);
2020  table_search.StartFullSearch();
2021  ColSegment* table;
2022  while ((table = table_search.NextFullSearch()) != NULL) {
2023  const TBOX& table_box = table->bounding_box();
2024  // Start a rect search on table_box
2025  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2026  rectsearch(grid);
2027  rectsearch.StartRectSearch(table_box);
2028  ColPartition* part;
2029  ColPartition* table_partition = NULL;
2030  while ((part = rectsearch.NextRectSearch()) != NULL) {
2031  // Do not consider image partitions
2032  if (!part->IsTextType())
2033  continue;
2034  TBOX part_box = part->bounding_box();
2035  // Include partition in the table if more than half of it
2036  // is covered by the table
2037  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2038  rectsearch.RemoveBBox();
2039  if (table_partition) {
2040  table_partition->Absorb(part, width_cb);
2041  } else {
2042  table_partition = part;
2043  }
2044  }
2045  }
2046  // Insert table colpartition back to part_grid_
2047  if (table_partition) {
2048  // To match the columns used when transforming to blocks, the new table
2049  // partition must have its first and last column set at the grid y that
2050  // corresponds to its bottom.
2051  const TBOX& table_box = table_partition->bounding_box();
2052  int grid_x, grid_y;
2053  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2054  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2055  table_partition->set_table_type();
2056  table_partition->set_blob_type(BRT_TEXT);
2057  table_partition->set_flow(BTFT_CHAIN);
2058  table_partition->SetBlobTypes();
2059  grid->InsertBBox(true, true, table_partition);
2060  }
2061  }
2062 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
Definition: capi.h:94
inT16 left() const
Definition: rect.h:68
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
Definition: rect.h:30
inT16 bottom() const
Definition: rect.h:61

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 522 of file tablefind.cpp.

522  {
523  return clean_part_grid_.MakeWindow(x, y, window_name);
524 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:593
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 831 of file tablefind.cpp.

831  {
832  // Iterate the ColPartitions in the grid.
833  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
834  gsearch(&clean_part_grid_);
835  gsearch.StartFullSearch();
836  ColPartition* part = NULL;
837  while ((part = gsearch.NextFullSearch()) != NULL) {
838  if (!part->IsTextType()) // Only consider text partitions
839  continue;
840  // Only consider partitions in dominant font size or smaller
841  if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
842  continue;
843  // Mark partitions with a large gap, or no significant gap as
844  // table partitions.
845  // Comments: It produces several false alarms at:
846  // - last line of a paragraph (fixed)
847  // - single word section headings
848  // - page headers and footers
849  // - numbered equations
850  // - line drawing regions
851  // TODO(faisal): detect and fix above-mentioned cases
852  if (HasWideOrNoInterWordGap(part) ||
853  HasLeaderAdjacent(*part)) {
854  part->set_table_type();
855  }
856  }
857 }
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:950
const double kMaxTableCellXheight
Definition: tablefind.cpp:84
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:861

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 793 of file tablefind.cpp.

793  {
796  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
800  }
803  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
807  }
810  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
814  }
817  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
821  }
822 }
bool textord_show_tables
Definition: tablefind.cpp:146
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1920
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:831
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:522
bool textord_tablefind_show_mark
Definition: tablefind.cpp:148
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid *  col_seg_grid 
)
protected

Definition at line 1180 of file tablefind.cpp.

1181  {
1182  ColSegment_IT it(segments);
1183  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1184  ColSegment* seg = it.extract();
1185  col_seg_grid->InsertBBox(true, true, seg);
1186  }
1187 }

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1821 of file tablefind.cpp.

1821  {
1822  ScrollView* table_win = NULL;
1823  if (textord_show_tables) {
1824  table_win = MakeWindow(0, 0, "Table Structure");
1827  // table_grid_.DisplayBoxes(table_win);
1828  }
1829 
1830 
1831  TableRecognizer recognizer;
1832  recognizer.Init();
1833  recognizer.set_line_grid(&leader_and_ruling_grid_);
1834  recognizer.set_text_grid(&fragmented_text_grid_);
1835  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1836  recognizer.set_min_height(1.5 * gridheight());
1837  // Loop over all of the tables and try to fit them.
1838  // Store the good tables here.
1839  ColSegment_CLIST good_tables;
1840  ColSegment_C_IT good_it(&good_tables);
1841 
1843  gsearch.StartFullSearch();
1844  ColSegment* found_table = NULL;
1845  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1846  gsearch.RemoveBBox();
1847 
1848  // The goal is to make the tables persistent in a list.
1849  // When that happens, this will move into the search loop.
1850  const TBOX& found_box = found_table->bounding_box();
1851  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1852 
1853  // Process a table. Good tables are inserted into the grid again later on
1854  // We can't change boxes in the grid while it is running a search.
1855  if (table_structure != NULL) {
1856  if (textord_show_tables) {
1857  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1858  }
1859  found_table->set_bounding_box(table_structure->bounding_box());
1860  delete table_structure;
1861  good_it.add_after_then_move(found_table);
1862  } else {
1863  delete found_table;
1864  }
1865  }
1866  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1867 
1868  // At this point, the grid is empty. We can safely insert the good tables
1869  // back into grid.
1870  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1871  table_grid_.InsertBBox(true, true, good_it.extract());
1872 }
ColSegmentGrid table_grid_
Definition: tablefind.h:423
bool textord_show_tables
Definition: tablefind.cpp:146
int gridheight() const
Definition: tablefind.cpp:388
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1920
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:522
Definition: rect.h:30
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 763 of file tablefind.cpp.

763  {
764  global_median_blob_width_ = width;
765 }

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 766 of file tablefind.cpp.

766  {
767  global_median_ledding_ = ledding;
768 }

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 760 of file tablefind.cpp.

760  {
761  global_median_xheight_ = xheight;
762 }

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 181 of file tablefind.cpp.

181  {
182  left_to_right_language_ = order;
183 }

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

138  {
139  resolution_ = resolution;
140  }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1147 of file tablefind.cpp.

1147  {
1148  ColSegment_IT it(column_blocks);
1149  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1150  ColSegment* seg = it.data();
1151  TBOX box = seg->bounding_box();
1152  int num_table_cells = 0;
1153  int num_text_cells = 0;
1154  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1155  rsearch(&clean_part_grid_);
1156  rsearch.SetUniqueMode(true);
1157  rsearch.StartRectSearch(box);
1158  ColPartition* part = NULL;
1159  while ((part = rsearch.NextRectSearch()) != NULL) {
1160  if (part->type() == PT_TABLE) {
1161  num_table_cells++;
1162  } else if (part->type() == PT_FLOWING_TEXT) {
1163  num_text_cells++;
1164  }
1165  }
1166  // If a column block has no text or table partition in it, it is not needed
1167  // for table detection.
1168  if (!num_table_cells && !num_text_cells) {
1169  delete it.extract();
1170  } else {
1171  seg->set_num_table_cells(num_table_cells);
1172  seg->set_num_text_cells(num_text_cells);
1173  // set column type based on the ratio of table to text cells
1174  seg->set_type();
1175  }
1176  }
1177 }
Definition: capi.h:94
Definition: rect.h:30
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid *  grid)
protected

Definition at line 713 of file tablefind.cpp.

713  {
714  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
715  STATS width_stats(0, kMaxBlobWidth + 1);
716  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
717  // Iterate the ColPartitions in the grid.
718  ColPartitionGridSearch gsearch(grid);
719  gsearch.SetUniqueMode(true);
720  gsearch.StartFullSearch();
721  ColPartition* part = NULL;
722  while ((part = gsearch.NextFullSearch()) != NULL) {
723  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
724  // ComputeLimits needs to get called somewhere outside of TableFinder
725  // to make sure the partitions are properly initialized.
726  // When this is called, SmoothPartitionPartners dies in an assert after
727  // table find runs. Alternative solution.
728  // part->ComputeLimits();
729  if (part->IsTextType()) {
730  // xheight_stats.add(part->median_size(), part->boxes_count());
731  // width_stats.add(part->median_width(), part->boxes_count());
732 
733  // This loop can be removed when above issues are fixed.
734  // Replace it with the 2 lines commented out above.
735  BLOBNBOX_C_IT it(part->boxes());
736  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
737  xheight_stats.add(it.data()->bounding_box().height(), 1);
738  width_stats.add(it.data()->bounding_box().width(), 1);
739  }
740 
741  ledding_stats.add(part->space_above(), 1);
742  ledding_stats.add(part->space_below(), 1);
743  }
744  }
745  // Set estimates based on median of statistics obtained
746  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
747  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
748  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
749  #ifndef GRAPHICS_DISABLED
751  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
752  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
753  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
754  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
755  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
756  }
757  #endif // GRAPHICS_DISABLED
758 }
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:763
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
const int kMaxBlobWidth
Definition: tablefind.cpp:43
bool textord_tablefind_show_stats
Definition: tablefind.cpp:150
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:522
Definition: statistc.h:33
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:766
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:760

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid *  grid,
ColPartitionSet **  all_columns 
)
static protected

Definition at line 590 of file tablefind.cpp.

591  {
592  // Iterate the ColPartitions in the grid.
593  ColPartitionGridSearch gsearch(grid);
594  gsearch.StartFullSearch();
595  ColPartition* part = NULL;
596  while ((part = gsearch.NextFullSearch()) != NULL) {
597  ColPartitionSet* columns = all_columns[gsearch.GridY()];
598  TBOX box = part->bounding_box();
599  int y = part->MidY();
600  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
601  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
602  // set distance from left column as space to the left
603  if (left_column) {
604  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
605  part->set_space_to_left(left_space);
606  }
607  // set distance from right column as space to the right
608  if (right_column) {
609  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
610  part->set_space_to_right(right_space);
611  }
612 
613  // Look for images that may be closer.
614  // NOTE: used to be part_grid_, might cause issues now
615  ColPartitionGridSearch hsearch(grid);
616  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
617  ColPartition* neighbor = NULL;
618  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
619  if (neighbor->type() == PT_PULLOUT_IMAGE ||
620  neighbor->type() == PT_FLOWING_IMAGE ||
621  neighbor->type() == PT_HEADING_IMAGE) {
622  int right = neighbor->bounding_box().right();
623  if (right < box.left()) {
624  int space = MIN(box.left() - right, part->space_to_left());
625  part->set_space_to_left(space);
626  }
627  }
628  }
629  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
630  neighbor = NULL;
631  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
632  if (neighbor->type() == PT_PULLOUT_IMAGE ||
633  neighbor->type() == PT_FLOWING_IMAGE ||
634  neighbor->type() == PT_HEADING_IMAGE) {
635  int left = neighbor->bounding_box().left();
636  if (left > box.right()) {
637  int space = MIN(left - box.right(), part->space_to_right());
638  part->set_space_to_right(space);
639  }
640  }
641  }
642 
643  ColPartition* upper_part = part->SingletonPartner(true);
644  if (upper_part) {
645  int space = MAX(0, upper_part->bounding_box().bottom() -
646  part->bounding_box().bottom());
647  part->set_space_above(space);
648  } else {
649  // TODO(nbeato): What constitutes a good value?
650  // 0 is the default value when not set, explicitly noting it needs to
651  // be something else.
652  part->set_space_above(MAX_INT32);
653  }
654 
655  ColPartition* lower_part = part->SingletonPartner(false);
656  if (lower_part) {
657  int space = MAX(0, part->bounding_box().bottom() -
658  lower_part->bounding_box().bottom());
659  part->set_space_below(space);
660  } else {
661  // TODO(nbeato): What constitutes a good value?
662  // 0 is the default value when not set, explicitly noting it needs to
663  // be something else.
664  part->set_space_below(MAX_INT32);
665  }
666  }
667 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define MAX_INT32
Definition: host.h:62
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition *  part)
protected

Definition at line 670 of file tablefind.cpp.

670  {
671  TBOX box = part->bounding_box();
672  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
673  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
674  box.set_top(top_range);
675  box.set_bottom(bottom_range);
676 
677  TBOX part_box = part->bounding_box();
678  // Start a rect search
679  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
680  rectsearch(&clean_part_grid_);
681  rectsearch.StartRectSearch(box);
682  ColPartition* neighbor;
683  int min_space_above = kMaxVerticalSpacing;
684  int min_space_below = kMaxVerticalSpacing;
685  ColPartition* above_neighbor = NULL;
686  ColPartition* below_neighbor = NULL;
687  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
688  if (neighbor == part)
689  continue;
690  TBOX neighbor_box = neighbor->bounding_box();
691  if (neighbor_box.major_x_overlap(part_box)) {
692  int gap = abs(part->median_bottom() - neighbor->median_bottom());
693  // If neighbor is below current partition
694  if (neighbor_box.top() < part_box.bottom() &&
695  gap < min_space_below) {
696  min_space_below = gap;
697  below_neighbor = neighbor;
698  } // If neighbor is above current partition
699  else if (part_box.top() < neighbor_box.bottom() &&
700  gap < min_space_above) {
701  min_space_above = gap;
702  above_neighbor = neighbor;
703  }
704  }
705  }
706  part->set_space_above(min_space_above);
707  part->set_space_below(min_space_below);
708  part->set_nearest_neighbor_above(above_neighbor);
709  part->set_nearest_neighbor_below(below_neighbor);
710 }
const ICOORD & bleft() const
Definition: tablefind.cpp:391
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
void set_top(int y)
Definition: rect.h:57
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
inT16 y() const
access_function
Definition: points.h:56
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
inT16 bottom() const
Definition: rect.h:61
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void set_bottom(int y)
Definition: rect.h:64
const ICOORD & tright() const
Definition: tablefind.cpp:394

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1112 of file tablefind.cpp.

1112  {
1113  // Iterate the ColPartitions in the grid.
1115  gsearch.StartFullSearch();
1116  ColPartition* part = NULL;
1117  while ((part = gsearch.NextFullSearch()) != NULL) {
1118  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1119  continue; // Consider only text partitions
1120  ColPartition* upper_part = part->nearest_neighbor_above();
1121  ColPartition* lower_part = part->nearest_neighbor_below();
1122  if (!upper_part || !lower_part)
1123  continue;
1124  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1125  part->set_table_type();
1126  }
1127 
1128  // Pass 2, do the opposite. If both the upper and lower neighbors
1129  // exist and are not tables, this probably shouldn't be a table.
1130  gsearch.StartFullSearch();
1131  part = NULL;
1132  while ((part = gsearch.NextFullSearch()) != NULL) {
1133  if (part->type() != PT_TABLE)
1134  continue; // Consider only table partitions
1135  ColPartition* upper_part = part->nearest_neighbor_above();
1136  ColPartition* lower_part = part->nearest_neighbor_below();
1137 
1138  // table can't be by itself
1139  if ((upper_part && upper_part->type() != PT_TABLE) &&
1140  (lower_part && lower_part->type() != PT_TABLE)) {
1141  part->clear_table_type();
1142  }
1143  }
1144 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
Definition: capi.h:94
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition *  part)
protected

Definition at line 440 of file tablefind.cpp.

440  {
441  ASSERT_HOST(part != NULL);
442  // Bye bye empty partitions!
443  if (part->boxes()->empty()) {
444  delete part;
445  return;
446  }
447 
448  // The AllowBlob function prevents this.
449  ASSERT_HOST(part->median_width() > 0);
450  const double kThreshold = part->median_width() * kSplitPartitionSize;
451 
452  ColPartition* right_part = part;
453  bool found_split = true;
454  while (found_split) {
455  found_split = false;
456  BLOBNBOX_C_IT box_it(right_part->boxes());
457  // Blobs are sorted left side first. If blobs overlap,
458  // the previous blob may have a "more right" right side.
459  // Account for this by always keeping the largest "right"
460  // so far.
461  int previous_right = MIN_INT32;
462 
463  // Look for the next split in the partition.
464  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
465  const TBOX& box = box_it.data()->bounding_box();
466  if (previous_right != MIN_INT32 &&
467  box.left() - previous_right > kThreshold) {
468  // We have a split position. Split the partition in two pieces.
469  // Insert the left piece in the grid and keep processing the right.
470  int mid_x = (box.left() + previous_right) / 2;
471  ColPartition* left_part = right_part;
472  right_part = left_part->SplitAt(mid_x);
473 
474  InsertFragmentedTextPartition(left_part);
475  found_split = true;
476  break;
477  }
478 
479  // The right side of the previous blobs.
480  previous_right = MAX(previous_right, box.right());
481  }
482  }
483  // When a split is not found, the right part is minimized
484  // as much as possible, so process it.
485  InsertFragmentedTextPartition(right_part);
486 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
const double kSplitPartitionSize
Definition: tablefind.cpp:47
#define MIN_INT32
Definition: host.h:70
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:406

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 394 of file tablefind.cpp.

394  {
395  return clean_part_grid_.tright();
396 }
const ICOORD & tright() const
Definition: bbgrid.h:76
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 413 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 421 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 419 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 407 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 409 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 405 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 415 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 425 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 403 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 423 of file tablefind.h.


The documentation for this class was generated from the following files: