tesseract  4.00.00dev
colfind.h
Go to the documentation of this file.
1 // File: colfind.h
3 // Description: Class to find columns in the grid of BLOBNBOXes.
4 // Author: Ray Smith
5 // Created: Thu Feb 21 14:04:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_TEXTORD_COLFIND_H_
21 #define TESSERACT_TEXTORD_COLFIND_H_
22 
23 #include "colpartitiongrid.h"
24 #include "colpartitionset.h"
25 #include "debugpixa.h"
26 #include "imagefind.h"
27 #include "ocrblock.h"
28 #include "tabfind.h"
29 #include "textlineprojection.h"
30 
31 class BLOCK_LIST;  // Forward declaration; this header only uses BLOCK_LIST* parameters.
32 struct Boxa;  // NOTE(review): not referenced in the visible part of this header - confirm it is needed.
33 struct Pixa;  // NOTE(review): not referenced in the visible part of this header - confirm it is needed.
34 class DENORM;  // Forward declaration; this header only uses DENORM pointers.
35 class ScrollView;  // Forward declaration; this header only uses ScrollView pointers.
36 class STATS;  // NOTE(review): not referenced in the visible part of this header - confirm it is needed.
37 class TO_BLOCK;  // Forward declaration; this header only uses TO_BLOCK* parameters.
38 
39 namespace tesseract {
40 
41 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection");  // Declaration of a bool parameter; the listing cross-references its definition to colfind.cpp.
42 
43 class ColPartitionSet;  // NOTE(review): colpartitionset.h is already included above; presumably redundant - confirm.
44 class ColPartitionSet_LIST;
45 class ColSegment_LIST;
46 class ColumnGroup_LIST;
47 class LineSpacing;
48 class StrokeWidth;  // Forward declaration; ColumnFinder only holds a StrokeWidth pointer.
49 class TempColumn_LIST;
50 class EquationDetectBase;  // Forward declaration; ColumnFinder only holds an EquationDetectBase pointer.
51 
52 // The ColumnFinder class finds columns in the grid.
53 class ColumnFinder : public TabFind {
54  public:
55  // Gridsize is an estimate of the text size in the image. A suitable value
56  // is in TO_BLOCK::line_size after find_components has been used to make
57  // the blobs.
58  // bleft and tright are the bounds of the image (rectangle) being processed.
59  // vlines is a (possibly empty) list of TabVector and vertical_x and y are
60  // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
61  // If cjk_script is true, then broken CJK characters are fixed during
62  // layout analysis to assist in detecting horizontal vs vertically written
63  // textlines.
64  ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
65  int resolution, bool cjk_script, double aligned_gap_fraction,
66  TabVector_LIST* vlines, TabVector_LIST* hlines,
67  int vertical_x, int vertical_y);
68  virtual ~ColumnFinder();
69 
70  // Accessors for testing
71  const DENORM* denorm() const {
72  return denorm_;
73  }
74  const TextlineProjection* projection() const {
75  return &projection_;
76  }
77  void set_cjk_script(bool is_cjk) {
78  cjk_script_ = is_cjk;
79  }
80 
81  // ======================================================================
82  // The main function of ColumnFinder is broken into pieces to facilitate
83  // optional insertion of orientation and script detection in an efficient
84  // way. The calling sequence IS MANDATORY however, whether or not
85  // OSD is being used:
86  // 1. Construction.
87  // 2. SetupAndFilterNoise.
88  // 3. IsVerticallyAlignedText.
89  // 4. CorrectOrientation.
90  // 5. FindBlocks.
91  // 6. Destruction. Use of a single column finder for multiple images does not
92  // make sense.
93  // Throughout these steps, the ColPartitions are owned by part_grid_, which
94  // means that that it must be kept correct. Exception: big_parts_ owns its
95  // own ColPartitions.
96  // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except
97  // for a phase in FindBlocks before TransformToBlocks, when they become
98  // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX
99  // indicates more of a betrothal for the majority of layout analysis, ie
100  // which ColPartition will take ownership when the blobs are release from
101  // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that
102  // are part of the image regions, as they are not on any TO_BLOCK list.
103  // TODO(rays) break up column finder further into smaller classes, as
104  // there is a lot more to it than column finding now.
105  // ======================================================================
106 
107  // Performs initial processing on the blobs in the input_block:
108  // Setup the part_grid, stroke_width_, nontext_map_.
109  // Obvious noise blobs are filtered out and used to mark the nontext_map_.
110  // Initial stroke-width analysis is used to get local text alignment
111  // direction, so the textline projection_ map can be setup.
112  // On return, IsVerticallyAlignedText may be called (now optionally) to
113  // determine the gross textline alignment of the page.
114  void SetupAndFilterNoise(PageSegMode pageseg_mode, Pix* photo_mask_pix,
115  TO_BLOCK* input_block);
116 
117  // Tests for vertical alignment of text (returning true if so), and generates
118  // a list of blobs (in osd_blobs) for orientation and script detection.
119  // block is the single block for the whole page or rectangle to be OCRed.
120  // Note that the vertical alignment may be due to text whose writing direction
121  // is vertical, like say Japanese, or due to text whose writing direction is
122  // horizontal but whose text appears vertically aligned because the image is
123  // not the right way up.
124  // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
125  bool IsVerticallyAlignedText(double find_vertical_text_ratio,
126  TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
127 
128  // Rotates the blobs and the TabVectors so that the gross writing direction
129  // (text lines) are horizontal and lines are read down the page.
130  // Applied rotation stored in rotation_.
131  // A second rotation is calculated for application during recognition to
132  // make the rotated blobs upright for recognition.
133  // Subsequent rotation stored in text_rotation_.
134  //
135  // Arguments:
136  // vertical_text_lines is true if the text lines are vertical.
137  // recognition_rotation [0..3] is the number of anti-clockwise 90 degree
138  // rotations from osd required for the text to be upright and readable.
139  void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines,
140  int recognition_rotation);
141 
142  // Finds blocks of text, image, rule line, table etc, returning them in the
143  // blocks and to_blocks
144  // (Each TO_BLOCK points to the basic BLOCK and adds more information.)
145  // Image blocks are generated by a combination of photo_mask_pix (which may
146  // NOT be NULL) and the rejected text found during preliminary textline
147  // finding.
148  // The input_block is the result of a call to find_components, and contains
149  // the blobs found in the image or rectangle to be OCRed. These blobs will be
150  // removed and placed in the output blocks, while unused ones will be deleted.
151  // If single_column is true, the input is treated as single column, but
152  // it is still divided into blocks of equal line spacing/text size.
153  // scaled_color is scaled down by scaled_factor from the input color image,
154  // and may be NULL if the input was not color.
155  // grey_pix is optional, but if present must match the photo_mask_pix in size,
156  // and must be a *real* grey image instead of binary_pix * 255.
157  // thresholds_pix is expected to be present iff grey_pix is present and
158  // can be an integer factor reduction of the grey_pix. It represents the
159  // thresholds that were used to create the binary_pix from the grey_pix.
160  // Small blobs that confuse the segmentation into lines are placed into
161  // diacritic_blobs, with the intention that they be put into the most
162  // appropriate word after the rest of layout analysis.
163  // Returns -1 if the user hits the 'd' key in the blocks window while running
164  // in debug mode, which requests a retry with more debug info.
165  int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor,
166  TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix,
167  Pix* grey_pix, DebugPixa* pixa_debug, BLOCK_LIST* blocks,
168  BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks);
169 
170  // Get the rotation required to deskew, and its inverse rotation.
171  void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);
172 
173  // Set the equation detection pointer.
175 
176  private:
177  // Displays the blob and block bounding boxes in a window called Blocks.
178  void DisplayBlocks(BLOCK_LIST* blocks);
179  // Displays the column edges at each grid y coordinate defined by
180  // best_columns_.
181  void DisplayColumnBounds(PartSetVector* sets);
182 
184 
185  // Sets up column_sets_ (the determined column layout at each horizontal
186  // slice). Returns false if the page is empty.
187  bool MakeColumns(bool single_column);
188  // Attempt to improve the column_candidates by expanding the columns
189  // and adding new partitions from the partition sets in src_sets.
190  // Src_sets may be equal to column_candidates, in which case it will
191  // use them as a source to improve themselves.
192  void ImproveColumnCandidates(PartSetVector* src_sets,
193  PartSetVector* column_sets);
194  // Prints debug information on the column candidates.
195  void PrintColumnCandidates(const char* title);
196  // Finds the optimal set of columns that cover the entire image with as
197  // few changes in column partition as possible.
198  // Returns true if any part of the page is multi-column.
199  bool AssignColumns(const PartSetVector& part_sets);
200  // Finds the biggest range in part_sets_ that has no assigned column, but
201  // column assignment is possible.
202  bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
203  int* start, int* end);
204  // Finds the modal compatible column_set_ index within the given range.
205  int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs,
206  int start, int end);
207  // Given that there are many column_set_id compatible columns in the range,
208  // shrinks the range to the longest contiguous run of compatibility, allowing
209  // gaps where no columns are possible, but not where competing columns are
210  // possible.
211  void ShrinkRangeToLongestRun(int** column_set_costs,
212  const int* assigned_costs,
213  const bool* any_columns_possible,
214  int column_set_id,
215  int* best_start, int* best_end);
216  // Moves start in the direction of step, up to, but not including end while
217  // the only incompatible regions are no more than kMaxIncompatibleColumnCount
218  // in size, and the compatible regions beyond are bigger.
219  void ExtendRangePastSmallGaps(int** column_set_costs,
220  const int* assigned_costs,
221  const bool* any_columns_possible,
222  int column_set_id,
223  int step, int end, int* start);
224  // Assigns the given column_set_id to the part_sets_ in the given range.
225  void AssignColumnToRange(int column_set_id, int start, int end,
226  int** column_set_costs, int* assigned_costs);
227 
228  // Computes the mean_column_gap_.
229  void ComputeMeanColumnGap(bool any_multi_column);
230 
233 
234  // Hoovers up all un-owned blobs and deletes them.
235  // The rest get released from the block so the ColPartitions can pass
236  // ownership to the output blocks.
237  void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block);
238  // Splits partitions that cross columns where they have nothing in the gap.
239  void GridSplitPartitions();
240  // Merges partitions where there is vertical overlap, within a single column,
241  // and the horizontal gap is small enough.
242  void GridMergePartitions();
243  // Inserts remaining noise blobs into the most applicable partition if any.
244  // If there is no applicable partition, then the blobs are deleted.
245  void InsertRemainingNoise(TO_BLOCK* block);
246  // Remove partitions that come from horizontal lines that look like
247  // underlines, but are not part of a table.
248  void GridRemoveUnderlinePartitions();
249  // Add horizontal line separators as partitions.
250  void GridInsertHLinePartitions();
251  // Add vertical line separators as partitions.
252  void GridInsertVLinePartitions();
253  // For every ColPartition in the grid, sets its type based on position
254  // in the columns.
255  void SetPartitionTypes();
256  // Only images remain with multiple types in a run of partners.
257  // Sets the type of all in the group to the maximum of the group.
258  void SmoothPartnerRuns();
259 
261 
262  // Helper functions for TransformToBlocks.
263  // Add the part to the temp list in the correct order.
264  void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list);
265  // Add everything from the temp list to the work_set assuming correct order.
266  void EmptyTempPartList(ColPartition_CLIST* temp_list,
267  WorkingPartSet_LIST* work_set);
268 
269  // Transform the grid of partitions to the output blocks.
270  void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
271 
272  // Reflect the blob boxes (but not the outlines) in the y-axis so that
273  // the blocks get created in the correct RTL order. Rotates the blobs
274  // in the input_block and the bblobs list.
275  // The reflection is undone in RotateAndReskewBlocks by
276  // reflecting the blocks themselves, and then recomputing the blob bounding
277  // boxes.
278  void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs);
279 
280  // Undo the deskew that was done in FindTabVectors, as recognition is done
281  // without correcting blobs or blob outlines for skew.
282  // Reskew the completed blocks to put them back to the original rotated coords
283  // that were created by CorrectOrientation.
284  // If the input_is_rtl, then reflect the blocks in the y-axis to undo the
285  // reflection that was done before FindTabVectors.
286  // Blocks that were identified as vertical text (relative to the rotated
287  // coordinates) are further rotated so the text lines are horizontal.
288  // blob polygonal outlines are rotated to match the position of the blocks
289  // that they are in, and their bounding boxes are recalculated to be accurate.
290  // Record appropriate inverse transformations and required
291  // classifier transformation in the blocks.
292  void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks);
293 
294  // Computes the rotations for the block (to make textlines horizontal) and
295  // for the blobs (for classification) and sets the appropriate members
296  // of the given block.
297  // Returns the rotation that needs to be applied to the blobs to make
298  // them sit in the rotated block.
299  FCOORD ComputeBlockAndClassifyRotation(BLOCK* block);
300 
301  // If true then the page language is cjk, so it is safe to perform
302  // FixBrokenCJK.
303  bool cjk_script_;
304  // The minimum gutter width to apply for finding columns.
305  // Modified when vertical text is detected to prevent detection of
306  // vertical text lines as columns.
307  int min_gutter_width_;
308  // The mean gap between columns over the page.
309  int mean_column_gap_;
310  // Config param saved at construction time. Modifies min_gutter_width_ with
311  // vertical text to prevent detection of vertical text as columns.
312  double tabfind_aligned_gap_fraction_;
313  // The rotation vector needed to convert original coords to deskewed.
314  FCOORD deskew_;
315  // The rotation vector needed to convert deskewed back to original coords.
316  FCOORD reskew_;
317  // The rotation vector used to rotate vertically oriented pages.
318  FCOORD rotation_;
319  // The rotation vector needed to convert the rotated back to original coords.
320  FCOORD rerotate_;
321  // The additional rotation vector needed to rotate text for recognition.
322  FCOORD text_rotation_;
323  // The column_sets_ contain the ordered candidate ColPartitionSets that
324  // define the possible divisions of the page into columns.
325  PartSetVector column_sets_;
326  // A simple array of pointers to the best assigned column division at
327  // each grid y coordinate.
328  ColPartitionSet** best_columns_;
329  // The grid used for creating initial partitions with strokewidth.
330  StrokeWidth* stroke_width_;
331  // The grid used to hold ColPartitions after the columns have been determined.
332  ColPartitionGrid part_grid_;
333  // List of ColPartitions that are no longer needed after they have been
334  // turned into regions, but are kept around because they are referenced
335  // by the part_grid_.
336  ColPartition_LIST good_parts_;
337  // List of ColPartitions that are big and might be dropcap or vertically
338  // joined.
339  ColPartition_LIST big_parts_;
340  // List of ColPartitions that have been declared noise.
341  ColPartition_LIST noise_parts_;
342  // The fake blobs that are made from the images.
343  BLOBNBOX_LIST image_bblobs_;
344  // Horizontal line separators.
345  TabVector_LIST horizontal_lines_;
346  // Image map of photo/noise areas on the page.
347  Pix* nontext_map_;
348  // Textline projection map.
349  TextlineProjection projection_;
350  // Sequence of DENORMS that indicate how to get back to the original image
351  // coordinate space. The destructor must delete all the DENORMs in the chain.
352  DENORM* denorm_;
353 
354  // Various debug windows that automatically go away on completion.
355  ScrollView* input_blobs_win_;
356 
357  // The equation region detector pointer. Note: This pointer is passed in by
358  // member function SetEquationDetect, and releasing it is NOT owned by this
359  // class.
360  EquationDetectBase* equation_detect_;
361 
362  // Allow a subsequent instance to reuse the blocks window.
363  // Not thread-safe, but multiple threads shouldn't be using windows anyway.
364  static ScrollView* blocks_win_;
365 };
366 
367 } // namespace tesseract.
368 
369 #endif // TESSERACT_TEXTORD_COLFIND_H_
Definition: points.h:189
const DENORM * denorm() const
Definition: colfind.h:71
bool textord_tabfind_find_tables
Definition: colfind.cpp:67
void SetEquationDetect(EquationDetectBase *detect)
Definition: colfind.cpp:507
const ICOORD & bleft() const
Definition: bbgrid.h:73
void set_cjk_script(bool is_cjk)
Definition: colfind.h:77
int gridsize() const
Definition: bbgrid.h:64
bool IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
Definition: colfind.cpp:184
void GetDeskewVectors(FCOORD *deskew, FCOORD *reskew)
Definition: colfind.cpp:501
void SetupAndFilterNoise(PageSegMode pageseg_mode, Pix *photo_mask_pix, TO_BLOCK *input_block)
Definition: colfind.cpp:146
const ICOORD & tright() const
Definition: bbgrid.h:76
const TextlineProjection * projection() const
Definition: colfind.h:74
virtual ~ColumnFinder()
Definition: colfind.cpp:98
Definition: statistc.h:33
ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution, bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines, TabVector_LIST *hlines, int vertical_x, int vertical_y)
Definition: colfind.cpp:77
void CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines, int recognition_rotation)
Definition: colfind.cpp:202
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
Definition: ocrblock.h:30
int FindBlocks(PageSegMode pageseg_mode, Pix *scaled_color, int scaled_factor, TO_BLOCK *block, Pix *photo_mask_pix, Pix *thresholds_pix, Pix *grey_pix, DebugPixa *pixa_debug, BLOCK_LIST *blocks, BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks)
Definition: colfind.cpp:290
integer coordinate
Definition: points.h:30