tesseract  4.00.00dev
blobbox.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: blobbox.h (Formerly blobnbox.h)
3  * Description: Code for the textord blob class.
4  * Author: Ray Smith
5  * Created: Thu Jul 30 09:08:51 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef BLOBBOX_H
21 #define BLOBBOX_H
22 
23 #include "clst.h"
24 #include "elst2.h"
25 #include "werd.h"
26 #include "ocrblock.h"
27 #include "statistc.h"
28 
30 {
31  PITCH_DUNNO, // insufficient data
32  PITCH_DEF_FIXED, // definitely fixed
33  PITCH_MAYBE_FIXED, // could be
38 };
39 
40 // The possible tab-stop types of each side of a BLOBNBOX.
41 // The ordering is important, as it is used for deleting dead-ends in the
42 // search. ALIGNED, CONFIRMED and VLINE should remain greater than the
43 // non-aligned, unset, or deleted members.
44 enum TabType {
45  TT_NONE, // Not a tab.
46  TT_DELETED, // Not a tab after detailed analysis.
47  TT_MAYBE_RAGGED, // Initial designation of a tab-stop candidate.
48  TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
49  TT_CONFIRMED, // Aligned with neighbours.
50  TT_VLINE // Detected as a vertical line.
51 };
52 
53 // The possible region types of a BLOBNBOX.
54 // Note: keep all the text types > BRT_UNKNOWN and all the image types less.
55 // Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
56 // *Type static functions below.
58  BRT_NOISE, // Neither text nor image.
59  BRT_HLINE, // Horizontal separator line.
60  BRT_VLINE, // Vertical separator line.
61  BRT_RECTIMAGE, // Rectangular image.
62  BRT_POLYIMAGE, // Non-rectangular image.
63  BRT_UNKNOWN, // Not determined yet.
64  BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
65  BRT_TEXT, // Convincing text.
66 
67  BRT_COUNT // Number of possibilities.
68 };
69 
70 // enum for elements of arrays that refer to neighbours.
71 // NOTE: keep in this order, so ^2 can be used to flip direction.
78 };
79 
80 // enum for special type of text characters, such as math symbol or italic.
82  BSTT_NONE, // No special.
83  BSTT_ITALIC, // Italic style.
84  BSTT_DIGIT, // Digit symbols.
85  BSTT_MATH, // Mathematical symbols (not including digits).
86  BSTT_UNCLEAR, // Characters with low recognition rate.
87  BSTT_SKIP, // Characters that we skip labeling (usually too small).
89 };
90 
92  return static_cast<BlobNeighbourDir>(dir ^ 2);
93 }
94 
95 // BlobTextFlowType indicates the quality of neighbouring information
96 // related to a chain of connected components, either horizontally or
97 // vertically. Also used by ColPartition for the collection of blobs
98 // within, which should all have the same value in most cases.
100  BTFT_NONE, // No text flow set yet.
101  BTFT_NONTEXT, // Flow too poor to be likely text.
102  BTFT_NEIGHBOURS, // Neighbours support flow in this direction.
103  BTFT_CHAIN, // There is a weak chain of text in this direction.
104  BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction.
105  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
106  BTFT_LEADER, // Leader dots/dashes etc.
108 };
109 
110 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
111 // ordering of the enum, except that LEADER is weak and dominates nothing.
112 // The function is anti-symmetric: (t1 > t2) === !(t2 > t1). That cannot hold
113 // when t1 == t2, so the result for equal types should not be relied upon.
115  // LEADER always loses.
116  if (type1 == BTFT_LEADER) return false;
117  if (type2 == BTFT_LEADER) return true;
118  // With those out of the way, the ordering of the enum determines the result.
119  return type1 >= type2;
120 }
121 
122 namespace tesseract {
123 class ColPartition;
124 }
125 
126 class BLOBNBOX;
128 class BLOBNBOX:public ELIST_LINK
129 {
130  public:
133  }
134  explicit BLOBNBOX(C_BLOB *srcblob) {
135  box = srcblob->bounding_box();
137  cblob_ptr = srcblob;
138  area = static_cast<int>(srcblob->area());
139  }
141  if (owns_cblob_) delete cblob_ptr;
142  }
143  static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
144  C_BLOB* blob = new C_BLOB(outline);
145  return new BLOBNBOX(blob);
146  }
147 
148  // Rotates the box and the underlying blob.
149  void rotate(FCOORD rotation);
150 
151  // Methods that act on the box without touching the underlying blob.
152  // Reflect the box in the y-axis, leaving the underlying blob untouched.
153  void reflect_box_in_y_axis();
154  // Rotates the box by the angle given by rotation.
155  // If the blob is a diacritic, then only small rotations for skew
156  // correction can be applied.
157  void rotate_box(FCOORD rotation);
158  // Moves just the box by the given vector.
160  if (IsDiacritic()) {
161  box.move(v);
162  base_char_top_ += v.y();
163  base_char_bottom_ += v.y();
164  } else {
165  box.move(v);
166  set_diacritic_box(box);
167  }
168  }
169  void merge(BLOBNBOX *nextblob);
170  void really_merge(BLOBNBOX* other);
171  void chop( // fake chop blob
172  BLOBNBOX_IT *start_it, // location of this
173  BLOBNBOX_IT *blob_it, // iterator
174  FCOORD rotation, // for landscape
175  float xheight); // line height
176 
177  void NeighbourGaps(int gaps[BND_COUNT]) const;
178  void MinMaxGapsClipped(int* h_min, int* h_max,
179  int* v_min, int* v_max) const;
180  void CleanNeighbours();
181  // Returns positive if there is at least one side neighbour that has a
182  // similar stroke width and is not on the other side of a rule line.
183  int GoodTextBlob() const;
184  // Returns the number of side neighbours that are of type BRT_NOISE.
185  int NoisyNeighbours() const;
186 
187  // Returns true if the blob is noise and has no owner.
188  bool DeletableNoise() const {
189  return owner() == NULL && region_type() == BRT_NOISE;
190  }
191 
192  // Returns true, and sets vert_possible/horz_possible if the blob has some
193  // feature that makes it individually appear to flow one way.
194  // eg if it has a high aspect ratio, yet has a complex shape, such as a
195  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
196  bool DefiniteIndividualFlow();
197 
198  // Returns true if there is no tabstop violation in merging this and other.
199  bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
200 
201  // Returns true if other has a similar stroke width to this.
202  bool MatchingStrokeWidth(const BLOBNBOX& other,
203  double fractional_tolerance,
204  double constant_tolerance) const;
205 
206  // Returns a bounding box of the outline contained within the
207  // given horizontal range.
208  TBOX BoundsWithinLimits(int left, int right);
209 
210  // Estimates and stores the baseline position based on the shape of the
211  // outline.
213 
214  // Simple accessors.
215  const TBOX& bounding_box() const {
216  return box;
217  }
218  // Set the bounding box. Use with caution.
219  // Normally use compute_bounding_box instead.
220  void set_bounding_box(const TBOX& new_box) {
221  box = new_box;
222  base_char_top_ = box.top();
223  base_char_bottom_ = box.bottom();
224  }
226  box = cblob_ptr->bounding_box();
227  base_char_top_ = box.top();
228  base_char_bottom_ = box.bottom();
229  baseline_y_ = box.bottom();
230  }
231  const TBOX& reduced_box() const {
232  return red_box;
233  }
234  void set_reduced_box(TBOX new_box) {
235  red_box = new_box;
236  reduced = TRUE;
237  }
239  return area;
240  }
241  bool joined_to_prev() const {
242  return joined != 0;
243  }
244  bool red_box_set() const {
245  return reduced != 0;
246  }
247  int repeated_set() const {
248  return repeated_set_;
249  }
250  void set_repeated_set(int set_id) {
251  repeated_set_ = set_id;
252  }
253  C_BLOB *cblob() const {
254  return cblob_ptr;
255  }
257  return left_tab_type_;
258  }
259  void set_left_tab_type(TabType new_type) {
260  left_tab_type_ = new_type;
261  }
263  return right_tab_type_;
264  }
265  void set_right_tab_type(TabType new_type) {
266  right_tab_type_ = new_type;
267  }
269  return region_type_;
270  }
272  region_type_ = new_type;
273  }
275  return spt_type_;
276  }
278  spt_type_ = new_type;
279  }
281  return flow_;
282  }
284  flow_ = value;
285  }
286  bool vert_possible() const {
287  return vert_possible_;
288  }
289  void set_vert_possible(bool value) {
290  vert_possible_ = value;
291  }
292  bool horz_possible() const {
293  return horz_possible_;
294  }
295  void set_horz_possible(bool value) {
296  horz_possible_ = value;
297  }
298  int left_rule() const {
299  return left_rule_;
300  }
301  void set_left_rule(int new_left) {
302  left_rule_ = new_left;
303  }
304  int right_rule() const {
305  return right_rule_;
306  }
307  void set_right_rule(int new_right) {
308  right_rule_ = new_right;
309  }
310  int left_crossing_rule() const {
311  return left_crossing_rule_;
312  }
313  void set_left_crossing_rule(int new_left) {
314  left_crossing_rule_ = new_left;
315  }
316  int right_crossing_rule() const {
317  return right_crossing_rule_;
318  }
319  void set_right_crossing_rule(int new_right) {
320  right_crossing_rule_ = new_right;
321  }
322  float horz_stroke_width() const {
323  return horz_stroke_width_;
324  }
325  void set_horz_stroke_width(float width) {
326  horz_stroke_width_ = width;
327  }
328  float vert_stroke_width() const {
329  return vert_stroke_width_;
330  }
331  void set_vert_stroke_width(float width) {
332  vert_stroke_width_ = width;
333  }
334  float area_stroke_width() const {
335  return area_stroke_width_;
336  }
338  return owner_;
339  }
341  owner_ = new_owner;
342  }
343  bool leader_on_left() const {
344  return leader_on_left_;
345  }
346  void set_leader_on_left(bool flag) {
347  leader_on_left_ = flag;
348  }
349  bool leader_on_right() const {
350  return leader_on_right_;
351  }
352  void set_leader_on_right(bool flag) {
353  leader_on_right_ = flag;
354  }
356  return neighbours_[n];
357  }
359  return good_stroke_neighbours_[n];
360  }
362  neighbours_[n] = neighbour;
363  good_stroke_neighbours_[n] = good;
364  }
365  bool IsDiacritic() const {
366  return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
367  }
368  int base_char_top() const {
369  return base_char_top_;
370  }
371  int base_char_bottom() const {
372  return base_char_bottom_;
373  }
374  int baseline_position() const {
375  return baseline_y_;
376  }
377  int line_crossings() const {
378  return line_crossings_;
379  }
380  void set_line_crossings(int value) {
381  line_crossings_ = value;
382  }
383  void set_diacritic_box(const TBOX& diacritic_box) {
384  base_char_top_ = diacritic_box.top();
385  base_char_bottom_ = diacritic_box.bottom();
386  }
388  return base_char_blob_;
389  }
391  base_char_blob_ = blob;
392  }
393  void set_owns_cblob(bool value) { owns_cblob_ = value; }
394 
395  bool UniquelyVertical() const {
396  return vert_possible_ && !horz_possible_;
397  }
398  bool UniquelyHorizontal() const {
399  return horz_possible_ && !vert_possible_;
400  }
401 
402  // Returns true if the region type is text.
403  static bool IsTextType(BlobRegionType type) {
404  return type == BRT_TEXT || type == BRT_VERT_TEXT;
405  }
406  // Returns true if the region type is image.
407  static bool IsImageType(BlobRegionType type) {
408  return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
409  }
410  // Returns true if the region type is line.
411  static bool IsLineType(BlobRegionType type) {
412  return type == BRT_HLINE || type == BRT_VLINE;
413  }
414  // Returns true if the region type cannot be merged.
415  static bool UnMergeableType(BlobRegionType type) {
416  return IsLineType(type) || IsImageType(type);
417  }
418  // Helper to call CleanNeighbours on all blobs on the list.
419  static void CleanNeighbours(BLOBNBOX_LIST* blobs);
420  // Helper to delete all the deletable blobs on the list.
421  static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
422  // Helper to compute edge offsets for all the blobs on the list.
423  // See coutln.h for an explanation of edge offsets.
424  static void ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
425  BLOBNBOX_LIST* blobs);
426 
427 #ifndef GRAPHICS_DISABLED
428  // Helper to draw all the blobs on the list in the given body_colour,
429  // with child outlines in the child_colour.
430  static void PlotBlobs(BLOBNBOX_LIST* list,
431  ScrollView::Color body_colour,
432  ScrollView::Color child_colour,
433  ScrollView* win);
434  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
435  // given list in the given body_colour, with child outlines in the
436  // child_colour.
437  static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
438  ScrollView::Color body_colour,
439  ScrollView::Color child_colour,
440  ScrollView* win);
441 
443  BlobTextFlowType flow_type);
444 
445  // Keep in sync with BlobRegionType.
446  ScrollView::Color BoxColor() const;
447 
448  void plot(ScrollView* window, // window to draw in
449  ScrollView::Color blob_colour, // for outer bits
450  ScrollView::Color child_colour); // for holes
451 #endif
452 
453  // Initializes the bulk of the members to default values for use at
454  // construction time.
456  cblob_ptr = NULL;
457  owns_cblob_ = false;
458  area = 0;
459  area_stroke_width_ = 0.0f;
460  horz_stroke_width_ = 0.0f;
461  vert_stroke_width_ = 0.0f;
462  ReInit();
463  }
464  // Initializes members set by StrokeWidth and beyond, without discarding
465  // stored area and strokewidth values, which are expensive to calculate.
466  void ReInit() {
467  joined = false;
468  reduced = false;
469  repeated_set_ = 0;
470  left_tab_type_ = TT_NONE;
471  right_tab_type_ = TT_NONE;
472  region_type_ = BRT_UNKNOWN;
473  flow_ = BTFT_NONE;
474  spt_type_ = BSTT_SKIP;
475  left_rule_ = 0;
476  right_rule_ = 0;
477  left_crossing_rule_ = 0;
478  right_crossing_rule_ = 0;
479  if (area_stroke_width_ == 0.0f && area > 0 && cblob() != NULL)
480  area_stroke_width_ = 2.0f * area / cblob()->perimeter();
481  owner_ = NULL;
482  base_char_top_ = box.top();
483  base_char_bottom_ = box.bottom();
484  baseline_y_ = box.bottom();
485  line_crossings_ = 0;
486  base_char_blob_ = NULL;
487  horz_possible_ = false;
488  vert_possible_ = false;
489  leader_on_left_ = false;
490  leader_on_right_ = false;
491  ClearNeighbours();
492  }
493 
495  for (int n = 0; n < BND_COUNT; ++n) {
496  neighbours_[n] = NULL;
497  good_stroke_neighbours_[n] = false;
498  }
499  }
500 
501  private:
502  C_BLOB *cblob_ptr; // edgestep blob
503  TBOX box; // bounding box
504  TBOX red_box; // reduced bounding box
505  int area:30; // enclosed area
506  int joined:1; // joined to prev
507  int reduced:1; // reduced box set
508  int repeated_set_; // id of the set of repeated blobs
509  TabType left_tab_type_; // Indicates tab-stop assessment
510  TabType right_tab_type_; // Indicates tab-stop assessment
511  BlobRegionType region_type_; // Type of region this blob belongs to
512  BlobTextFlowType flow_; // Quality of text flow.
513  inT16 left_rule_; // x-coord of nearest but not crossing rule line
514  inT16 right_rule_; // x-coord of nearest but not crossing rule line
515  inT16 left_crossing_rule_; // x-coord of nearest or crossing rule line
516  inT16 right_crossing_rule_; // x-coord of nearest or crossing rule line
517  inT16 base_char_top_; // y-coord of top/bottom of diacritic base,
518  inT16 base_char_bottom_; // if it exists else top/bottom of this blob.
519  inT16 baseline_y_; // Estimate of baseline position.
520  int line_crossings_; // Number of line intersections touched.
521  BLOBNBOX* base_char_blob_; // The blob that was the base char.
522  float horz_stroke_width_; // Median horizontal stroke width
523  float vert_stroke_width_; // Median vertical stroke width
524  float area_stroke_width_; // Stroke width from area/perimeter ratio.
525  tesseract::ColPartition* owner_; // Who will delete me when I am not needed
526  BlobSpecialTextType spt_type_; // Special text type.
527  BLOBNBOX* neighbours_[BND_COUNT];
528  bool good_stroke_neighbours_[BND_COUNT];
529  bool horz_possible_; // Could be part of horizontal flow.
530  bool vert_possible_; // Could be part of vertical flow.
531  bool leader_on_left_; // There is a leader to the left.
532  bool leader_on_right_; // There is a leader to the right.
533  // Iff true, then the destructor should delete the cblob_ptr.
534  // TODO(rays) migrate all uses to correctly setting this flag instead of
535  // deleting the C_BLOB before deleting the BLOBNBOX.
536  bool owns_cblob_;
537 };
538 
539 class TO_ROW: public ELIST2_LINK
540 {
541  public:
542  static const int kErrorWeight = 3;
543 
544  TO_ROW() {
545  clear();
546  } //empty
547  TO_ROW( //constructor
548  BLOBNBOX *blob, //from first blob
549  float top, //of row //target height
550  float bottom,
551  float row_size);
552 
553  void print() const;
554  float max_y() const { //access function
555  return y_max;
556  }
557  float min_y() const {
558  return y_min;
559  }
560  float mean_y() const {
561  return (y_min + y_max) / 2.0f;
562  }
563  float initial_min_y() const {
564  return initial_y_min;
565  }
566  float line_m() const { //access to line fit
567  return m;
568  }
569  float line_c() const {
570  return c;
571  }
572  float line_error() const {
573  return error;
574  }
575  float parallel_c() const {
576  return para_c;
577  }
578  float parallel_error() const {
579  return para_error;
580  }
581  float believability() const { //baseline goodness
582  return credibility;
583  }
584  float intercept() const { //real parallel_c
585  return y_origin;
586  }
587  void add_blob( //put in row
588  BLOBNBOX *blob, //blob to add
589  float top, //of row //target height
590  float bottom,
591  float row_size);
592  void insert_blob( //put in row in order
593  BLOBNBOX *blob);
594 
595  BLOBNBOX_LIST *blob_list() { //get list
596  return &blobs;
597  }
598 
599  void set_line( //set line spec
600  float new_m, //line to set
601  float new_c,
602  float new_error) {
603  m = new_m;
604  c = new_c;
605  error = new_error;
606  }
607  void set_parallel_line( //set fixed gradient line
608  float gradient, //page gradient
609  float new_c,
610  float new_error) {
611  para_c = new_c;
612  para_error = new_error;
613  credibility =
614  (float) (blobs.length () - kErrorWeight * new_error);
615  y_origin = (float) (new_c / sqrt (1 + gradient * gradient));
616  //real intercept
617  }
618  void set_limits( //set min,max
619  float new_min, //bottom and
620  float new_max) { //top of row
621  y_min = new_min;
622  y_max = new_max;
623  }
624  void compute_vertical_projection();
625  //get projection
626 
627  bool rep_chars_marked() const {
628  return num_repeated_sets_ != -1;
629  }
631  num_repeated_sets_ = -1;
632  }
633  int num_repeated_sets() const {
634  return num_repeated_sets_;
635  }
636  void set_num_repeated_sets(int num_sets) {
637  num_repeated_sets_ = num_sets;
638  }
639 
640  // true when dead
642  BOOL8 all_caps; // had no ascenders
643  BOOL8 used_dm_model; // in guessing pitch
644  inT16 projection_left; // start of projection
645  inT16 projection_right; // end of projection
646  PITCH_TYPE pitch_decision; // how strong is decision
647  float fixed_pitch; // pitch or 0
648  float fp_space; // sp if fixed pitch
649  float fp_nonsp; // nonsp if fixed pitch
650  float pr_space; // sp if prop
651  float pr_nonsp; // non sp if prop
652  float spacing; // to "next" row
653  float xheight; // of line
654  int xheight_evidence; // number of blobs of height xheight
655  float ascrise; // ascenders
656  float descdrop; // descenders
657  float body_size; // of CJK characters. Assumed to be
658  // xheight+ascrise for non-CJK text.
659  inT32 min_space; // min size for real space
660  inT32 max_nonspace; // max size of non-space
661  inT32 space_threshold; // space vs nonspace
662  float kern_size; // average non-space
663  float space_size; // average space
664  WERD_LIST rep_words; // repeated chars
665  ICOORDELT_LIST char_cells; // fixed pitch cells
666  QSPLINE baseline; // curved baseline
667  STATS projection; // vertical projection
668 
669  private:
670  void clear(); // clear all values to reasonable defaults
671 
672  BLOBNBOX_LIST blobs; //blobs in row
673  float y_min; //coords
674  float y_max;
675  float initial_y_min;
676  float m, c; //line spec
677  float error; //line error
678  float para_c; //constrained fit
679  float para_error;
680  float y_origin; //rotated para_c;
681  float credibility; //baseline believability
682  int num_repeated_sets_; // number of sets of repeated blobs
683  // set to -1 if we have not searched
684  // for repeated blobs in this row yet
685 };
686 
688 class TO_BLOCK:public ELIST_LINK
689 {
690  public:
691  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
692  clear();
693  } //empty
694  TO_BLOCK( //constructor
695  BLOCK *src_block); //real block
696  ~TO_BLOCK();
697 
698  void clear(); // clear all scalar members.
699 
700  TO_ROW_LIST *get_rows() { //access function
701  return &row_list;
702  }
703 
704  // Rotate all the blobnbox lists and the underlying block. Then update the
705  // median size statistic from the blobs list.
706  void rotate(const FCOORD& rotation) {
707  BLOBNBOX_LIST* blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
708  &small_blobs, &large_blobs, NULL};
709  for (BLOBNBOX_LIST** list = blobnbox_list; *list != NULL; ++list) {
710  BLOBNBOX_IT it(*list);
711  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
712  it.data()->rotate(rotation);
713  }
714  }
715  // Rotate the block
716  ASSERT_HOST(block->poly_block() != NULL);
717  block->rotate(rotation);
718  // Update the median size statistic from the blobs list.
719  STATS widths(0, block->bounding_box().width());
720  STATS heights(0, block->bounding_box().height());
721  BLOBNBOX_IT blob_it(&blobs);
722  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
723  widths.add(blob_it.data()->bounding_box().width(), 1);
724  heights.add(blob_it.data()->bounding_box().height(), 1);
725  }
726  block->set_median_size(static_cast<int>(widths.median() + 0.5),
727  static_cast<int>(heights.median() + 0.5));
728  }
729 
730  void print_rows() { //debug info
731  TO_ROW_IT row_it = &row_list;
732  TO_ROW *row;
733 
734  for (row_it.mark_cycle_pt(); !row_it.cycled_list();
735  row_it.forward()) {
736  row = row_it.data();
737  tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
738  row->min_y(), row->max_y(), row->parallel_c(),
739  row->blob_list()->length());
740  }
741  }
742 
743  // Reorganizes the blob lists with a different definition of small, medium
744  // and large, compared to the original definition.
745  // Height is still the primary filter key, but medium width blobs of small
746  // height become medium, and very wide blobs of small height stay small.
747  void ReSetAndReFilterBlobs();
748 
749  // Deletes noise blobs from all lists where not owned by a ColPartition.
750  void DeleteUnownedNoise();
751 
752  // Computes and stores the edge offsets on each blob for use in feature
753  // extraction, using greyscale if the supplied grey and thresholds pixes
754  // are 8-bit or otherwise (if NULL or not 8 bit) the original binary
755  // edge step outlines.
756  // Thresholds must either be the same size as grey or an integer down-scale
757  // of grey.
758  // See coutln.h for an explanation of edge offsets.
759  void ComputeEdgeOffsets(Pix* thresholds, Pix* grey);
760 
761 #ifndef GRAPHICS_DISABLED
762  // Draw the noise blobs from all lists in red.
763  void plot_noise_blobs(ScrollView* to_win);
764  // Draw the blobs on the various lists in the block in different colors.
765  void plot_graded_blobs(ScrollView* to_win);
766 #endif
767 
768  BLOBNBOX_LIST blobs; //medium size
769  BLOBNBOX_LIST underlines; //underline blobs
770  BLOBNBOX_LIST noise_blobs; //very small
771  BLOBNBOX_LIST small_blobs; //fairly small
772  BLOBNBOX_LIST large_blobs; //big blobs
773  BLOCK *block; //real block
774  PITCH_TYPE pitch_decision; //how strong is decision
775  float line_spacing; //estimate
776  // line_size is a lower-bound estimate of the font size in pixels of
777  // the text in the block (with ascenders and descenders), being a small
778  // (1.25) multiple of the median height of filtered blobs.
779  // In most cases the font size will be bigger, but it will be closer
780  // if the text is allcaps, or in a no-x-height script.
781  float line_size; //estimate
782  float max_blob_size; //line assignment limit
783  float baseline_offset; //phase shift
784  float xheight; //median blob size
785  float fixed_pitch; //pitch or 0
786  float kern_size; //average non-space
787  float space_size; //average space
788  inT32 min_space; //min definite space
789  inT32 max_nonspace; //max definite
790  float fp_space; //sp if fixed pitch
791  float fp_nonsp; //nonsp if fixed pitch
792  float pr_space; //sp if prop
793  float pr_nonsp; //non sp if prop
794  TO_ROW *key_row; //starting row
795 
796  private:
797  TO_ROW_LIST row_list; //temporary rows
798 };
799 
802 "Weighting for error in believability");
803 void find_cblob_limits( //get y limits
804  C_BLOB *blob, //blob to search
805  float leftx, //x limits
806  float rightx,
807  FCOORD rotation, //for landscape
808  float &ymin, //output y limits
809  float &ymax);
810 void find_cblob_vlimits( //get y limits
811  C_BLOB *blob, //blob to search
812  float leftx, //x limits
813  float rightx,
814  float &ymin, //output y limits
815  float &ymax);
816 void find_cblob_hlimits( //get x limits
817  C_BLOB *blob, //blob to search
818  float bottomy, //y limits
819  float topy,
820  float &xmin, //output x limits
821  float &xymax);
822 C_BLOB *crotate_cblob( //rotate it
823  C_BLOB *blob, //blob to search
824  FCOORD rotation //for landscape
825  );
826 TBOX box_next( //get bounding box
827  BLOBNBOX_IT *it //iterator to blobs
828  );
829 TBOX box_next_pre_chopped( //get bounding box
830  BLOBNBOX_IT *it //iterator to blobs
831  );
832 void vertical_cblob_projection( //project outlines
833  C_BLOB *blob, //blob to project
834  STATS *stats //output
835  );
836 void vertical_coutline_projection( //project outlines
837  C_OUTLINE *outline, //outline to project
838  STATS *stats //output
839  );
840 #ifndef GRAPHICS_DISABLED
841 void plot_blob_list(ScrollView* win, // window to draw in
842  BLOBNBOX_LIST *list, // blob list
843  ScrollView::Color body_colour, // colour to draw
844  ScrollView::Color child_colour); // colour of child
845 #endif // GRAPHICS_DISABLED
846 #endif
void find_cblob_limits(C_BLOB *blob, float leftx, float rightx, FCOORD rotation, float &ymin, float &ymax)
Definition: blobbox.cpp:494
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:176
void ConstructionInit()
Definition: blobbox.h:455
float kern_size
Definition: blobbox.h:786
inT16 projection_left
Definition: blobbox.h:644
BLOBNBOX_LIST underlines
Definition: blobbox.h:769
int num_repeated_sets() const
Definition: blobbox.h:633
void set_leader_on_right(bool flag)
Definition: blobbox.h:352
float pr_nonsp
Definition: blobbox.h:651
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:660
#define TRUE
Definition: capi.h:45
Definition: points.h:189
float pr_nonsp
Definition: blobbox.h:793
int xheight_evidence
Definition: blobbox.h:654
int32_t inT32
Definition: host.h:38
inT32 max_nonspace
Definition: blobbox.h:660
void print_rows()
Definition: blobbox.h:730
float initial_min_y() const
Definition: blobbox.h:563
bool IsDiacritic() const
Definition: blobbox.h:365
int base_char_bottom() const
Definition: blobbox.h:371
static BLOBNBOX * RealBlob(C_OUTLINE *outline)
Definition: blobbox.h:143
PITCH_TYPE pitch_decision
Definition: blobbox.h:646
float kern_size
Definition: blobbox.h:662
void set_right_tab_type(TabType new_type)
Definition: blobbox.h:265
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:358
float xheight
Definition: blobbox.h:653
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:383
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
void find_cblob_vlimits(C_BLOB *blob, float leftx, float rightx, float &ymin, float &ymax)
Definition: blobbox.cpp:534
void vertical_cblob_projection(C_BLOB *blob, STATS *stats)
Definition: blobbox.cpp:863
float horz_stroke_width() const
Definition: blobbox.h:322
int right_crossing_rule() const
Definition: blobbox.h:316
float descdrop
Definition: blobbox.h:656
void set_horz_stroke_width(float width)
Definition: blobbox.h:325
float space_size
Definition: blobbox.h:663
TO_BLOCK()
Definition: blobbox.h:691
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:476
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:66
bool red_box_set() const
Definition: blobbox.h:244
const TBOX & reduced_box() const
Definition: blobbox.h:231
inT32 space_threshold
Definition: blobbox.h:661
int left_rule() const
Definition: blobbox.h:298
float area_stroke_width() const
Definition: blobbox.h:334
float fp_space
Definition: blobbox.h:648
inT32 area()
Definition: stepblob.cpp:270
~BLOBNBOX()
Definition: blobbox.h:140
#define tprintf(...)
Definition: tprintf.h:31
void set_bounding_box(const TBOX &new_box)
Definition: blobbox.h:220
float intercept() const
Definition: blobbox.h:584
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:361
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:287
float fixed_pitch
Definition: blobbox.h:647
bool UniquelyVertical() const
Definition: blobbox.h:395
int GoodTextBlob() const
Definition: blobbox.cpp:221
void set_horz_possible(bool value)
Definition: blobbox.h:295
bool joined_to_prev() const
Definition: blobbox.h:241
void set_left_rule(int new_left)
Definition: blobbox.h:301
C_BLOB * cblob() const
Definition: blobbox.h:253
tesseract::ColPartition * owner() const
Definition: blobbox.h:337
void rotate(FCOORD rotation)
Definition: blobbox.cpp:50
void ReInit()
Definition: blobbox.h:466
inT32 min_space
Definition: blobbox.h:788
double textord_error_weight
void set_right_crossing_rule(int new_right)
Definition: blobbox.h:319
float min_y() const
Definition: blobbox.h:557
float body_size
Definition: blobbox.h:657
bool DeletableNoise() const
Definition: blobbox.h:188
void set_limits(float new_min, float new_max)
Definition: blobbox.h:618
float line_m() const
Definition: blobbox.h:566
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
float line_c() const
Definition: blobbox.h:569
inT32 enclosed_area() const
Definition: blobbox.h:238
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xymax)
Definition: blobbox.cpp:571
BlobRegionType region_type() const
Definition: blobbox.h:268
int16_t inT16
Definition: host.h:36
static void ComputeEdgeOffsets(Pix *thresholds, Pix *grey, BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:380
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:631
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:271
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool horz_possible() const
Definition: blobbox.h:292
float baseline_offset
Definition: blobbox.h:783
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:439
float pr_space
Definition: blobbox.h:650
int repeated_set() const
Definition: blobbox.h:247
inT32 min_space
Definition: blobbox.h:659
TabType
Definition: blobbox.h:44
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:340
TabType left_tab_type() const
Definition: blobbox.h:256
float line_size
Definition: blobbox.h:781
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:300
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:607
QSPLINE baseline
Definition: blobbox.h:666
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:195
float fixed_pitch
Definition: blobbox.h:785
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:771
int line_crossings() const
Definition: blobbox.h:377
WERD_LIST rep_words
Definition: blobbox.h:664
float fp_nonsp
Definition: blobbox.h:649
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:328
void rotate(const FCOORD &rotation)
Definition: blobbox.h:706
BLOBNBOX()
Definition: blobbox.h:131
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:114
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:98
void set_vert_possible(bool value)
Definition: blobbox.h:289
unsigned char BOOL8
Definition: host.h:44
float parallel_error() const
Definition: blobbox.h:578
TO_ROW()
Definition: blobbox.h:544
bool leader_on_left() const
Definition: blobbox.h:343
TabType right_tab_type() const
Definition: blobbox.h:262
void ClearNeighbours()
Definition: blobbox.h:494
#define double_VAR_H(name, val, comment)
Definition: params.h:273
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:606
inT16 y() const
access_function
Definition: points.h:56
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
ICOORDELT_LIST char_cells
Definition: blobbox.h:665
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:950
float fp_nonsp
Definition: blobbox.h:791
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:234
void reflect_box_in_y_axis()
Definition: blobbox.cpp:57
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:599
PITCH_TYPE pitch_decision
Definition: blobbox.h:774
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:387
static bool IsLineType(BlobRegionType type)
Definition: blobbox.h:411
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:91
BlobSpecialTextType
Definition: blobbox.h:81
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:355
bool leader_on_right() const
Definition: blobbox.h:349
float vert_stroke_width() const
Definition: blobbox.h:328
int left_crossing_rule() const
Definition: blobbox.h:310
float believability() const
Definition: blobbox.h:581
float pr_space
Definition: blobbox.h:792
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:87
inT16 top() const
Definition: rect.h:54
float max_y() const
Definition: blobbox.h:554
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
int right_rule() const
Definition: blobbox.h:304
float fp_space
Definition: blobbox.h:790
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:367
Definition: rect.h:30
bool rep_chars_marked() const
Definition: blobbox.h:627
void set_leader_on_left(bool flag)
Definition: blobbox.h:346
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:115
float xheight
Definition: blobbox.h:784
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:415
void set_line_crossings(int value)
Definition: blobbox.h:380
void vertical_coutline_projection(C_OUTLINE *outline, STATS *stats)
Definition: blobbox.cpp:883
bool vert_possible() const
Definition: blobbox.h:286
BOOL8 merged
Definition: blobbox.h:641
TBOX bounding_box() const
Definition: stepblob.cpp:250
void CleanNeighbours()
Definition: blobbox.cpp:209
bool UniquelyHorizontal() const
Definition: blobbox.h:398
void set_repeated_set(int set_id)
Definition: blobbox.h:250
void clear_rep_chars_marked()
Definition: blobbox.h:630
TO_ROW * key_row
Definition: blobbox.h:794
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:636
void set_left_crossing_rule(int new_left)
Definition: blobbox.h:313
BlobTextFlowType
Definition: blobbox.h:99
STATS projection
Definition: blobbox.h:667
Definition: statistc.h:33
void plot(ScrollView *window, ScrollView::Color blob_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:480
ELISTIZEH(AmbigSpec)
float max_blob_size
Definition: blobbox.h:782
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:277
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:390
BLOBNBOX(C_BLOB *srcblob)
Definition: blobbox.h:134
inT16 bottom() const
Definition: rect.h:61
inT32 max_nonspace
Definition: blobbox.h:789
BLOCK * block
Definition: blobbox.h:773
int baseline_position() const
Definition: blobbox.h:374
static bool IsTextType(BlobRegionType type)
Definition: blobbox.h:403
inT16 projection_right
Definition: blobbox.h:645
void set_right_rule(int new_right)
Definition: blobbox.h:307
inT32 perimeter()
Definition: stepblob.cpp:289
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
float spacing
Definition: blobbox.h:652
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
void plot_blob_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:1082
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
void set_left_tab_type(TabType new_type)
Definition: blobbox.h:259
void translate_box(ICOORD v)
Definition: blobbox.h:159
BlobRegionType
Definition: blobbox.h:57
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:414
float line_spacing
Definition: blobbox.h:775
BlobNeighbourDir
Definition: blobbox.h:72
void EstimateBaselinePosition()
Definition: blobbox.cpp:352
float mean_y() const
Definition: blobbox.h:560
BOOL8 used_dm_model
Definition: blobbox.h:643
double v[max]
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:247
float space_size
Definition: blobbox.h:787
const TBOX & bounding_box() const
Definition: blobbox.h:215
PITCH_TYPE
Definition: blobbox.h:29
BlobTextFlowType flow() const
Definition: blobbox.h:280
Definition: ocrblock.h:30
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:427
void compute_bounding_box()
Definition: blobbox.h:225
void set_owns_cblob(bool value)
Definition: blobbox.h:393
int base_char_top() const
Definition: blobbox.h:368
int NoisyNeighbours() const
Definition: blobbox.cpp:232
float parallel_c() const
Definition: blobbox.h:575
float line_error() const
Definition: blobbox.h:572
float ascrise
Definition: blobbox.h:655
BOOL8 all_caps
Definition: blobbox.h:642
integer coordinate
Definition: points.h:30
static bool IsImageType(BlobRegionType type)
Definition: blobbox.h:407
void set_vert_stroke_width(float width)
Definition: blobbox.h:331