tesseract  4.00.00dev
blobs.h
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: blobs.h (Formerly blobs.h)
5  * Description: Blob definition
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 27 15:39:52 1989
8  * Modified: Thu Mar 28 15:33:38 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1989, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 
26 #ifndef BLOBS_H
27 #define BLOBS_H
28 
29 /*----------------------------------------------------------------------
30  I n c l u d e s
31 ----------------------------------------------------------------------*/
32 #include "clst.h"
33 #include "normalis.h"
34 #include "publictypes.h"
35 #include "rect.h"
36 #include "vecfuncs.h"
37 
// Forward declarations of types that this header only uses through pointers
// or references, so their full definitions are not needed here.
class BLOCK;
class C_BLOB;
class C_OUTLINE;
class LLSQ;
class ROW;
class WERD;
44 
/*----------------------------------------------------------------------
              T y p e s
----------------------------------------------------------------------*/
// Number of entries in the EDGEPT::flags array.
#define EDGEPTFLAGS 4 /*concavity,length etc. */
49 
50 struct TPOINT {
51  TPOINT(): x(0), y(0) {}
52  TPOINT(inT16 vx, inT16 vy) : x(vx), y(vy) {}
53  TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}
54 
55  void operator+=(const TPOINT& other) {
56  x += other.x;
57  y += other.y;
58  }
59  void operator/=(int divisor) {
60  x /= divisor;
61  y /= divisor;
62  }
63  bool operator==(const TPOINT& other) const {
64  return x == other.x && y == other.y;
65  }
66  // Returns true when the two line segments cross each other.
67  // (Moved from outlines.cpp).
68  static bool IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0,
69  const TPOINT& b1);
70 
71  inT16 x; // absolute x coord.
72  inT16 y; // absolute y coord.
73 };
74 typedef TPOINT VECTOR; // structure for coordinates.
75 
76 struct EDGEPT {
78  : next(NULL), prev(NULL), src_outline(NULL), start_step(0), step_count(0) {
79  memset(flags, 0, EDGEPTFLAGS * sizeof(flags[0]));
80  }
81  EDGEPT(const EDGEPT& src) : next(NULL), prev(NULL) {
82  CopyFrom(src);
83  }
84  EDGEPT& operator=(const EDGEPT& src) {
85  CopyFrom(src);
86  return *this;
87  }
88  // Copies the data elements, but leaves the pointers untouched.
89  void CopyFrom(const EDGEPT& src) {
90  pos = src.pos;
91  vec = src.vec;
92  memcpy(flags, src.flags, EDGEPTFLAGS * sizeof(flags[0]));
93  src_outline = src.src_outline;
94  start_step = src.start_step;
95  step_count = src.step_count;
96  }
97  // Returns the squared distance between the points, with the x-component
98  // weighted by x_factor.
99  int WeightedDistance(const EDGEPT& other, int x_factor) const {
100  int x_dist = pos.x - other.pos.x;
101  int y_dist = pos.y - other.pos.y;
102  return x_dist * x_dist * x_factor + y_dist * y_dist;
103  }
104  // Returns true if the positions are equal.
105  bool EqualPos(const EDGEPT& other) const { return pos == other.pos; }
106  // Returns the bounding box of the outline segment from *this to *end.
107  // Ignores hidden edge flags.
108  TBOX SegmentBox(const EDGEPT* end) const {
109  TBOX box(pos.x, pos.y, pos.x, pos.y);
110  const EDGEPT* pt = this;
111  do {
112  pt = pt->next;
113  if (pt->pos.x < box.left()) box.set_left(pt->pos.x);
114  if (pt->pos.x > box.right()) box.set_right(pt->pos.x);
115  if (pt->pos.y < box.bottom()) box.set_bottom(pt->pos.y);
116  if (pt->pos.y > box.top()) box.set_top(pt->pos.y);
117  } while (pt != end && pt != this);
118  return box;
119  }
120  // Returns the area of the outline segment from *this to *end.
121  // Ignores hidden edge flags.
122  int SegmentArea(const EDGEPT* end) const {
123  int area = 0;
124  const EDGEPT* pt = this->next;
125  do {
126  TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);
127  area += CROSS(origin_vec, pt->vec);
128  pt = pt->next;
129  } while (pt != end && pt != this);
130  return area;
131  }
132  // Returns true if the number of points in the outline segment from *this to
133  // *end is less that min_points and false if we get back to *this first.
134  // Ignores hidden edge flags.
135  bool ShortNonCircularSegment(int min_points, const EDGEPT* end) const {
136  int count = 0;
137  const EDGEPT* pt = this;
138  do {
139  if (pt == end) return true;
140  pt = pt->next;
141  ++count;
142  } while (pt != this && count <= min_points);
143  return false;
144  }
145 
146  // Accessors to hide or reveal a cut edge from feature extractors.
147  void Hide() {
148  flags[0] = true;
149  }
150  void Reveal() {
151  flags[0] = false;
152  }
153  bool IsHidden() const {
154  return flags[0] != 0;
155  }
156  void MarkChop() {
157  flags[2] = true;
158  }
159  bool IsChopPt() const {
160  return flags[2] != 0;
161  }
162 
163  TPOINT pos; // position
164  VECTOR vec; // vector to next point
165  // TODO(rays) Remove flags and replace with
166  // is_hidden, runlength, dir, and fixed. The only use
167  // of the flags other than is_hidden is in polyaprx.cpp.
168  char flags[EDGEPTFLAGS]; // concavity, length etc
169  EDGEPT* next; // anticlockwise element
170  EDGEPT* prev; // clockwise element
171  C_OUTLINE* src_outline; // Outline it came from.
172  // The following fields are not used if src_outline is NULL.
173  int start_step; // Location of pos in src_outline.
174  int step_count; // Number of steps used (may wrap around).
175 };
176 
177 // For use in chop and findseam to keep a list of which EDGEPTs were inserted.
179 
180 struct TESSLINE {
181  TESSLINE() : is_hole(false), loop(NULL), next(NULL) {}
182  TESSLINE(const TESSLINE& src) : loop(NULL), next(NULL) {
183  CopyFrom(src);
184  }
186  Clear();
187  }
188  TESSLINE& operator=(const TESSLINE& src) {
189  CopyFrom(src);
190  return *this;
191  }
192  // Consume the circular list of EDGEPTs to make a TESSLINE.
193  static TESSLINE* BuildFromOutlineList(EDGEPT* outline);
194  // Copies the data and the outline, but leaves next untouched.
195  void CopyFrom(const TESSLINE& src);
196  // Deletes owned data.
197  void Clear();
198  // Normalize in-place using the DENORM.
199  void Normalize(const DENORM& denorm);
200  // Rotates by the given rotation in place.
201  void Rotate(const FCOORD rotation);
202  // Moves by the given vec in place.
203  void Move(const ICOORD vec);
204  // Scales by the given factor in place.
205  void Scale(float factor);
206  // Sets up the start and vec members of the loop from the pos members.
207  void SetupFromPos();
208  // Recomputes the bounding box from the points in the loop.
209  void ComputeBoundingBox();
210  // Computes the min and max cross product of the outline points with the
211  // given vec and returns the results in min_xp and max_xp. Geometrically
212  // this is the left and right edge of the outline perpendicular to the
213  // given direction, but to get the distance units correct, you would
214  // have to divide by the modulus of vec.
215  void MinMaxCrossProduct(const TPOINT vec, int* min_xp, int* max_xp) const;
216 
217  TBOX bounding_box() const;
218  // Returns true if *this and other have equal bounding boxes.
219  bool SameBox(const TESSLINE& other) const {
220  return topleft == other.topleft && botright == other.botright;
221  }
222  // Returns true if the given line segment crosses any outline of this blob.
223  bool SegmentCrosses(const TPOINT& pt1, const TPOINT& pt2) const {
224  if (Contains(pt1) && Contains(pt2)) {
225  EDGEPT* pt = loop;
226  do {
227  if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) return true;
228  pt = pt->next;
229  } while (pt != loop);
230  }
231  return false;
232  }
233  // Returns true if the point is contained within the outline box.
234  bool Contains(const TPOINT& pt) const {
235  return topleft.x <= pt.x && pt.x <= botright.x &&
236  botright.y <= pt.y && pt.y <= topleft.y;
237  }
238 
239  #ifndef GRAPHICS_DISABLED
240  void plot(ScrollView* window, ScrollView::Color color,
241  ScrollView::Color child_color);
242  #endif // GRAPHICS_DISABLED
243 
244  // Returns the first outline point that has a different src_outline to its
245  // predecessor, or, if all the same, the lowest indexed point.
246  EDGEPT* FindBestStartPt() const;
247 
248 
249  int BBArea() const {
250  return (botright.x - topleft.x) * (topleft.y - botright.y);
251  }
252 
253  TPOINT topleft; // Top left of loop.
254  TPOINT botright; // Bottom right of loop.
255  TPOINT start; // Start of loop.
256  bool is_hole; // True if this is a hole/child outline.
257  EDGEPT *loop; // Edgeloop.
258  TESSLINE *next; // Next outline in blob.
259 }; // Outline structure.
260 
261 struct TBLOB {
262  TBLOB() : outlines(NULL) {}
263  TBLOB(const TBLOB& src) : outlines(NULL) {
264  CopyFrom(src);
265  }
266  ~TBLOB() {
267  Clear();
268  }
269  TBLOB& operator=(const TBLOB& src) {
270  CopyFrom(src);
271  return *this;
272  }
273  // Factory to build a TBLOB from a C_BLOB with polygonal approximation along
274  // the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB
275  // contain pointers to the input C_OUTLINEs that enable higher-resolution
276  // feature extraction that does not use the polygonal approximation.
277  static TBLOB* PolygonalCopy(bool allow_detailed_fx, C_BLOB* src);
278  // Factory builds a blob with no outlines, but copies the other member data.
279  static TBLOB* ShallowCopy(const TBLOB& src);
280  // Normalizes the blob for classification only if needed.
281  // (Normally this means a non-zero classify rotation.)
282  // If no Normalization is needed, then NULL is returned, and the input blob
283  // can be used directly. Otherwise a new TBLOB is returned which must be
284  // deleted after use.
285  TBLOB* ClassifyNormalizeIfNeeded() const;
286 
287  // Copies the data and the outlines, but leaves next untouched.
288  void CopyFrom(const TBLOB& src);
289  // Deletes owned data.
290  void Clear();
291  // Sets up the built-in DENORM and normalizes the blob in-place.
292  // For parameters see DENORM::SetupNormalization, plus the inverse flag for
293  // this blob and the Pix for the full image.
294  void Normalize(const BLOCK* block,
295  const FCOORD* rotation,
296  const DENORM* predecessor,
297  float x_origin, float y_origin,
298  float x_scale, float y_scale,
299  float final_xshift, float final_yshift,
300  bool inverse, Pix* pix);
301  // Rotates by the given rotation in place.
302  void Rotate(const FCOORD rotation);
303  // Moves by the given vec in place.
304  void Move(const ICOORD vec);
305  // Scales by the given factor in place.
306  void Scale(float factor);
307  // Recomputes the bounding boxes of the outlines.
308  void ComputeBoundingBoxes();
309 
310  // Returns the number of outlines.
311  int NumOutlines() const;
312 
313  TBOX bounding_box() const;
314 
315  // Returns true if the given line segment crosses any outline of this blob.
316  bool SegmentCrossesOutline(const TPOINT& pt1, const TPOINT& pt2) const {
317  for (const TESSLINE* outline = outlines; outline != NULL;
318  outline = outline->next) {
319  if (outline->SegmentCrosses(pt1, pt2)) return true;
320  }
321  return false;
322  }
323  // Returns true if the point is contained within any of the outline boxes.
324  bool Contains(const TPOINT& pt) const {
325  for (const TESSLINE* outline = outlines; outline != NULL;
326  outline = outline->next) {
327  if (outline->Contains(pt)) return true;
328  }
329  return false;
330  }
331 
332  // Finds and deletes any duplicate outlines in this blob, without deleting
333  // their EDGEPTs.
334  void EliminateDuplicateOutlines();
335 
336  // Swaps the outlines of *this and next if needed to keep the centers in
337  // increasing x.
338  void CorrectBlobOrder(TBLOB* next);
339 
340  const DENORM& denorm() const {
341  return denorm_;
342  }
343 
344  #ifndef GRAPHICS_DISABLED
345  void plot(ScrollView* window, ScrollView::Color color,
346  ScrollView::Color child_color);
347  #endif // GRAPHICS_DISABLED
348 
349  int BBArea() const {
350  int total_area = 0;
351  for (TESSLINE* outline = outlines; outline != NULL; outline = outline->next)
352  total_area += outline->BBArea();
353  return total_area;
354  }
355 
356  // Computes the center of mass and second moments for the old baseline and
357  // 2nd moment normalizations. Returns the outline length.
358  // The input denorm should be the normalizations that have been applied from
359  // the image to the current state of this TBLOB.
360  int ComputeMoments(FCOORD* center, FCOORD* second_moments) const;
361  // Computes the precise bounding box of the coords that are generated by
362  // GetEdgeCoords. This may be different from the bounding box of the polygon.
363  void GetPreciseBoundingBox(TBOX* precise_box) const;
364  // Adds edges to the given vectors.
365  // For all the edge steps in all the outlines, or polygonal approximation
366  // where there are no edge steps, collects the steps into x_coords/y_coords.
367  // x_coords is a collection of the x-coords of vertical edges for each
368  // y-coord starting at box.bottom().
369  // y_coords is a collection of the y-coords of horizontal edges for each
370  // x-coord starting at box.left().
371  // Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.
372  // Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.
373  void GetEdgeCoords(const TBOX& box,
374  GenericVector<GenericVector<int> >* x_coords,
375  GenericVector<GenericVector<int> >* y_coords) const;
376 
377  TESSLINE *outlines; // List of outlines in blob.
378 
379  private: // TODO(rays) Someday the data members will be private too.
380  // For all the edge steps in all the outlines, or polygonal approximation
381  // where there are no edge steps, collects the steps into the bounding_box,
382  // llsq and/or the x_coords/y_coords. Both are used in different kinds of
383  // normalization.
384  // For a description of x_coords, y_coords, see GetEdgeCoords above.
385  void CollectEdges(const TBOX& box,
386  TBOX* bounding_box, LLSQ* llsq,
387  GenericVector<GenericVector<int> >* x_coords,
388  GenericVector<GenericVector<int> >* y_coords) const;
389 
390  private:
391  // DENORM indicating the transformations that this blob has undergone so far.
392  DENORM denorm_;
393 }; // Blob structure.
394 
395 struct TWERD {
396  TWERD() : latin_script(false) {}
397  TWERD(const TWERD& src) {
398  CopyFrom(src);
399  }
400  ~TWERD() {
401  Clear();
402  }
403  TWERD& operator=(const TWERD& src) {
404  CopyFrom(src);
405  return *this;
406  }
407  // Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
408  // approximation along the way.
409  static TWERD* PolygonalCopy(bool allow_detailed_fx, WERD* src);
410  // Baseline normalizes the blobs in-place, recording the normalization in the
411  // DENORMs in the blobs.
412  void BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse,
413  float x_height, float baseline_shift, bool numeric_mode,
415  const TBOX* norm_box,
416  DENORM* word_denorm);
417  // Copies the data and the blobs, but leaves next untouched.
418  void CopyFrom(const TWERD& src);
419  // Deletes owned data.
420  void Clear();
421  // Recomputes the bounding boxes of the blobs.
422  void ComputeBoundingBoxes();
423 
424  // Returns the number of blobs in the word.
425  int NumBlobs() const {
426  return blobs.size();
427  }
428  TBOX bounding_box() const;
429 
430  // Merges the blobs from start to end, not including end, and deletes
431  // the blobs between start and end.
432  void MergeBlobs(int start, int end);
433 
434  void plot(ScrollView* window);
435 
436  GenericVector<TBLOB*> blobs; // Blobs in word.
437  bool latin_script; // This word is in a latin-based script.
438 };
439 
440 /*----------------------------------------------------------------------
441  F u n c t i o n s
442 ----------------------------------------------------------------------*/
443 // TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
444 bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location);
445 
446 void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
447  const TPOINT& location);
448 
449 #endif
int start_step
Definition: blobs.h:173
TPOINT()
Definition: blobs.h:51
char flags[EDGEPTFLAGS]
Definition: blobs.h:168
TESSLINE * next
Definition: blobs.h:258
bool operator==(const TPOINT &other) const
Definition: blobs.h:63
bool IsChopPt() const
Definition: blobs.h:159
TPOINT pos
Definition: blobs.h:163
TBOX SegmentBox(const EDGEPT *end) const
Definition: blobs.h:108
Definition: points.h:189
TPOINT start
Definition: blobs.h:255
C_OUTLINE * src_outline
Definition: blobs.h:171
bool SegmentCrosses(const TPOINT &pt1, const TPOINT &pt2) const
Definition: blobs.h:223
void Normalize(float *Values)
bool Contains(const TPOINT &pt) const
Definition: blobs.h:324
EDGEPT * prev
Definition: blobs.h:170
int BBArea() const
Definition: blobs.h:349
#define CROSS(a, b)
Definition: vecfuncs.h:52
TESSLINE * outlines
Definition: blobs.h:377
~TESSLINE()
Definition: blobs.h:185
void MarkChop()
Definition: blobs.h:156
bool SameBox(const TESSLINE &other) const
Definition: blobs.h:219
EDGEPT()
Definition: blobs.h:77
void Hide()
Definition: blobs.h:147
EDGEPT(const EDGEPT &src)
Definition: blobs.h:81
bool SegmentCrossesOutline(const TPOINT &pt1, const TPOINT &pt2) const
Definition: blobs.h:316
int16_t inT16
Definition: host.h:36
TESSLINE(const TESSLINE &src)
Definition: blobs.h:182
inT16 left() const
Definition: rect.h:68
void Reveal()
Definition: blobs.h:150
Definition: blobs.h:395
void set_top(int y)
Definition: rect.h:57
#define EDGEPTFLAGS
Definition: blobs.h:48
void operator/=(int divisor)
Definition: blobs.h:59
void CopyFrom(const EDGEPT &src)
Definition: blobs.h:89
bool is_hole
Definition: blobs.h:256
~TWERD()
Definition: blobs.h:400
TPOINT botright
Definition: blobs.h:254
TPOINT(inT16 vx, inT16 vy)
Definition: blobs.h:52
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:99
TESSLINE & operator=(const TESSLINE &src)
Definition: blobs.h:188
const DENORM & denorm() const
Definition: blobs.h:340
VECTOR vec
Definition: blobs.h:164
CLISTIZEH(EDGEPT)
EDGEPT * loop
Definition: blobs.h:257
bool Contains(const TPOINT &pt) const
Definition: blobs.h:234
TBLOB()
Definition: blobs.h:262
bool IsHidden() const
Definition: blobs.h:153
inT16 x
Definition: blobs.h:71
~TBLOB()
Definition: blobs.h:266
TPOINT topleft
Definition: blobs.h:253
int step_count
Definition: blobs.h:174
TPOINT(const ICOORD &ic)
Definition: blobs.h:53
EDGEPT * next
Definition: blobs.h:169
void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location)
Definition: blobs.cpp:981
Definition: blobs.h:76
bool EqualPos(const EDGEPT &other) const
Definition: blobs.h:105
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
int BBArea() const
Definition: blobs.h:249
inT16 y
Definition: blobs.h:72
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
bool latin_script
Definition: blobs.h:437
TBLOB(const TBLOB &src)
Definition: blobs.h:263
Definition: blobs.h:261
TWERD & operator=(const TWERD &src)
Definition: blobs.h:403
Definition: linlsq.h:26
bool ShortNonCircularSegment(int min_points, const EDGEPT *end) const
Definition: blobs.h:135
Definition: blobs.h:50
inT16 right() const
Definition: rect.h:75
TWERD(const TWERD &src)
Definition: blobs.h:397
EDGEPT & operator=(const EDGEPT &src)
Definition: blobs.h:84
void set_right(int x)
Definition: rect.h:78
void operator+=(const TPOINT &other)
Definition: blobs.h:55
void set_left(int x)
Definition: rect.h:71
static bool IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1)
Definition: blobs.cpp:73
inT16 bottom() const
Definition: rect.h:61
int SegmentArea(const EDGEPT *end) const
Definition: blobs.h:122
void set_bottom(int y)
Definition: rect.h:64
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:932
TBLOB & operator=(const TBLOB &src)
Definition: blobs.h:269
TPOINT VECTOR
Definition: blobs.h:74
Definition: werd.h:60
TESSLINE()
Definition: blobs.h:181
Definition: ocrrow.h:32
int count(LIST var_list)
Definition: oldlist.cpp:103
Definition: ocrblock.h:30
TWERD()
Definition: blobs.h:396
integer coordinate
Definition: points.h:30