tesseract  4.00.00dev
strokewidth.cpp
Go to the documentation of this file.
1 // File: strokewidth.cpp
3 // Description: Subclass of BBGrid to find uniformity of strokewidth.
4 // Author: Ray Smith
5 // Created: Mon Mar 31 16:17:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 #include "strokewidth.h"
29 
30 #include <math.h>
31 
32 #include "blobbox.h"
33 #include "colpartition.h"
34 #include "colpartitiongrid.h"
35 #include "imagefind.h"
36 #include "linlsq.h"
37 #include "statistc.h"
38 #include "tabfind.h"
39 #include "textlineprojection.h"
40 #include "tordmain.h" // For SetBlobStrokeWidth.
41 
42 namespace tesseract {
43 
// Debug/config variables declared through the INT_VAR/BOOL_VAR macros; the
// quoted strings are their descriptions.
44 INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
45 BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
46 
// NOTE(review): no description survives in this listing for the next two
// constants. Presumably they are the non-CJK counterparts of
// kStrokeWidthFractionCJK / kStrokeWidthCJK below (a fractional and an
// absolute stroke-width tolerance) - confirm against upstream.
48 const double kStrokeWidthFractionTolerance = 0.125;
53 const double kStrokeWidthTolerance = 1.5;
54 // Same but for CJK we are a bit more generous.
55 const double kStrokeWidthFractionCJK = 0.25;
56 const double kStrokeWidthCJK = 2.0;
57 // Radius in grid cells of search for broken CJK. Doesn't need to be very
58 // large as the grid size should be about the size of a character anyway.
59 const int kCJKRadius = 2;
60 // Max distance fraction of size to join close but broken CJK characters.
61 const double kCJKBrokenDistanceFraction = 0.25;
62 // Max number of components in a broken CJK character.
63 const int kCJKMaxComponents = 8;
64 // Max aspect ratio of CJK broken characters when put back together.
65 const double kCJKAspectRatio = 1.25;
66 // Max increase in aspect ratio of CJK broken characters when merged.
67 const double kCJKAspectRatioIncrease = 1.0625;
68 // Max multiple of the grid size that will be used in computing median CJKsize.
69 const int kMaxCJKSizeRatio = 5;
70 // Min fraction of blobs broken CJK to iterate and run it again.
71 const double kBrokenCJKIterationFraction = 0.125;
72 // Multiple of gridsize as x-padding for a search box for diacritic base
73 // characters.
74 const double kDiacriticXPadRatio = 7.0;
75 // Multiple of gridsize as y-padding for a search box for diacritic base
76 // characters.
77 const double kDiacriticYPadRatio = 1.75;
78 // Min multiple of diacritic height that a neighbour must be to be a
79 // convincing base character.
80 const double kMinDiacriticSizeRatio = 1.0625;
81 // Max multiple of a textline's median height as a threshold for the sum of
82 // a diacritic's farthest x and y distances (gap + size).
83 const double kMaxDiacriticDistanceRatio = 1.25;
84 // Max x-gap between a diacritic and its base char as a fraction of the height
85 // of the base char (allowing other blobs to fill the gap.)
// NOTE(review): no constant follows the comment above in this listing (the
// embedded numbering jumps from 85 to 87), so the definition it describes
// appears to be missing here - confirm against upstream.
87 // Ratio between longest side of a line and longest side of a character.
88 // (neighbor_min > blob_min * kLineTrapShortest &&
89 // neighbor_max < blob_max / kLineTrapLongest)
90 // => neighbor is a grapheme and blob is a line.
91 const int kLineTrapLongest = 4;
92 // Ratio between shortest side of a line and shortest side of a character.
93 const int kLineTrapShortest = 2;
94 // Max aspect ratio of the total box before CountNeighbourGaps
95 // decides immediately based on the aspect ratio.
96 const int kMostlyOneDirRatio = 3;
97 // Aspect ratio for a blob to be considered as line residue.
98 const double kLineResidueAspectRatio = 8.0;
99 // Padding ratio for line residue search box.
100 const int kLineResiduePadRatio = 3;
101 // Min multiple of neighbour size for a line residue to be genuine.
102 const double kLineResidueSizeRatio = 1.75;
103 // Aspect ratio filter for OSD.
// CollectHorizVertBlobs only saves a blob for OSD when the ratio of its
// longer side to its shorter side is at most this value.
104 const float kSizeRatioToReject = 2.0;
105 // Expansion factor for search box for good neighbours.
106 const double kNeighbourSearchFactor = 2.5;
107 // Factor of increase of overlap when adding diacritics to make an image noisy.
108 const double kNoiseOverlapGrowthFactor = 4.0;
109 // Fraction of the image size to add overlap when adding diacritics for an
110 // image to qualify as noisy.
111 const double kNoiseOverlapAreaFactor = 1.0 / 512;
112 
112 
114  const ICOORD& bleft, const ICOORD& tright)
115  : BlobGrid(gridsize, bleft, tright), nontext_map_(NULL), projection_(NULL),
116  denorm_(NULL), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
117  leaders_win_ = NULL;
118  widths_win_ = NULL;
119  initial_widths_win_ = NULL;
120  chains_win_ = NULL;
121  diacritics_win_ = NULL;
122  textlines_win_ = NULL;
123  smoothed_win_ = NULL;
124 }
125 
127  if (widths_win_ != NULL) {
128  #ifndef GRAPHICS_DISABLED
129  delete widths_win_->AwaitEvent(SVET_DESTROY);
130  #endif // GRAPHICS_DISABLED
132  exit(0);
133  delete widths_win_;
134  }
135  delete leaders_win_;
136  delete initial_widths_win_;
137  delete chains_win_;
138  delete textlines_win_;
139  delete smoothed_win_;
140  delete diacritics_win_;
141 }
142 
143 // Sets the neighbours member of the medium-sized blobs in the block.
144 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
145 // blobs and sets pointers to the good neighbours.
147  // Run a preliminary strokewidth neighbour detection on the medium blobs.
148  InsertBlobList(&block->blobs);
149  BLOBNBOX_IT blob_it(&block->blobs);
150  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
151  SetNeighbours(false, false, blob_it.data());
152  }
153  Clear();
154 }
155 
156 // Sets the neighbour/textline writing direction members of the medium
157 // and large blobs with optional repair of broken CJK characters first.
158 // Repair of broken CJK is needed here because broken CJK characters
159 // can fool the textline direction detection algorithm.
161  bool cjk_merge,
162  TO_BLOCK* input_block) {
163  // Setup the grid with the remaining (non-noise) blobs.
164  InsertBlobs(input_block);
165  // Repair broken CJK characters if needed.
166  while (cjk_merge && FixBrokenCJK(input_block));
167  // Grade blobs by inspection of neighbours.
168  FindTextlineFlowDirection(pageseg_mode, false);
169  // Clear the grid ready for rotation or leader finding.
170  Clear();
171 }
172 
173 // Helper to collect and count horizontal and vertical blobs from a list.
174 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
175  int* num_vertical_blobs,
176  int* num_horizontal_blobs,
177  BLOBNBOX_CLIST* vertical_blobs,
178  BLOBNBOX_CLIST* horizontal_blobs,
179  BLOBNBOX_CLIST* nondescript_blobs) {
180  BLOBNBOX_C_IT v_it(vertical_blobs);
181  BLOBNBOX_C_IT h_it(horizontal_blobs);
182  BLOBNBOX_C_IT n_it(nondescript_blobs);
183  BLOBNBOX_IT blob_it(input_blobs);
184  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
185  BLOBNBOX* blob = blob_it.data();
186  const TBOX& box = blob->bounding_box();
187  float y_x = static_cast<float>(box.height()) / box.width();
188  float x_y = 1.0f / y_x;
189  // Select a >= 1.0 ratio
190  float ratio = x_y > y_x ? x_y : y_x;
191  // If the aspect ratio is small and we want them for osd, save the blob.
192  bool ok_blob = ratio <= kSizeRatioToReject;
193  if (blob->UniquelyVertical()) {
194  ++*num_vertical_blobs;
195  if (ok_blob) v_it.add_after_then_move(blob);
196  } else if (blob->UniquelyHorizontal()) {
197  ++*num_horizontal_blobs;
198  if (ok_blob) h_it.add_after_then_move(blob);
199  } else if (ok_blob) {
200  n_it.add_after_then_move(blob);
201  }
202  }
203 }
204 
205 
206 // Types all the blobs as vertical or horizontal text or unknown and
207 // returns true if the majority are vertical.
208 // If the blobs are rotated, it is necessary to call CorrectForRotation
209 // after rotating everything, otherwise the work done here will be enough.
210 // If osd_blobs is not null, a list of blobs from the dominant textline
211 // direction are returned for use in orientation and script detection.
212 bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
213  TO_BLOCK* block,
214  BLOBNBOX_CLIST* osd_blobs) {
215  int vertical_boxes = 0;
216  int horizontal_boxes = 0;
217  // Count vertical normal and large blobs.
218  BLOBNBOX_CLIST vertical_blobs;
219  BLOBNBOX_CLIST horizontal_blobs;
220  BLOBNBOX_CLIST nondescript_blobs;
221  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
222  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
223  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
224  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
226  tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
227  horizontal_boxes, vertical_boxes,
228  horizontal_blobs.length(), vertical_blobs.length(),
229  nondescript_blobs.length());
230  if (osd_blobs != NULL && vertical_boxes == 0 && horizontal_boxes == 0) {
231  // Only nondescript blobs available, so return those.
232  BLOBNBOX_C_IT osd_it(osd_blobs);
233  osd_it.add_list_after(&nondescript_blobs);
234  return false;
235  }
236  int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
237  find_vertical_text_ratio);
238  if (vertical_boxes >= min_vert_boxes) {
239  if (osd_blobs != NULL) {
240  BLOBNBOX_C_IT osd_it(osd_blobs);
241  osd_it.add_list_after(&vertical_blobs);
242  }
243  return true;
244  } else {
245  if (osd_blobs != NULL) {
246  BLOBNBOX_C_IT osd_it(osd_blobs);
247  osd_it.add_list_after(&horizontal_blobs);
248  }
249  return false;
250  }
251 }
252 
253 // Corrects the data structures for the given rotation.
255  ColPartitionGrid* part_grid) {
256  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
257  grid_box_ = TBOX(bleft(), tright());
258  rerotation_.set_x(rotation.x());
259  rerotation_.set_y(-rotation.y());
260 }
261 
262 // Finds leader partitions and inserts them into the given part_grid.
264  ColPartitionGrid* part_grid) {
265  Clear();
266  // Find and isolate leaders in the noise list.
267  ColPartition_LIST leader_parts;
268  FindLeadersAndMarkNoise(block, &leader_parts);
269  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
270  InsertBlobList(&block->blobs);
271  // Mark blobs that have leader neighbours.
272  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
273  ColPartition* part = it.extract();
274  part->ClaimBoxes();
275  MarkLeaderNeighbours(part, LR_LEFT);
276  MarkLeaderNeighbours(part, LR_RIGHT);
277  part_grid->InsertBBox(true, true, part);
278  }
279 }
280 
281 // Finds and marks noise those blobs that look like bits of vertical lines
282 // that would otherwise screw up layout analysis.
283 void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
284  BlobGridSearch gsearch(this);
285  BLOBNBOX* bbox;
286  // For every vertical line-like bbox in the grid, search its neighbours
287  // to find the tallest, and if the original box is taller by sufficient
288  // margin, then call it line residue and delete it.
289  gsearch.StartFullSearch();
290  while ((bbox = gsearch.NextFullSearch()) != NULL) {
291  TBOX box = bbox->bounding_box();
292  if (box.height() < box.width() * kLineResidueAspectRatio)
293  continue;
294  // Set up a rectangle search around the blob to find the size of its
295  // neighbours.
296  int padding = box.height() * kLineResiduePadRatio;
297  TBOX search_box = box;
298  search_box.pad(padding, padding);
299  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
300  box.bottom());
301  // Find the largest object in the search box not equal to bbox.
302  BlobGridSearch rsearch(this);
303  int max_size = 0;
304  BLOBNBOX* n;
305  rsearch.StartRectSearch(search_box);
306  while ((n = rsearch.NextRectSearch()) != NULL) {
307  if (n == bbox) continue;
308  TBOX nbox = n->bounding_box();
309  if (nbox.height() > max_size) {
310  max_size = nbox.height();
311  }
312  }
313  if (debug) {
314  tprintf("Max neighbour size=%d for candidate line box at:", max_size);
315  box.print();
316  }
317  if (max_size * kLineResidueSizeRatio < box.height()) {
318  #ifndef GRAPHICS_DISABLED
319  if (leaders_win_ != NULL) {
320  // We are debugging, so display deleted in pink blobs in the same
321  // window that we use to display leader detection.
322  leaders_win_->Pen(ScrollView::PINK);
323  leaders_win_->Rectangle(box.left(), box.bottom(),
324  box.right(), box.top());
325  }
326  #endif // GRAPHICS_DISABLED
327  ColPartition::MakeBigPartition(bbox, big_part_list);
328  }
329  }
330 }
331 
332 // Types all the blobs as vertical text or horizontal text or unknown and
333 // puts them into initial ColPartitions in the supplied part_grid.
334 // rerotation determines how to get back to the image coordinates from the
335 // blob coordinates (since they may have been rotated for vertical text).
336 // block is the single block for the whole page or rectangle to be OCRed.
337 // nontext_pix (full-size), is a binary mask used to prevent merges across
338 // photo/text boundaries. It is not kept beyond this function.
339 // denorm provides a mapping back to the image from the current blob
340 // coordinate space.
341 // projection provides a measure of textline density over the image and
342 // provides functions to assist with diacritic detection. It should be a
343 // pointer to a new TextlineProjection, and will be setup here.
344 // part_grid is the output grid of textline partitions.
345 // Large blobs that cause overlap are put in separate partitions and added
346 // to the big_parts list.
348  PageSegMode pageseg_mode, const FCOORD& rerotation, TO_BLOCK* block,
349  Pix* nontext_pix, const DENORM* denorm, bool cjk_script,
350  TextlineProjection* projection, BLOBNBOX_LIST* diacritic_blobs,
351  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) {
352  nontext_map_ = nontext_pix;
353  projection_ = projection;
354  denorm_ = denorm;
355  // Clear and re Insert to take advantage of the tab stops in the blobs.
356  Clear();
357  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
358  InsertBlobs(block);
359 
360  // Run FixBrokenCJK() again if the page is CJK.
361  if (cjk_script) {
362  FixBrokenCJK(block);
363  }
364  FindTextlineFlowDirection(pageseg_mode, false);
365  projection_->ConstructProjection(block, rerotation, nontext_map_);
367  ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
368  projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
369  projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
370  }
371  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
372  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
373  // Clear and re Insert to take advantage of the removed diacritics.
374  Clear();
375  InsertBlobs(block);
376  FCOORD skew;
377  FindTextlineFlowDirection(pageseg_mode, true);
379  FindInitialPartitions(pageseg_mode, rerotation, true, block,
380  diacritic_blobs, part_grid, big_parts, &skew);
381  if (r == PFR_NOISE) {
382  tprintf("Detected %d diacritics\n", diacritic_blobs->length());
383  // Noise was found, and removed.
384  Clear();
385  InsertBlobs(block);
386  FindTextlineFlowDirection(pageseg_mode, true);
387  r = FindInitialPartitions(pageseg_mode, rerotation, false, block,
388  diacritic_blobs, part_grid, big_parts, &skew);
389  }
390  nontext_map_ = NULL;
391  projection_ = NULL;
392  denorm_ = NULL;
393 }
394 
395 static void PrintBoxWidths(BLOBNBOX* neighbour) {
396  const TBOX& nbox = neighbour->bounding_box();
397  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
398  nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
399  neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
400  2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
401 }
402 
404 void StrokeWidth::HandleClick(int x, int y) {
406  // Run a radial search for blobs that overlap.
407  BlobGridSearch radsearch(this);
408  radsearch.StartRadSearch(x, y, 1);
409  BLOBNBOX* neighbour;
410  FCOORD click(static_cast<float>(x), static_cast<float>(y));
411  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
412  TBOX nbox = neighbour->bounding_box();
413  if (nbox.contains(click) && neighbour->cblob() != NULL) {
414  PrintBoxWidths(neighbour);
415  if (neighbour->neighbour(BND_LEFT) != NULL)
416  PrintBoxWidths(neighbour->neighbour(BND_LEFT));
417  if (neighbour->neighbour(BND_RIGHT) != NULL)
418  PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
419  if (neighbour->neighbour(BND_ABOVE) != NULL)
420  PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
421  if (neighbour->neighbour(BND_BELOW) != NULL)
422  PrintBoxWidths(neighbour->neighbour(BND_BELOW));
423  int gaps[BND_COUNT];
424  neighbour->NeighbourGaps(gaps);
425  tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
426  "Good= %d %d %d %d\n",
427  gaps[BND_LEFT], gaps[BND_RIGHT],
428  gaps[BND_ABOVE], gaps[BND_BELOW],
429  neighbour->horz_possible(),
430  neighbour->vert_possible(),
431  neighbour->good_stroke_neighbour(BND_LEFT),
432  neighbour->good_stroke_neighbour(BND_RIGHT),
433  neighbour->good_stroke_neighbour(BND_ABOVE),
434  neighbour->good_stroke_neighbour(BND_BELOW));
435  break;
436  }
437  }
438 }
439 
440 // Detects and marks leader dots/dashes.
441 // Leaders are horizontal chains of small or noise blobs that look
442 // monospace according to ColPartition::MarkAsLeaderIfMonospaced().
443 // Detected leaders become the only occupants of the block->small_blobs list.
444 // Non-leader small blobs get moved to the blobs list.
445 // Non-leader noise blobs remain singletons in the noise list.
446 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
447 // block is the single block for the whole page or rectangle to be OCRed.
448 // leader_parts is the output.
449 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
450  ColPartition_LIST* leader_parts) {
451  InsertBlobList(&block->small_blobs);
452  InsertBlobList(&block->noise_blobs);
453  BlobGridSearch gsearch(this);
454  BLOBNBOX* bbox;
455  // For every bbox in the grid, set its neighbours.
456  gsearch.StartFullSearch();
457  while ((bbox = gsearch.NextFullSearch()) != NULL) {
458  SetNeighbours(true, false, bbox);
459  }
460  ColPartition_IT part_it(leader_parts);
461  gsearch.StartFullSearch();
462  while ((bbox = gsearch.NextFullSearch()) != NULL) {
463  if (bbox->flow() == BTFT_NONE) {
464  if (bbox->neighbour(BND_RIGHT) == NULL &&
465  bbox->neighbour(BND_LEFT) == NULL)
466  continue;
467  // Put all the linked blobs into a ColPartition.
468  ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
469  BLOBNBOX* blob;
470  for (blob = bbox; blob != NULL && blob->flow() == BTFT_NONE;
471  blob = blob->neighbour(BND_RIGHT))
472  part->AddBox(blob);
473  for (blob = bbox->neighbour(BND_LEFT); blob != NULL &&
474  blob->flow() == BTFT_NONE;
475  blob = blob->neighbour(BND_LEFT))
476  part->AddBox(blob);
477  if (part->MarkAsLeaderIfMonospaced())
478  part_it.add_after_then_move(part);
479  else
480  delete part;
481  }
482  }
484  leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
485  }
486  // Move any non-leaders from the small to the blobs list, as they are
487  // most likely dashes or broken characters.
488  BLOBNBOX_IT blob_it(&block->blobs);
489  BLOBNBOX_IT small_it(&block->small_blobs);
490  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
491  BLOBNBOX* blob = small_it.data();
492  if (blob->flow() != BTFT_LEADER) {
493  if (blob->flow() == BTFT_NEIGHBOURS)
494  blob->set_flow(BTFT_NONE);
495  blob->ClearNeighbours();
496  blob_it.add_to_end(small_it.extract());
497  }
498  }
499  // Move leaders from the noise list to the small list, leaving the small
500  // list exclusively leaders, so they don't get processed further,
501  // and the remaining small blobs all in the noise list.
502  BLOBNBOX_IT noise_it(&block->noise_blobs);
503  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
504  BLOBNBOX* blob = noise_it.data();
505  if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
506  small_it.add_to_end(noise_it.extract());
507  } else if (blob->flow() == BTFT_NEIGHBOURS) {
508  blob->set_flow(BTFT_NONE);
509  blob->ClearNeighbours();
510  }
511  }
512  // Clear the grid as we don't want the small stuff hanging around in it.
513  Clear();
514 }
515 
518 void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
519  InsertBlobList(&block->blobs);
520  InsertBlobList(&block->large_blobs);
521 }
522 
523 // Checks the left or right side of the given leader partition and sets the
524 // (opposite) leader_on_right or leader_on_left flags for blobs
525 // that are next to the given side of the given leader partition.
526 void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
527  LeftOrRight side) {
528  const TBOX& part_box = part->bounding_box();
529  BlobGridSearch blobsearch(this);
530  // Search to the side of the leader for the nearest neighbour.
531  BLOBNBOX* best_blob = NULL;
532  int best_gap = 0;
533  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
534  : part_box.right(),
535  part_box.bottom(), part_box.top());
536  BLOBNBOX* blob;
537  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != NULL) {
538  const TBOX& blob_box = blob->bounding_box();
539  if (!blob_box.y_overlap(part_box))
540  continue;
541  int x_gap = blob_box.x_gap(part_box);
542  if (x_gap > 2 * gridsize()) {
543  break;
544  } else if (best_blob == NULL || x_gap < best_gap) {
545  best_blob = blob;
546  best_gap = x_gap;
547  }
548  }
549  if (best_blob != NULL) {
550  if (side == LR_LEFT)
551  best_blob->set_leader_on_right(true);
552  else
553  best_blob->set_leader_on_left(true);
554  #ifndef GRAPHICS_DISABLED
555  if (leaders_win_ != NULL) {
556  leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
557  const TBOX& blob_box = best_blob->bounding_box();
558  leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
559  blob_box.right(), blob_box.top());
560  }
561  #endif // GRAPHICS_DISABLED
562  }
563 }
564 
565 // Helper to compute the UQ of the square-ish CJK charcters.
566 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
567  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
568  BLOBNBOX_IT it(blobs);
569  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
570  BLOBNBOX* blob = it.data();
571  int width = blob->bounding_box().width();
572  int height = blob->bounding_box().height();
573  if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
574  sizes.add(height, 1);
575  }
576  return static_cast<int>(sizes.ile(0.75f) + 0.5);
577 }
578 
579 // Fix broken CJK characters, using the fake joined blobs mechanism.
580 // Blobs are really merged, ie the master takes all the outlines and the
581 // others are deleted.
582 // Returns true if sufficient blobs are merged that it may be worth running
583 // again, due to a better estimate of character size.
584 bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
585  BLOBNBOX_LIST* blobs = &block->blobs;
586  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
587  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
588  int max_size = static_cast<int>(median_height * kCJKAspectRatio);
589  int num_fixed = 0;
590  BLOBNBOX_IT blob_it(blobs);
591 
592  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
593  BLOBNBOX* blob = blob_it.data();
594  if (blob->cblob() == NULL || blob->cblob()->out_list()->empty())
595  continue;
596  TBOX bbox = blob->bounding_box();
597  bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
598  bbox.bottom());
599  if (debug) {
600  tprintf("Checking for Broken CJK (max size=%d):", max_size);
601  bbox.print();
602  }
603  // Generate a list of blobs that overlap or are near enough to merge.
604  BLOBNBOX_CLIST overlapped_blobs;
605  AccumulateOverlaps(blob, debug, max_size, max_dist,
606  &bbox, &overlapped_blobs);
607  if (!overlapped_blobs.empty()) {
608  // There are overlapping blobs, so qualify them as being satisfactory
609  // before removing them from the grid and replacing them with the union.
610  // The final box must be roughly square.
611  if (bbox.width() > bbox.height() * kCJKAspectRatio ||
612  bbox.height() > bbox.width() * kCJKAspectRatio) {
613  if (debug) {
614  tprintf("Bad final aspectratio:");
615  bbox.print();
616  }
617  continue;
618  }
619  // There can't be too many blobs to merge.
620  if (overlapped_blobs.length() >= kCJKMaxComponents) {
621  if (debug)
622  tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
623  continue;
624  }
625  // The strokewidths must match amongst the join candidates.
626  BLOBNBOX_C_IT n_it(&overlapped_blobs);
627  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
628  BLOBNBOX* neighbour = NULL;
629  neighbour = n_it.data();
630  if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
631  kStrokeWidthCJK))
632  break;
633  }
634  if (!n_it.cycled_list()) {
635  if (debug) {
636  tprintf("Bad stroke widths:");
637  PrintBoxWidths(blob);
638  }
639  continue; // Not good enough.
640  }
641 
642  // Merge all the candidates into blob.
643  // We must remove blob from the grid and reinsert it after merging
644  // to maintain the integrity of the grid.
645  RemoveBBox(blob);
646  // Everything else will be calculated later.
647  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
648  BLOBNBOX* neighbour = n_it.data();
649  RemoveBBox(neighbour);
650  // Mark empty blob for deletion.
651  neighbour->set_region_type(BRT_NOISE);
652  blob->really_merge(neighbour);
653  if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
654  blob->rotate_box(rerotation_);
655  }
656  }
657  InsertBBox(true, true, blob);
658  ++num_fixed;
659  if (debug) {
660  tprintf("Done! Final box:");
661  bbox.print();
662  }
663  }
664  }
665  // Count remaining blobs.
666  int num_remaining = 0;
667  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
668  BLOBNBOX* blob = blob_it.data();
669  if (blob->cblob() != NULL && !blob->cblob()->out_list()->empty()) {
670  ++num_remaining;
671  }
672  }
673  // Permanently delete all the marked blobs after first removing all
674  // references in the neighbour members.
675  block->DeleteUnownedNoise();
676  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
677 }
678 
679 // Helper function to determine whether it is reasonable to merge the
680 // bbox and the nbox for repairing broken CJK.
681 // The distance apart must not exceed max_dist, the combined size must
682 // not exceed max_size, and the aspect ratio must either improve or at
683 // least not get worse by much.
684 static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
685  bool debug, int max_size, int max_dist,
686  int* x_gap, int* y_gap) {
687  *x_gap = bbox.x_gap(nbox);
688  *y_gap = bbox.y_gap(nbox);
689  TBOX merged(nbox);
690  merged += bbox;
691  if (debug) {
692  tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
693  merged.print();
694  }
695  if (*x_gap <= max_dist && *y_gap <= max_dist &&
696  merged.width() <= max_size && merged.height() <= max_size) {
697  // Close enough to call overlapping. Check aspect ratios.
698  double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
699  if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
700  double new_ratio = static_cast<double>(merged.width()) / merged.height();
701  if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
702  if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
703  return true;
704  }
705  return false;
706 }
707 
708 // Collect blobs that overlap or are within max_dist of the input bbox.
709 // Return them in the list of blobs and expand the bbox to be the union
710 // of all the boxes. not_this is excluded from the search, as are blobs
711 // that cause the merged box to exceed max_size in either dimension.
712 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
713  int max_size, int max_dist,
714  TBOX* bbox, BLOBNBOX_CLIST* blobs) {
715  // While searching, nearests holds the nearest failed blob in each
716  // direction. When we have a nearest in each of the 4 directions, then
717  // the search is over, and at this point the final bbox must not overlap
718  // any of the nearests.
719  BLOBNBOX* nearests[BND_COUNT];
720  for (int i = 0; i < BND_COUNT; ++i) {
721  nearests[i] = NULL;
722  }
723  int x = (bbox->left() + bbox->right()) / 2;
724  int y = (bbox->bottom() + bbox->top()) / 2;
725  // Run a radial search for blobs that overlap or are sufficiently close.
726  BlobGridSearch radsearch(this);
727  radsearch.StartRadSearch(x, y, kCJKRadius);
728  BLOBNBOX* neighbour;
729  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
730  if (neighbour == not_this) continue;
731  TBOX nbox = neighbour->bounding_box();
732  int x_gap, y_gap;
733  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
734  &x_gap, &y_gap)) {
735  // Close enough to call overlapping. Merge boxes.
736  *bbox += nbox;
737  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
738  if (debug) {
739  tprintf("Added:");
740  nbox.print();
741  }
742  // Since we merged, search the nearests, as some might now me mergeable.
743  for (int dir = 0; dir < BND_COUNT; ++dir) {
744  if (nearests[dir] == NULL) continue;
745  nbox = nearests[dir]->bounding_box();
746  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
747  max_dist, &x_gap, &y_gap)) {
748  // Close enough to call overlapping. Merge boxes.
749  *bbox += nbox;
750  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
751  if (debug) {
752  tprintf("Added:");
753  nbox.print();
754  }
755  nearests[dir] = NULL;
756  dir = -1; // Restart the search.
757  }
758  }
759  } else if (x_gap < 0 && x_gap <= y_gap) {
760  // A vertical neighbour. Record the nearest.
761  BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
762  if (nearests[dir] == NULL ||
763  y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
764  nearests[dir] = neighbour;
765  }
766  } else if (y_gap < 0 && y_gap <= x_gap) {
767  // A horizontal neighbour. Record the nearest.
768  BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
769  if (nearests[dir] == NULL ||
770  x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
771  nearests[dir] = neighbour;
772  }
773  }
774  // If all nearests are non-null, then we have finished.
775  if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
776  nearests[BND_ABOVE] && nearests[BND_BELOW])
777  break;
778  }
779  // Final overlap with a nearest is not allowed.
780  for (int dir = 0; dir < BND_COUNT; ++dir) {
781  if (nearests[dir] == NULL) continue;
782  const TBOX& nbox = nearests[dir]->bounding_box();
783  if (debug) {
784  tprintf("Testing for overlap with:");
785  nbox.print();
786  }
787  if (bbox->overlap(nbox)) {
788  blobs->shallow_clear();
789  if (debug)
790  tprintf("Final box overlaps nearest\n");
791  return;
792  }
793  }
794 }
795 
796 // For each blob in this grid, Finds the textline direction to be horizontal
797 // or vertical according to distance to neighbours and 1st and 2nd order
798 // neighbours. Non-text tends to end up without a definite direction.
799 // Result is setting of the neighbours and vert_possible/horz_possible
800 // flags in the BLOBNBOXes currently in this grid.
801 // This function is called more than once if page orientation is uncertain,
802 // so display_if_debugging is true on the final call to display the results.
803 void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode,
804  bool display_if_debugging) {
805  BlobGridSearch gsearch(this);
806  BLOBNBOX* bbox;
807  // For every bbox in the grid, set its neighbours.
808  gsearch.StartFullSearch();
809  while ((bbox = gsearch.NextFullSearch()) != NULL) {
810  SetNeighbours(false, display_if_debugging, bbox);
811  }
812  // Where vertical or horizontal wins by a big margin, clarify it.
813  gsearch.StartFullSearch();
814  while ((bbox = gsearch.NextFullSearch()) != NULL) {
815  SimplifyObviousNeighbours(bbox);
816  }
817  // Now try to make the blobs only vertical or horizontal using neighbours.
818  gsearch.StartFullSearch();
819  while ((bbox = gsearch.NextFullSearch()) != NULL) {
820  if (FindingVerticalOnly(pageseg_mode)) {
821  bbox->set_vert_possible(true);
822  bbox->set_horz_possible(false);
823  } else if (FindingHorizontalOnly(pageseg_mode)) {
824  bbox->set_vert_possible(false);
825  bbox->set_horz_possible(true);
826  } else {
827  SetNeighbourFlows(bbox);
828  }
829  }
830  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
832  initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
833  }
834  // Improve flow direction with neighbours.
835  gsearch.StartFullSearch();
836  while ((bbox = gsearch.NextFullSearch()) != NULL) {
837  SmoothNeighbourTypes(pageseg_mode, false, bbox);
838  }
839  // Now allow reset of firm values to fix renegades.
840  gsearch.StartFullSearch();
841  while ((bbox = gsearch.NextFullSearch()) != NULL) {
842  SmoothNeighbourTypes(pageseg_mode, true, bbox);
843  }
844  // Repeat.
845  gsearch.StartFullSearch();
846  while ((bbox = gsearch.NextFullSearch()) != NULL) {
847  SmoothNeighbourTypes(pageseg_mode, true, bbox);
848  }
849  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
851  widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
852  }
853 }
854 
855 // Sets the neighbours and good_stroke_neighbours members of the blob by
856 // searching close on all 4 sides.
857 // When finding leader dots/dashes, there is a slightly different rule for
858 // what makes a good neighbour.
859 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
860  BLOBNBOX* blob) {
861  int line_trap_count = 0;
862  for (int dir = 0; dir < BND_COUNT; ++dir) {
863  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
864  line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
865  }
866  if (line_trap_count > 0 && activate_line_trap) {
867  // It looks like a line so isolate it by clearing its neighbours.
868  blob->ClearNeighbours();
869  const TBOX& box = blob->bounding_box();
870  blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
871  }
872 }
873 
874 
875 // Sets the good_stroke_neighbours member of the blob if it has a
876 // GoodNeighbour on the given side.
877 // Also sets the neighbour in the blob, whether or not a good one is found.
878 // Returns the number of blobs in the nearby search area that would lead us to
879 // believe that this blob is a line separator.
880 // Leaders get extra special lenient treatment.
881 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
882  BLOBNBOX* blob) {
883  // Search for neighbours that overlap vertically.
884  TBOX blob_box = blob->bounding_box();
885  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
886  blob_box.bottom());
887  if (debug) {
888  tprintf("FGN in dir %d for blob:", dir);
889  blob_box.print();
890  }
891  int top = blob_box.top();
892  int bottom = blob_box.bottom();
893  int left = blob_box.left();
894  int right = blob_box.right();
895  int width = right - left;
896  int height = top - bottom;
897 
898  // A trap to detect lines tests for the min dimension of neighbours
899  // being larger than a multiple of the min dimension of the line
900  // and the larger dimension being smaller than a fraction of the max
901  // dimension of the line.
902  int line_trap_max = MAX(width, height) / kLineTrapLongest;
903  int line_trap_min = MIN(width, height) * kLineTrapShortest;
904  int line_trap_count = 0;
905 
906  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
907  ? height / 2 : width / 2;
908  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
909  ? height / 3 : width / 3;
910  if (leaders)
911  min_good_overlap = min_decent_overlap = 1;
912 
913  int search_pad = static_cast<int>(
914  sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
915  if (gridsize() > search_pad)
916  search_pad = gridsize();
917  TBOX search_box = blob_box;
918  // Pad the search in the appropriate direction.
919  switch (dir) {
920  case BND_LEFT:
921  search_box.set_left(search_box.left() - search_pad);
922  break;
923  case BND_RIGHT:
924  search_box.set_right(search_box.right() + search_pad);
925  break;
926  case BND_BELOW:
927  search_box.set_bottom(search_box.bottom() - search_pad);
928  break;
929  case BND_ABOVE:
930  search_box.set_top(search_box.top() + search_pad);
931  break;
932  case BND_COUNT:
933  return 0;
934  }
935 
936  BlobGridSearch rectsearch(this);
937  rectsearch.StartRectSearch(search_box);
938  BLOBNBOX* best_neighbour = NULL;
939  double best_goodness = 0.0;
940  bool best_is_good = false;
941  BLOBNBOX* neighbour;
942  while ((neighbour = rectsearch.NextRectSearch()) != NULL) {
943  TBOX nbox = neighbour->bounding_box();
944  if (neighbour == blob)
945  continue;
946  int mid_x = (nbox.left() + nbox.right()) / 2;
947  if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
948  continue; // In a different column.
949  if (debug) {
950  tprintf("Neighbour at:");
951  nbox.print();
952  }
953 
954  // Last-minute line detector. There is a small upper limit to the line
955  // width accepted by the morphological line detector.
956  int n_width = nbox.width();
957  int n_height = nbox.height();
958  if (MIN(n_width, n_height) > line_trap_min &&
959  MAX(n_width, n_height) < line_trap_max)
960  ++line_trap_count;
961  // Heavily joined text, such as Arabic may have very different sizes when
962  // looking at the maxes, but the heights may be almost identical, so check
963  // for a difference in height if looking sideways or width vertically.
964  if (TabFind::VeryDifferentSizes(MAX(n_width, n_height),
965  MAX(width, height)) &&
966  (((dir == BND_LEFT || dir ==BND_RIGHT) &&
967  TabFind::DifferentSizes(n_height, height)) ||
968  ((dir == BND_BELOW || dir ==BND_ABOVE) &&
969  TabFind::DifferentSizes(n_width, width)))) {
970  if (debug) tprintf("Bad size\n");
971  continue; // Could be a different font size or non-text.
972  }
973  // Amount of vertical overlap between the blobs.
974  int overlap;
975  // If the overlap is along the short side of the neighbour, and it
976  // is fully overlapped, then perp_overlap holds the length of the long
977  // side of the neighbour. A measure to include hyphens and dashes as
978  // legitimate neighbours.
979  int perp_overlap;
980  int gap;
981  if (dir == BND_LEFT || dir == BND_RIGHT) {
982  overlap = MIN(nbox.top(), top) - MAX(nbox.bottom(), bottom);
983  if (overlap == nbox.height() && nbox.width() > nbox.height())
984  perp_overlap = nbox.width();
985  else
986  perp_overlap = overlap;
987  gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
988  if (gap <= 0) {
989  if (debug) tprintf("On wrong side\n");
990  continue; // On the wrong side.
991  }
992  gap -= n_width;
993  } else {
994  overlap = MIN(nbox.right(), right) - MAX(nbox.left(), left);
995  if (overlap == nbox.width() && nbox.height() > nbox.width())
996  perp_overlap = nbox.height();
997  else
998  perp_overlap = overlap;
999  gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
1000  if (gap <= 0) {
1001  if (debug) tprintf("On wrong side\n");
1002  continue; // On the wrong side.
1003  }
1004  gap -= n_height;
1005  }
1006  if (-gap > overlap) {
1007  if (debug) tprintf("Overlaps wrong way\n");
1008  continue; // Overlaps the wrong way.
1009  }
1010  if (perp_overlap < min_decent_overlap) {
1011  if (debug) tprintf("Doesn't overlap enough\n");
1012  continue; // Doesn't overlap enough.
1013  }
1014  bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1015  TabFind::DifferentSizes(width, n_width);
1016  bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1017  blob->MatchingStrokeWidth(*neighbour,
1018  kStrokeWidthFractionTolerance,
1019  kStrokeWidthTolerance);
1020  // Best is a fuzzy combination of gap, overlap and is good.
1021  // Basically if you make one thing twice as good without making
1022  // anything else twice as bad, then it is better.
1023  if (gap < 1) gap = 1;
1024  double goodness = (1.0 + is_good) * overlap / gap;
1025  if (debug) {
1026  tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1027  goodness, best_goodness, is_good, overlap, gap);
1028  }
1029  if (goodness > best_goodness) {
1030  best_neighbour = neighbour;
1031  best_goodness = goodness;
1032  best_is_good = is_good;
1033  }
1034  }
1035  blob->set_neighbour(dir, best_neighbour, best_is_good);
1036  return line_trap_count;
1037 }
1038 
1039 // Helper to get a list of 1st-order neighbours.
1040 static void ListNeighbours(const BLOBNBOX* blob,
1041  BLOBNBOX_CLIST* neighbours) {
1042  for (int dir = 0; dir < BND_COUNT; ++dir) {
1043  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1044  BLOBNBOX* neighbour = blob->neighbour(bnd);
1045  if (neighbour != NULL) {
1046  neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1047  }
1048  }
1049 }
1050 
1051 // Helper to get a list of 1st and 2nd order neighbours.
1052 static void List2ndNeighbours(const BLOBNBOX* blob,
1053  BLOBNBOX_CLIST* neighbours) {
1054  ListNeighbours(blob, neighbours);
1055  for (int dir = 0; dir < BND_COUNT; ++dir) {
1056  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1057  BLOBNBOX* neighbour = blob->neighbour(bnd);
1058  if (neighbour != NULL) {
1059  ListNeighbours(neighbour, neighbours);
1060  }
1061  }
1062 }
1063 
1064 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
1065 static void List3rdNeighbours(const BLOBNBOX* blob,
1066  BLOBNBOX_CLIST* neighbours) {
1067  List2ndNeighbours(blob, neighbours);
1068  for (int dir = 0; dir < BND_COUNT; ++dir) {
1069  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1070  BLOBNBOX* neighbour = blob->neighbour(bnd);
1071  if (neighbour != NULL) {
1072  List2ndNeighbours(neighbour, neighbours);
1073  }
1074  }
1075 }
1076 
1077 // Helper to count the evidence for verticalness or horizontalness
1078 // in a list of neighbours.
1079 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1080  int* pure_h_count, int* pure_v_count) {
1081  if (neighbours->length() <= kMostlyOneDirRatio)
1082  return;
1083  BLOBNBOX_C_IT it(neighbours);
1084  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1085  BLOBNBOX* blob = it.data();
1086  int h_min, h_max, v_min, v_max;
1087  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1088  if (debug)
1089  tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1090  if (h_max < v_min ||
1091  blob->leader_on_left() || blob->leader_on_right()) {
1092  // Horizontal gaps are clear winners. Count a pure horizontal.
1093  ++*pure_h_count;
1094  if (debug) tprintf("Horz at:");
1095  } else if (v_max < h_min) {
1096  // Vertical gaps are clear winners. Clear a pure vertical.
1097  ++*pure_v_count;
1098  if (debug) tprintf("Vert at:");
1099  } else {
1100  if (debug) tprintf("Neither at:");
1101  }
1102  if (debug)
1103  blob->bounding_box().print();
1104  }
1105 }
1106 
1107 // Makes the blob to be only horizontal or vertical where evidence
1108 // is clear based on gaps of 2nd order neighbours, or definite individual
1109 // blobs.
1110 void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1111  if (blob->DefiniteIndividualFlow())
1112  return;
1113  bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1114  blob->bounding_box().bottom());
1115  if (debug) {
1116  tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1117  blob->flow(), blob->region_type());
1118  blob->bounding_box().print();
1119  }
1120  BLOBNBOX_CLIST neighbours;
1121  List3rdNeighbours(blob, &neighbours);
1122  // The number of pure horizontal and vertical neighbours.
1123  int pure_h_count = 0;
1124  int pure_v_count = 0;
1125  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1126  if (debug) {
1127  HandleClick(blob->bounding_box().left() + 1,
1128  blob->bounding_box().bottom() + 1);
1129  tprintf("SetFlows: h_count=%d, v_count=%d\n",
1130  pure_h_count, pure_v_count);
1131  }
1132  if (!neighbours.empty()) {
1133  blob->set_vert_possible(true);
1134  blob->set_horz_possible(true);
1135  if (pure_h_count > 2 * pure_v_count) {
1136  // Horizontal gaps are clear winners. Clear vertical neighbours.
1137  blob->set_vert_possible(false);
1138  } else if (pure_v_count > 2 * pure_h_count) {
1139  // Vertical gaps are clear winners. Clear horizontal neighbours.
1140  blob->set_horz_possible(false);
1141  }
1142  } else {
1143  // Lonely blob. Can't tell its flow direction.
1144  blob->set_vert_possible(false);
1145  blob->set_horz_possible(false);
1146  }
1147 }
1148 
1149 
1150 // Helper to count the number of horizontal and vertical blobs in a list.
1151 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1152  int* pure_h_count, int* pure_v_count) {
1153  BLOBNBOX_C_IT it(neighbours);
1154  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1155  BLOBNBOX* blob = it.data();
1156  if (blob->UniquelyHorizontal())
1157  ++*pure_h_count;
1158  if (blob->UniquelyVertical())
1159  ++*pure_v_count;
1160  }
1161 }
1162 
1163 // Nullify the neighbours in the wrong directions where the direction
1164 // is clear-cut based on a distance margin. Good for isolating vertical
1165 // text from neighbouring horizontal text.
1166 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
1167  // Case 1: We have text that is likely several characters, blurry and joined
1168  // together.
1169  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1170  blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1171  // The blob is complex (not stick-like).
1172  if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1173  // Horizontal conjoined text.
1174  blob->set_neighbour(BND_ABOVE, NULL, false);
1175  blob->set_neighbour(BND_BELOW, NULL, false);
1176  return;
1177  }
1178  if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1179  // Vertical conjoined text.
1180  blob->set_neighbour(BND_LEFT, NULL, false);
1181  blob->set_neighbour(BND_RIGHT, NULL, false);
1182  return;
1183  }
1184  }
1185 
1186  // Case 2: This blob is likely a single character.
1187  int margin = gridsize() / 2;
1188  int h_min, h_max, v_min, v_max;
1189  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1190  if ((h_max + margin < v_min && h_max < margin / 2) ||
1191  blob->leader_on_left() || blob->leader_on_right()) {
1192  // Horizontal gaps are clear winners. Clear vertical neighbours.
1193  blob->set_neighbour(BND_ABOVE, NULL, false);
1194  blob->set_neighbour(BND_BELOW, NULL, false);
1195  } else if (v_max + margin < h_min && v_max < margin / 2) {
1196  // Vertical gaps are clear winners. Clear horizontal neighbours.
1197  blob->set_neighbour(BND_LEFT, NULL, false);
1198  blob->set_neighbour(BND_RIGHT, NULL, false);
1199  }
1200 }
1201 
1202 // Smoothes the vertical/horizontal type of the blob based on the
1203 // 2nd-order neighbours. If reset_all is true, then all blobs are
1204 // changed. Otherwise, only ambiguous blobs are processed.
1205 void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all,
1206  BLOBNBOX* blob) {
1207  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1208  // There are both horizontal and vertical so try to fix it.
1209  BLOBNBOX_CLIST neighbours;
1210  List2ndNeighbours(blob, &neighbours);
1211  // The number of pure horizontal and vertical neighbours.
1212  int pure_h_count = 0;
1213  int pure_v_count = 0;
1214  CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1216  blob->bounding_box().bottom())) {
1217  HandleClick(blob->bounding_box().left() + 1,
1218  blob->bounding_box().bottom() + 1);
1219  tprintf("pure_h=%d, pure_v=%d\n",
1220  pure_h_count, pure_v_count);
1221  }
1222  if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1223  // Horizontal gaps are clear winners. Clear vertical neighbours.
1224  blob->set_vert_possible(false);
1225  blob->set_horz_possible(true);
1226  } else if (pure_v_count > pure_h_count &&
1227  !FindingHorizontalOnly(pageseg_mode)) {
1228  // Vertical gaps are clear winners. Clear horizontal neighbours.
1229  blob->set_horz_possible(false);
1230  blob->set_vert_possible(true);
1231  }
1232  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1233  blob->bounding_box().bottom())) {
1234  HandleClick(blob->bounding_box().left() + 1,
1235  blob->bounding_box().bottom() + 1);
1236  tprintf("Clean on pass 3!\n");
1237  }
1238 }
1239 
1240 // Partition creation. Accumulates vertical and horizontal text chains,
1241 // puts the remaining blobs in as unknowns, and then merges/splits to
1242 // minimize overlap and smoothes the types with neighbours and the color
1243 // image if provided. rerotation is used to rotate the coordinate space
1244 // back to the nontext_map_ image.
1245 // If find_problems is true, detects possible noise pollution by the amount
1246 // of partition overlap that is created by the diacritics. If excessive, the
1247 // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1248 // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1249 // the components, saves the skew_angle and returns PFR_SKEW.] If the return
1250 // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1251 // called again after cleaning up the partly done work.
1252 PartitionFindResult StrokeWidth::FindInitialPartitions(
1253  PageSegMode pageseg_mode, const FCOORD& rerotation, bool find_problems,
1254  TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1255  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1256  FCOORD* skew_angle) {
1257  if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1258  if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1260  chains_win_ = MakeWindow(0, 400, "Initial text chains");
1261  part_grid->DisplayBoxes(chains_win_);
1262  projection_->DisplayProjection();
1263  }
1264  if (find_problems) {
1265  // TODO(rays) Do something to find skew, set skew_angle and return if there
1266  // is some.
1267  }
1268  part_grid->SplitOverlappingPartitions(big_parts);
1269  EasyMerges(part_grid);
1270  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1271  TBOX grid_box(bleft(), tright());
1272  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1273  rerotation));
1274  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1275  grid_box, rerotation));
1276  int pre_overlap = part_grid->ComputeTotalOverlap(NULL);
1277  TestDiacritics(part_grid, block);
1278  MergeDiacritics(block, part_grid);
1279  if (find_problems && diacritic_blobs != NULL &&
1280  DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1281  diacritic_blobs)) {
1282  return PFR_NOISE;
1283  }
1285  textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1286  part_grid->DisplayBoxes(textlines_win_);
1287  diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1288  }
1289  PartitionRemainingBlobs(pageseg_mode, part_grid);
1290  part_grid->SplitOverlappingPartitions(big_parts);
1291  EasyMerges(part_grid);
1292  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1293  rerotation));
1294  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1295  grid_box, rerotation));
1296  // Now eliminate strong stuff in a sea of the opposite.
1297  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1298  grid_box, rerotation));
1300  smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1301  part_grid->DisplayBoxes(smoothed_win_);
1302  }
1303  return PFR_OK;
1304 }
1305 
1306 // Detects noise by a significant increase in partition overlap from
1307 // pre_overlap to now, and removes noise from the union of all the overlapping
1308 // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1309 // was found and removed.
1310 bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
1311  TO_BLOCK* block,
1312  ColPartitionGrid* part_grid,
1313  BLOBNBOX_LIST* diacritic_blobs) {
1314  ColPartitionGrid* noise_grid = NULL;
1315  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1316  if (pre_overlap == 0) pre_overlap = 1;
1317  BLOBNBOX_IT diacritic_it(diacritic_blobs);
1318  if (noise_grid != NULL) {
1319  if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1320  post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1321  // This is noisy enough to fix.
1323  ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
1324  noise_grid->DisplayBoxes(noise_win);
1325  }
1326  part_grid->DeleteNonLeaderParts();
1327  BLOBNBOX_IT blob_it(&block->noise_blobs);
1328  ColPartitionGridSearch rsearch(noise_grid);
1329  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1330  BLOBNBOX* blob = blob_it.data();
1331  blob->ClearNeighbours();
1332  if (!blob->IsDiacritic() || blob->owner() != NULL)
1333  continue; // Not a noise candidate.
1334  TBOX blob_box(blob->bounding_box());
1335  TBOX search_box(blob->bounding_box());
1336  search_box.pad(gridsize(), gridsize());
1337  rsearch.StartRectSearch(search_box);
1338  ColPartition* part = rsearch.NextRectSearch();
1339  if (part != NULL) {
1340  // Consider blob as possible noise.
1341  blob->set_owns_cblob(true);
1342  blob->compute_bounding_box();
1343  diacritic_it.add_after_then_move(blob_it.extract());
1344  }
1345  }
1346  noise_grid->DeleteParts();
1347  delete noise_grid;
1348  return true;
1349  }
1350  noise_grid->DeleteParts();
1351  delete noise_grid;
1352  }
1353  return false;
1354 }
1355 
1356 // Helper verifies that blob's neighbour in direction dir is good to add to a
1357 // vertical text chain by returning the neighbour if it is not null, not owned,
1358 // and not uniquely horizontal, as well as its neighbour in the opposite
1359 // direction is blob.
1360 static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1361  BlobNeighbourDir dir) {
1362  BLOBNBOX* next_blob = blob->neighbour(dir);
1363  if (next_blob == NULL || next_blob->owner() != NULL ||
1364  next_blob->UniquelyHorizontal())
1365  return NULL;
1366  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1367  return next_blob;
1368  return NULL;
1369 }
1370 
1371 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
1372 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1373  // A PageSegMode that forces vertical textlines with the current rotation.
1374  PageSegMode pageseg_mode =
1375  rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1376  BlobGridSearch gsearch(this);
1377  BLOBNBOX* bbox;
1378  gsearch.StartFullSearch();
1379  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1380  // Only process boxes that have no horizontal hope and have not yet
1381  // been included in a chain.
1382  BLOBNBOX* blob;
1383  if (bbox->owner() == NULL && bbox->UniquelyVertical() &&
1384  (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != NULL) {
1385  // Put all the linked blobs into a ColPartition.
1386  ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1387  part->AddBox(bbox);
1388  while (blob != NULL) {
1389  part->AddBox(blob);
1390  blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1391  }
1392  blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1393  while (blob != NULL) {
1394  part->AddBox(blob);
1395  blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1396  }
1397  CompletePartition(pageseg_mode, part, part_grid);
1398  }
1399  }
1400 }
1401 
1402 // Helper verifies that blob's neighbour in direction dir is good to add to a
1403 // horizontal text chain by returning the neighbour if it is not null, not
1404 // owned, and not uniquely vertical, as well as its neighbour in the opposite
1405 // direction is blob.
1406 static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1407  BlobNeighbourDir dir) {
1408  BLOBNBOX* next_blob = blob->neighbour(dir);
1409  if (next_blob == NULL || next_blob->owner() != NULL ||
1410  next_blob->UniquelyVertical())
1411  return NULL;
1412  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1413  return next_blob;
1414  return NULL;
1415 }
1416 
1417 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1418 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1419  // A PageSegMode that forces horizontal textlines with the current rotation.
1420  PageSegMode pageseg_mode =
1421  rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1422  BlobGridSearch gsearch(this);
1423  BLOBNBOX* bbox;
1424  gsearch.StartFullSearch();
1425  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1426  BLOBNBOX* blob;
1427  if (bbox->owner() == NULL && bbox->UniquelyHorizontal() &&
1428  (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != NULL) {
1429  // Put all the linked blobs into a ColPartition.
1430  ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1431  part->AddBox(bbox);
1432  while (blob != NULL) {
1433  part->AddBox(blob);
1434  blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1435  }
1436  blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1437  while (blob != NULL) {
1438  part->AddBox(blob);
1439  blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1440  }
1441  CompletePartition(pageseg_mode, part, part_grid);
1442  }
1443  }
1444 }
1445 
1446 // Finds diacritics and saves their base character in the blob.
1447 // The objective is to move all diacritics to the noise_blobs list, so
1448 // they don't mess up early textline finding/merging, or force splits
1449 // on textlines that overlap a bit. Blobs that become diacritics must be
1450 // either part of no ColPartition (NULL owner) or in a small partition in
1451 // which ALL the blobs are diacritics, in which case the partition is
1452 // exploded (deleted) back to its blobs.
1453 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1454  BlobGrid small_grid(gridsize(), bleft(), tright());
1455  small_grid.InsertBlobList(&block->noise_blobs);
1456  small_grid.InsertBlobList(&block->blobs);
1457  int medium_diacritics = 0;
1458  int small_diacritics = 0;
1459  BLOBNBOX_IT small_it(&block->noise_blobs);
1460  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1461  BLOBNBOX* blob = small_it.data();
1462  if (blob->owner() == NULL && !blob->IsDiacritic() &&
1463  DiacriticBlob(&small_grid, blob)) {
1464  ++small_diacritics;
1465  }
1466  }
1467  BLOBNBOX_IT blob_it(&block->blobs);
1468  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1469  BLOBNBOX* blob = blob_it.data();
1470  if (blob->IsDiacritic()) {
1471  small_it.add_to_end(blob_it.extract());
1472  continue; // Already a diacritic.
1473  }
1474  ColPartition* part = blob->owner();
1475  if (part == NULL && DiacriticBlob(&small_grid, blob)) {
1476  ++medium_diacritics;
1477  RemoveBBox(blob);
1478  small_it.add_to_end(blob_it.extract());
1479  } else if (part != NULL && !part->block_owned() &&
1480  part->boxes_count() < 3) {
1481  // We allow blobs in small partitions to become diacritics if ALL the
1482  // blobs in the partition qualify as we can then cleanly delete the
1483  // partition, turn all the blobs in it to diacritics and they can be
1484  // merged into the base character partition more easily than merging
1485  // the partitions.
1486  BLOBNBOX_C_IT box_it(part->boxes());
1487  for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1488  DiacriticBlob(&small_grid, box_it.data());
1489  box_it.forward());
1490  if (box_it.cycled_list()) {
1491  // They are all good.
1492  while (!box_it.empty()) {
1493  // Liberate the blob from its partition so it can be treated
1494  // as a diacritic and merged explicitly with the base part.
1495  // The blob is really owned by the block. The partition "owner"
1496  // is NULLed to allow the blob to get merged with its base character
1497  // partition.
1498  BLOBNBOX* box = box_it.extract();
1499  box->set_owner(NULL);
1500  box_it.forward();
1501  ++medium_diacritics;
1502  // We remove the blob from the grid so it isn't found by subsequent
1503  // searches where we might not want to include diacritics.
1504  RemoveBBox(box);
1505  }
1506  // We only move the one blob to the small list here, but the others
1507  // all get moved by the test at the top of the loop.
1508  small_it.add_to_end(blob_it.extract());
1509  part_grid->RemoveBBox(part);
1510  delete part;
1511  }
1512  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1513  blob->bounding_box().bottom())) {
1514  tprintf("Blob not available to be a diacritic at:");
1515  blob->bounding_box().print();
1516  }
1517  }
1519  tprintf("Found %d small diacritics, %d medium\n",
1520  small_diacritics, medium_diacritics);
1521  }
1522 }
1523 
1524 // Searches this grid for an appropriately close and sized neighbour of the
1525 // given [small] blob. If such a blob is found, the diacritic base is saved
1526 // in the blob and true is returned.
1527 // The small_grid is a secondary grid that contains the small/noise objects
1528 // that are not in this grid, but may be useful for determining a connection
1529 // between blob and its potential base character. (See DiacriticXGapFilled.)
1530 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1531  if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
1532  blob->region_type() == BRT_VERT_TEXT)
1533  return false;
1534  TBOX small_box(blob->bounding_box());
1535  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1536  small_box.bottom());
1537  if (debug) {
1538  tprintf("Testing blob for diacriticness at:");
1539  small_box.print();
1540  }
1541  int x = (small_box.left() + small_box.right()) / 2;
1542  int y = (small_box.bottom() + small_box.top()) / 2;
1543  int grid_x, grid_y;
1544  GridCoords(x, y, &grid_x, &grid_y);
1545  int height = small_box.height();
1546  // Setup a rectangle search to find its nearest base-character neighbour.
1547  // We keep 2 different best candidates:
1548  // best_x_overlap is a category of base characters that have an overlap in x
1549  // (like a acute) in which we look for the least y-gap, computed using the
1550  // projection to favor base characters in the same textline.
1551  // best_y_overlap is a category of base characters that have no x overlap,
1552  // (nominally a y-overlap is preferrecd but not essential) in which we
1553  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1554  // a lower weight to catch quotes at the end of a textline.
1555  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1556  // character to the FARTHEST side of the diacritic to allow small diacritics
1557  // to be a reasonable distance away, but not big diacritics.
1558  BLOBNBOX* best_x_overlap = NULL;
1559  BLOBNBOX* best_y_overlap = NULL;
1560  int best_total_dist = 0;
1561  int best_y_gap = 0;
1562  TBOX best_xbox;
1563  // TODO(rays) the search box could be setup using the projection as a guide.
1564  TBOX search_box(small_box);
1565  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1566  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1567  search_box.pad(x_pad, y_pad);
1568  BlobGridSearch rsearch(this);
1569  rsearch.SetUniqueMode(true);
1570  int min_height = height * kMinDiacriticSizeRatio;
1571  rsearch.StartRectSearch(search_box);
1572  BLOBNBOX* neighbour;
1573  while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1574  if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1575  neighbour == blob || neighbour->owner() == blob->owner())
1576  continue;
1577  TBOX nbox = neighbour->bounding_box();
1578  if (neighbour->owner() == NULL || neighbour->owner()->IsVerticalType() ||
1579  (neighbour->flow() != BTFT_CHAIN &&
1580  neighbour->flow() != BTFT_STRONG_CHAIN)) {
1581  if (debug) {
1582  tprintf("Neighbour not strong enough:");
1583  nbox.print();
1584  }
1585  continue; // Diacritics must be attached to strong text.
1586  }
1587  if (nbox.height() < min_height) {
1588  if (debug) {
1589  tprintf("Neighbour not big enough:");
1590  nbox.print();
1591  }
1592  continue; // Too small to be the base character.
1593  }
1594  int x_gap = small_box.x_gap(nbox);
1595  int y_gap = small_box.y_gap(nbox);
1596  int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1597  true, denorm_,
1598  debug);
1599  if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1600  x_gap, y_gap, total_distance);
1601  if (total_distance >
1602  neighbour->owner()->median_size() * kMaxDiacriticDistanceRatio) {
1603  if (debug) {
1604  tprintf("Neighbour with median size %d too far away:",
1605  neighbour->owner()->median_size());
1606  neighbour->bounding_box().print();
1607  }
1608  continue; // Diacritics must not be too distant.
1609  }
1610  if (x_gap <= 0) {
1611  if (debug) {
1612  tprintf("Computing reduced box for :");
1613  nbox.print();
1614  }
1615  int left = small_box.left() - small_box.width();
1616  int right = small_box.right() + small_box.width();
1617  nbox = neighbour->BoundsWithinLimits(left, right);
1618  y_gap = small_box.y_gap(nbox);
1619  if (best_x_overlap == NULL || y_gap < best_y_gap) {
1620  best_x_overlap = neighbour;
1621  best_xbox = nbox;
1622  best_y_gap = y_gap;
1623  if (debug) {
1624  tprintf("New best:");
1625  nbox.print();
1626  }
1627  } else if (debug) {
1628  tprintf("Shrunken box doesn't win:");
1629  nbox.print();
1630  }
1631  } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1632  if (best_y_overlap == NULL || total_distance < best_total_dist) {
1633  if (debug) {
1634  tprintf("New best y overlap:");
1635  nbox.print();
1636  }
1637  best_y_overlap = neighbour;
1638  best_total_dist = total_distance;
1639  } else if (debug) {
1640  tprintf("New y overlap box doesn't win:");
1641  nbox.print();
1642  }
1643  } else if (debug) {
1644  tprintf("Neighbour wrong side of a tab:");
1645  nbox.print();
1646  }
1647  }
1648  if (best_x_overlap != NULL &&
1649  (best_y_overlap == NULL ||
1650  best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1651  blob->set_diacritic_box(best_xbox);
1652  blob->set_base_char_blob(best_x_overlap);
1653  if (debug) {
1654  tprintf("DiacriticBlob OK! (x-overlap:");
1655  small_box.print();
1656  best_xbox.print();
1657  }
1658  return true;
1659  }
1660  if (best_y_overlap != NULL &&
1661  DiacriticXGapFilled(small_grid, small_box,
1662  best_y_overlap->bounding_box()) &&
1663  NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1664  blob->set_diacritic_box(best_y_overlap->bounding_box());
1665  blob->set_base_char_blob(best_y_overlap);
1666  if (debug) {
1667  tprintf("DiacriticBlob OK! (y-overlap:");
1668  small_box.print();
1669  best_y_overlap->bounding_box().print();
1670  }
1671  return true;
1672  }
1673  if (debug) {
1674  tprintf("DiacriticBlob fails:");
1675  small_box.print();
1676  tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1677  if (best_y_overlap != NULL) {
1678  tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1679  DiacriticXGapFilled(small_grid, small_box,
1680  best_y_overlap->bounding_box()),
1681  NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1682  }
1683  }
1684  return false;
1685 }
1686 
1687 // Returns true if there is no gap between the base char and the diacritic
1688 // bigger than a fraction of the height of the base char:
1689 // Eg: line end.....'
1690 // The quote is a long way from the end of the line, yet it needs to be a
1691 // diacritic. To determine that the quote is not part of an image, or
1692 // a different text block, we check for other marks in the gap between
1693 // the base char and the diacritic.
1694 // '<--Diacritic
1695 // |---------|
1696 // | |<-toobig-gap->
1697 // | Base |<ok gap>
1698 // |---------| x<-----Dot occupying gap
1699 // The grid is const really.
1700 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1701  const TBOX& diacritic_box,
1702  const TBOX& base_box) {
1703  // Since most gaps are small, use an iterative algorithm to search the gap.
1704  int max_gap = IntCastRounded(base_box.height() *
1706  TBOX occupied_box(base_box);
1707  int diacritic_gap;
1708  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1709  TBOX search_box(occupied_box);
1710  if (diacritic_box.left() > search_box.right()) {
1711  // We are looking right.
1712  search_box.set_left(search_box.right());
1713  search_box.set_right(search_box.left() + max_gap);
1714  } else {
1715  // We are looking left.
1716  search_box.set_right(search_box.left());
1717  search_box.set_left(search_box.left() - max_gap);
1718  }
1719  BlobGridSearch rsearch(grid);
1720  rsearch.StartRectSearch(search_box);
1721  BLOBNBOX* neighbour;
1722  while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1723  const TBOX& nbox = neighbour->bounding_box();
1724  if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1725  if (nbox.left() < occupied_box.left())
1726  occupied_box.set_left(nbox.left());
1727  if (nbox.right() > occupied_box.right())
1728  occupied_box.set_right(nbox.right());
1729  break;
1730  }
1731  }
1732  if (neighbour == NULL)
1733  return false; // Found a big gap.
1734  }
1735  return true; // The gap was filled.
1736 }
1737 
1738 // Merges diacritics with the ColPartition of the base character blob.
1739 void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1740  ColPartitionGrid* part_grid) {
1741  BLOBNBOX_IT small_it(&block->noise_blobs);
1742  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1743  BLOBNBOX* blob = small_it.data();
1744  if (blob->base_char_blob() != NULL) {
1745  ColPartition* part = blob->base_char_blob()->owner();
1746  // The base character must be owned by a partition and that partition
1747  // must not be on the big_parts list (not block owned).
1748  if (part != NULL && !part->block_owned() && blob->owner() == NULL &&
1749  blob->IsDiacritic()) {
1750  // The partition has to be removed from the grid and reinserted
1751  // because its bounding box may change.
1752  part_grid->RemoveBBox(part);
1753  part->AddBox(blob);
1754  blob->set_region_type(part->blob_type());
1755  blob->set_flow(part->flow());
1756  blob->set_owner(part);
1757  part_grid->InsertBBox(true, true, part);
1758  }
1759  // Set all base chars to NULL before any blobs get deleted.
1760  blob->set_base_char_blob(NULL);
1761  }
1762  }
1763 }
1764 
1765 // Any blobs on the large_blobs list of block that are still unowned by a
1766 // ColPartition, are probably drop-cap or vertically touching so the blobs
1767 // are removed to the big_parts list and treated separately.
1768 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1769  ColPartitionGrid* part_grid,
1770  ColPartition_LIST* big_parts) {
1771  BLOBNBOX_IT large_it(&block->large_blobs);
1772  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1773  BLOBNBOX* blob = large_it.data();
1774  ColPartition* big_part = blob->owner();
1775  if (big_part == NULL) {
1776  // Large blobs should have gone into partitions by now if they are
1777  // genuine characters, so move any unowned ones out to the big parts
1778  // list. This will include drop caps and vertically touching characters.
1779  ColPartition::MakeBigPartition(blob, big_parts);
1780  }
1781  }
1782 }
1783 
1784 // All remaining unused blobs are put in individual ColPartitions.
1785 void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode,
1786  ColPartitionGrid* part_grid) {
1787  BlobGridSearch gsearch(this);
1788  BLOBNBOX* bbox;
1789  int prev_grid_x = -1;
1790  int prev_grid_y = -1;
1791  BLOBNBOX_CLIST cell_list;
1792  BLOBNBOX_C_IT cell_it(&cell_list);
1793  bool cell_all_noise = true;
1794  gsearch.StartFullSearch();
1795  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1796  int grid_x = gsearch.GridX();
1797  int grid_y = gsearch.GridY();
1798  if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1799  // New cell. Process old cell.
1800  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1801  &cell_list);
1802  cell_it.set_to_list(&cell_list);
1803  prev_grid_x = grid_x;
1804  prev_grid_y = grid_y;
1805  cell_all_noise = true;
1806  }
1807  if (bbox->owner() == NULL) {
1808  cell_it.add_to_end(bbox);
1809  if (bbox->flow() != BTFT_NONTEXT)
1810  cell_all_noise = false;
1811  } else {
1812  cell_all_noise = false;
1813  }
1814  }
1815  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1816  &cell_list);
1817 }
1818 
1819 // If combine, put all blobs in the cell_list into a single partition, otherwise
1820 // put each one into its own partition.
1821 void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode,
1822  bool combine,
1823  ColPartitionGrid* part_grid,
1824  BLOBNBOX_CLIST* cell_list) {
1825  if (cell_list->empty())
1826  return;
1827  BLOBNBOX_C_IT cell_it(cell_list);
1828  if (combine) {
1829  BLOBNBOX* bbox = cell_it.extract();
1830  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1831  part->AddBox(bbox);
1832  part->set_flow(bbox->flow());
1833  for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1834  part->AddBox(cell_it.extract());
1835  }
1836  CompletePartition(pageseg_mode, part, part_grid);
1837  } else {
1838  for (; !cell_it.empty(); cell_it.forward()) {
1839  BLOBNBOX* bbox = cell_it.extract();
1840  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1841  part->set_flow(bbox->flow());
1842  part->AddBox(bbox);
1843  CompletePartition(pageseg_mode, part, part_grid);
1844  }
1845  }
1846 }
1847 
1848 // Helper function to finish setting up a ColPartition and insert into
1849 // part_grid.
1850 void StrokeWidth::CompletePartition(PageSegMode pageseg_mode,
1851  ColPartition* part,
1852  ColPartitionGrid* part_grid) {
1853  part->ComputeLimits();
1854  TBOX box = part->bounding_box();
1855  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1856  box.bottom());
1857  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1858  // Override value if pageseg_mode disagrees.
1859  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1860  value = part->boxes_count() == 1 ? 0 : -2;
1861  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1862  value = part->boxes_count() == 1 ? 0 : 2;
1863  }
1865  part->ClaimBoxes();
1866  part_grid->InsertBBox(true, true, part);
1867 }
1868 
1869 // Merge partitions where the merge appears harmless.
1870 // As this
1871 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1872  part_grid->Merges(
1873  NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
1874  NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
1875 }
1876 
1877 // Compute a search box based on the orientation of the partition.
1878 // Returns true if a suitable box can be calculated.
1879 // Callback for EasyMerges.
1880 bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1881  if (part->IsVerticalType()) {
1882  box->set_top(box->top() + box->width());
1883  box->set_bottom(box->bottom() - box->width());
1884  } else {
1885  box->set_left(box->left() - box->height());
1886  box->set_right(box->right() + box->height());
1887  }
1888  return true;
1889 }
1890 
1891 // Merge confirmation callback for EasyMerges.
1892 bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1893  const ColPartition* p2) {
1894  ASSERT_HOST(p1 != NULL && p2 != NULL);
1895  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1896  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1897  (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1898  return false; // Don't merge confirmed image with text.
1899  if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1900  p1->HCoreOverlap(*p2) <= 0 &&
1901  ((!p1->IsSingleton() &&
1902  !p2->IsSingleton()) ||
1903  !p1->bounding_box().major_overlap(p2->bounding_box())))
1904  return false; // Overlap must be in the text line.
1905  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1906  p1->VCoreOverlap(*p2) <= 0 &&
1907  ((!p1->IsSingleton() &&
1908  !p2->IsSingleton()) ||
1909  (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1910  !p1->OKDiacriticMerge(*p2, false) &&
1911  !p2->OKDiacriticMerge(*p1, false))))
1912  return false; // Overlap must be in the text line.
1913  if (!p1->ConfirmNoTabViolation(*p2))
1914  return false;
1915  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1916  return true;
1917  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1918 }
1919 
1920 // Returns true if there is no significant noise in between the boxes.
1921 bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1922  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1923  nontext_map_);
1924 }
1925 
1929 ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
1930  int x, int y) {
1931  ScrollView* window = NULL;
1932 #ifndef GRAPHICS_DISABLED
1933  window = MakeWindow(x, y, window_name);
1934  // For every blob in the grid, display it.
1935  window->Brush(ScrollView::NONE);
1936 
1937  // For every bbox in the grid, display it.
1938  BlobGridSearch gsearch(this);
1939  gsearch.StartFullSearch();
1940  BLOBNBOX* bbox;
1941  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1942  const TBOX& box = bbox->bounding_box();
1943  int left_x = box.left();
1944  int right_x = box.right();
1945  int top_y = box.top();
1946  int bottom_y = box.bottom();
1947  int goodness = bbox->GoodTextBlob();
1948  BlobRegionType blob_type = bbox->region_type();
1949  if (bbox->UniquelyVertical())
1950  blob_type = BRT_VERT_TEXT;
1951  if (bbox->UniquelyHorizontal())
1952  blob_type = BRT_TEXT;
1953  BlobTextFlowType flow = bbox->flow();
1954  if (flow == BTFT_NONE) {
1955  if (goodness == 0)
1956  flow = BTFT_NEIGHBOURS;
1957  else if (goodness == 1)
1958  flow = BTFT_CHAIN;
1959  else
1960  flow = BTFT_STRONG_CHAIN;
1961  }
1962  window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1963  window->Rectangle(left_x, bottom_y, right_x, top_y);
1964  }
1965  window->Update();
1966 #endif
1967  return window;
1968 }
1969 
1970 static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
1971 #ifndef GRAPHICS_DISABLED
1972  const TBOX& blob_box(blob->bounding_box());
1973  int top = MAX(blob_box.top(), blob->base_char_top());
1974  int bottom = MIN(blob_box.bottom(), blob->base_char_bottom());
1975  int x = (blob_box.left() + blob_box.right()) / 2;
1976  window->Line(x, top, x, bottom);
1977 #endif // GRAPHICS_DISABLED
1978 }
1979 
1980 // Displays blobs colored according to whether or not they are diacritics.
1981 ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1982  int x, int y, TO_BLOCK* block) {
1983  ScrollView* window = NULL;
1984 #ifndef GRAPHICS_DISABLED
1985  window = MakeWindow(x, y, window_name);
1986  // For every blob in the grid, display it.
1987  window->Brush(ScrollView::NONE);
1988 
1989  BLOBNBOX_IT it(&block->blobs);
1990  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1991  BLOBNBOX* blob = it.data();
1992  if (blob->IsDiacritic()) {
1993  window->Pen(ScrollView::GREEN);
1994  DrawDiacriticJoiner(blob, window);
1995  } else {
1996  window->Pen(blob->BoxColor());
1997  }
1998  const TBOX& box = blob->bounding_box();
1999  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2000  }
2001  it.set_to_list(&block->noise_blobs);
2002  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2003  BLOBNBOX* blob = it.data();
2004  if (blob->IsDiacritic()) {
2005  window->Pen(ScrollView::GREEN);
2006  DrawDiacriticJoiner(blob, window);
2007  } else {
2008  window->Pen(ScrollView::WHITE);
2009  }
2010  const TBOX& box = blob->bounding_box();
2011  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2012  }
2013  window->Update();
2014 #endif
2015  return window;
2016 }
2017 
2018 } // namespace tesseract.
const int kLineResiduePadRatio
void SplitOverlappingPartitions(ColPartition_LIST *big_parts)
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:176
const double kNoiseOverlapGrowthFactor
bool IsVerticalType() const
Definition: colpartition.h:435
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
const double kCJKAspectRatio
Definition: strokewidth.cpp:65
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
const double kNeighbourSearchFactor
void set_leader_on_right(bool flag)
Definition: blobbox.h:352
bool overlap(const TBOX &box) const
Definition: rect.h:345
int GridY() const
Definition: bbgrid.h:247
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:34
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:617
int VCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:375
Definition: points.h:189
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool GridSmoothNeighbours(BlobTextFlowType source_type, Pix *nontext_map, const TBOX &im_box, const FCOORD &rerotation)
bool IsDiacritic() const
Definition: blobbox.h:365
int base_char_bottom() const
Definition: blobbox.h:371
const int kCJKMaxComponents
Definition: strokewidth.cpp:63
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:358
const int kCJKRadius
Definition: strokewidth.cpp:59
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:383
inT32 area() const
Definition: rect.h:118
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
void Brush(Color color)
Definition: scrollview.cpp:732
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:83
float horz_stroke_width() const
Definition: blobbox.h:322
bool ConfirmNoTabViolation(const ColPartition &other) const
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:448
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:476
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:66
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:593
const ICOORD & bleft() const
Definition: bbgrid.h:73
float area_stroke_width() const
Definition: blobbox.h:334
inT32 area()
Definition: stepblob.cpp:270
virtual void HandleClick(int x, int y)
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:61
#define tprintf(...)
Definition: tprintf.h:31
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:96
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:659
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:287
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:361
const int kLineTrapLongest
Definition: strokewidth.cpp:91
int gridsize() const
Definition: bbgrid.h:64
bool UniquelyVertical() const
Definition: blobbox.h:395
int GoodTextBlob() const
Definition: blobbox.cpp:221
void set_horz_possible(bool value)
Definition: blobbox.h:295
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:71
bool joined_to_prev() const
Definition: blobbox.h:241
void set_x(float xin)
rewrite function
Definition: points.h:216
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:48
C_BLOB * cblob() const
Definition: blobbox.h:253
tesseract::ColPartition * owner() const
Definition: blobbox.h:337
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
int IntCastRounded(double x)
Definition: helpers.h:179
void AddBox(BLOBNBOX *box)
void set_y(float yin)
rewrite function
Definition: points.h:220
BlobRegionType region_type() const
Definition: blobbox.h:268
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:271
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
bool horz_possible() const
Definition: blobbox.h:292
Assume a single column of text of variable sizes.
Definition: publictypes.h:157
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:439
void SetUniqueMode(bool mode)
Definition: bbgrid.h:255
void set_top(int y)
Definition: rect.h:57
const int kLineTrapShortest
Definition: strokewidth.cpp:93
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:98
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:340
int y_gap(const TBOX &box) const
Definition: rect.h:225
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:300
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:195
void Merges(TessResultCallback2< bool, ColPartition *, TBOX *> *box_cb, TessResultCallback2< bool, const ColPartition *, const ColPartition *> *confirm_cb)
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
BBC * NextFullSearch()
Definition: bbgrid.h:679
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Pix *nontext_map)
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:771
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:328
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
int textord_debug_tabfind
Definition: alignedblob.cpp:27
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:98
void set_vert_possible(bool value)
Definition: blobbox.h:289
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
bool leader_on_left() const
Definition: blobbox.h:343
void ClearNeighbours()
Definition: blobbox.h:494
const TBOX & bounding_box() const
Definition: colpartition.h:109
static void Update()
Definition: scrollview.cpp:715
const double kStrokeWidthCJK
Definition: strokewidth.cpp:56
const ICOORD & tright() const
Definition: bbgrid.h:76
int ComputeTotalOverlap(ColPartitionGrid **overlap_grid)
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
#define INT_VAR(name, val, comment)
Definition: params.h:276
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:67
void DeleteUnownedNoise()
Definition: blobbox.cpp:1033
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:750
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:387
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:702
void SetRegionAndFlowTypesFromProjectionValue(int value)
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:91
void pad(int xpad, int ypad)
Definition: rect.h:127
bool leader_on_right() const
Definition: blobbox.h:349
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:355
bool contains(const FCOORD pt) const
Definition: rect.h:323
float vert_stroke_width() const
Definition: blobbox.h:328
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
static bool WithinTestRegion(int detail_level, int x, int y)
bool IsSingleton() const
Definition: colpartition.h:361
const double kLineResidueSizeRatio
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
int right_rule() const
Definition: blobbox.h:304
Definition: rect.h:30
const float kSizeRatioToReject
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:408
void set_leader_on_left(bool flag)
Definition: blobbox.h:346
#define MIN(x, y)
Definition: ndminx.h:28
bool block_owned() const
Definition: colpartition.h:205
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:415
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
bool vert_possible() const
Definition: blobbox.h:286
bool UniquelyHorizontal() const
Definition: blobbox.h:398
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:765
inT16 height() const
Definition: rect.h:104
BBC * NextRectSearch()
Definition: bbgrid.h:846
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
float y() const
Definition: points.h:212
inT16 right() const
Definition: rect.h:75
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:53
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:414
inT16 width() const
Definition: rect.h:111
void set_right(int x)
Definition: rect.h:78
void set_left(int x)
Definition: rect.h:71
BlobTextFlowType
Definition: blobbox.h:99
int textord_tabfind_show_strokewidths
Definition: strokewidth.cpp:44
Definition: statistc.h:33
void print() const
Definition: rect.h:270
void StartFullSearch()
Definition: bbgrid.h:669
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:390
double ile(double frac) const
Definition: statistc.cpp:174
inT16 bottom() const
Definition: rect.h:61
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:834
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:74
void set_bottom(int y)
Definition: rect.h:64
bool textord_tabfind_only_strokewidths
Definition: strokewidth.cpp:45
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:69
inT32 perimeter()
Definition: stepblob.cpp:289
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:55
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
BBC * NextRadSearch()
Definition: bbgrid.h:717
BlobRegionType
Definition: blobbox.h:57
bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const
BlobNeighbourDir
Definition: blobbox.h:72
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:570
int GridX() const
Definition: bbgrid.h:244
PartitionFindResult
Definition: strokewidth.h:46
bool IsHorizontalType() const
Definition: colpartition.h:439
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:247
float x() const
Definition: points.h:209
const TBOX & bounding_box() const
Definition: blobbox.h:215
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:80
BlobTextFlowType flow() const
Definition: blobbox.h:280
void compute_bounding_box()
Definition: blobbox.h:225
void set_owns_cblob(bool value)
Definition: blobbox.h:393
int base_char_top() const
Definition: blobbox.h:368
const double kNoiseOverlapAreaFactor
BlobTextFlowType flow() const
Definition: colpartition.h:154
BlobRegionType blob_type() const
Definition: colpartition.h:148
void Pen(Color color)
Definition: scrollview.cpp:726
int HCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:381
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:77
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Pix *nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
integer coordinate
Definition: points.h:30
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:86
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
void RemoveLineResidue(ColPartition_LIST *big_part_list)
int x_gap(const TBOX &box) const
Definition: rect.h:217