tesseract  4.00.00dev
tospace.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 /**********************************************************************
11  * tospace.cpp
12  *
13  * Compute fuzzy word spacing thresholds for each row.
14  * I.e. set : max_nonspace
15  * space_threshold
16  * min_space
17  * kern_size
18  * space_size
19  * for each row.
20  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21  *
22  * Note: functions in this file were originally not members of any
23  * class or enclosed by any namespace. Now they are all static members
24  * of the Textord class.
25  *
26  **********************************************************************/
27 
28 #include "drawtord.h"
29 #include "ndminx.h"
30 #include "statistc.h"
31 #include "textord.h"
32 #include "tovars.h"
33 
34 // Include automatically generated configuration file if running autoconf.
35 #ifdef HAVE_CONFIG_H
36 #include "config_auto.h"
37 #endif
38 
39 #define MAXSPACING 128 /*max expected spacing in pix */
40 
41 namespace tesseract {
43  ICOORD page_tr, //topright of page
44  TO_BLOCK_LIST *blocks //blocks on page
45  ) {
46  TO_BLOCK_IT block_it; //iterator
47  TO_BLOCK *block; //current block;
48  TO_ROW_IT row_it; //row iterator
49  TO_ROW *row; //current row
50  int block_index; //block number
51  int row_index; //row number
52  //estimated width of real spaces for whole block
53  inT16 block_space_gap_width;
54  //estimated width of non space gaps for whole block
55  inT16 block_non_space_gap_width;
56  BOOL8 old_text_ord_proportional;//old fixed/prop result
57  GAPMAP *gapmap = NULL; //map of big vert gaps in blk
58 
59  block_it.set_to_list (blocks);
60  block_index = 1;
61  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
62  block_it.forward ()) {
63  block = block_it.data ();
64  gapmap = new GAPMAP (block);
65  block_spacing_stats(block,
66  gapmap,
67  old_text_ord_proportional,
68  block_space_gap_width,
69  block_non_space_gap_width);
70  // Make sure relative values of block-level space and non-space gap
71  // widths are reasonable. The ratio of 1:3 is also used in
72  // block_spacing_stats, to correct the block_space_gap_width
73  // Useful for arabic and hindi, when the non-space gap width is
74  // often over-estimated and should not be trusted. A similar ratio
75  // is found in block_spacing_stats.
77  (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
78  block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
79  }
80  row_it.set_to_list (block->get_rows ());
81  row_index = 1;
82  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
83  row = row_it.data ();
84  if ((row->pitch_decision == PITCH_DEF_PROP) ||
85  (row->pitch_decision == PITCH_CORR_PROP)) {
86  if ((tosp_debug_level > 0) && !old_text_ord_proportional)
87  tprintf ("Block %d Row %d: Now Proportional\n",
88  block_index, row_index);
89  row_spacing_stats(row,
90  gapmap,
91  block_index,
92  row_index,
93  block_space_gap_width,
94  block_non_space_gap_width);
95  }
96  else {
97  if ((tosp_debug_level > 0) && old_text_ord_proportional)
98  tprintf
99  ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
100  block_index, row_index, row->pitch_decision,
101  row->fixed_pitch);
102  }
103 #ifndef GRAPHICS_DISABLED
106 #endif
107  row_index++;
108  }
109  delete gapmap;
110  block_index++;
111  }
112 }
113 
114 
115 /*************************************************************************
116  * block_spacing_stats()
117  *************************************************************************/
118 
119 void Textord::block_spacing_stats(
120  TO_BLOCK *block,
121  GAPMAP *gapmap,
122  BOOL8 &old_text_ord_proportional,
123  inT16 &block_space_gap_width, // resulting estimate
124  inT16 &block_non_space_gap_width // resulting estimate
125  ) {
126  TO_ROW_IT row_it; // row iterator
127  TO_ROW *row; // current row
128  BLOBNBOX_IT blob_it; // iterator
129 
130  STATS centre_to_centre_stats (0, MAXSPACING);
131  // DEBUG USE ONLY
132  STATS all_gap_stats (0, MAXSPACING);
133  STATS space_gap_stats (0, MAXSPACING);
134  inT16 minwidth = MAXSPACING; // narrowest blob
135  TBOX blob_box;
136  TBOX prev_blob_box;
137  inT16 centre_to_centre;
138  inT16 gap_width;
139  float real_space_threshold;
140  float iqr_centre_to_centre; // DEBUG USE ONLY
141  float iqr_all_gap_stats; // DEBUG USE ONLY
142  inT32 end_of_row;
143  inT32 row_length;
144 
145  row_it.set_to_list (block->get_rows ());
146  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
147  row = row_it.data ();
148  if (!row->blob_list ()->empty () &&
150  (row->pitch_decision == PITCH_DEF_PROP) ||
151  (row->pitch_decision == PITCH_CORR_PROP))) {
152  blob_it.set_to_list (row->blob_list ());
153  blob_it.mark_cycle_pt ();
154  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
156  blob_box = box_next_pre_chopped (&blob_it);
157  else if (tosp_stats_use_xht_gaps)
158  blob_box = reduced_box_next (row, &blob_it);
159  else
160  blob_box = box_next (&blob_it);
161  row_length = end_of_row - blob_box.left ();
162  if (blob_box.width () < minwidth)
163  minwidth = blob_box.width ();
164  prev_blob_box = blob_box;
165  while (!blob_it.cycled_list ()) {
167  blob_box = box_next_pre_chopped (&blob_it);
168  else if (tosp_stats_use_xht_gaps)
169  blob_box = reduced_box_next (row, &blob_it);
170  else
171  blob_box = box_next (&blob_it);
172  if (blob_box.width () < minwidth)
173  minwidth = blob_box.width ();
174  gap_width = blob_box.left () - prev_blob_box.right ();
175  if (!ignore_big_gap (row, row_length, gapmap,
176  prev_blob_box.right (), blob_box.left ())) {
177  all_gap_stats.add (gap_width, 1);
178 
179  centre_to_centre = (blob_box.left () + blob_box.right () -
180  (prev_blob_box.left () +
181  prev_blob_box.right ())) / 2;
182  //DEBUG
183  centre_to_centre_stats.add (centre_to_centre, 1);
184  // DEBUG
185  }
186  prev_blob_box = blob_box;
187  }
188  }
189  }
190 
191  //Inadequate samples
192  if (all_gap_stats.get_total () <= 1) {
193  block_non_space_gap_width = minwidth;
194  block_space_gap_width = -1; //No est. space width
195  //DEBUG
196  old_text_ord_proportional = TRUE;
197  }
198  else {
199  /* For debug only ..... */
200  iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
201  centre_to_centre_stats.ile (0.25);
202  iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
203  old_text_ord_proportional =
204  iqr_centre_to_centre * 2 > iqr_all_gap_stats;
205  /* .......For debug only */
206 
207  /*
208  The median of the gaps is used as an estimate of the NON-SPACE gap width.
209  This RELIES on the assumption that there are more gaps WITHIN words than
210  BETWEEN words in a block
211 
212  Now try to estimate the width of a real space for all real spaces in the
213  block. Do this by using a crude threshold to ignore "narrow" gaps, then
214  find the median of the "wide" gaps and use this.
215  */
216  block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
217  // median gap
218 
219  row_it.set_to_list (block->get_rows ());
220  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
221  row = row_it.data ();
222  if (!row->blob_list ()->empty () &&
224  (row->pitch_decision == PITCH_DEF_PROP) ||
225  (row->pitch_decision == PITCH_CORR_PROP))) {
226  real_space_threshold =
227  MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
229  blob_it.set_to_list (row->blob_list ());
230  blob_it.mark_cycle_pt ();
231  end_of_row =
232  blob_it.data_relative (-1)->bounding_box ().right ();
234  blob_box = box_next_pre_chopped (&blob_it);
235  else if (tosp_stats_use_xht_gaps)
236  blob_box = reduced_box_next (row, &blob_it);
237  else
238  blob_box = box_next (&blob_it);
239  row_length = blob_box.left () - end_of_row;
240  prev_blob_box = blob_box;
241  while (!blob_it.cycled_list ()) {
243  blob_box = box_next_pre_chopped (&blob_it);
244  else if (tosp_stats_use_xht_gaps)
245  blob_box = reduced_box_next (row, &blob_it);
246  else
247  blob_box = box_next (&blob_it);
248  gap_width = blob_box.left () - prev_blob_box.right ();
249  if ((gap_width > real_space_threshold) &&
250  !ignore_big_gap (row, row_length, gapmap,
251  prev_blob_box.right (),
252  blob_box.left ())) {
253  /*
254  If tosp_use_cert_spaces is enabled, the estimate of the space gap is
255  restricted to obvious spaces - those wider than half the xht or those
256  with wide blobs on both sides - i.e not things that are suspect 1's or
257  punctuation that is sometimes widely spaced.
258  */
260  (gap_width >
262  ||
263  ((gap_width >
266  || (!narrow_blob (row, prev_blob_box)
267  && !narrow_blob (row, blob_box))))
268  || (wide_blob (row, prev_blob_box)
269  && wide_blob (row, blob_box)))
270  space_gap_stats.add (gap_width, 1);
271  }
272  prev_blob_box = blob_box;
273  }
274  }
275  }
276  //Inadequate samples
277  if (space_gap_stats.get_total () <= 2)
278  block_space_gap_width = -1;//No est. space width
279  else
280  block_space_gap_width =
281  MAX ((inT16) floor (space_gap_stats.median ()),
282  3 * block_non_space_gap_width);
283  }
284 }
285 
286 
287 /*************************************************************************
288  * row_spacing_stats()
289  * Set values for min_space, max_non_space based on row stats only
290  * If failure - return 0 values.
291  *************************************************************************/
292 void Textord::row_spacing_stats(
293  TO_ROW *row,
294  GAPMAP *gapmap,
295  inT16 block_idx,
296  inT16 row_idx,
297  inT16 block_space_gap_width, //estimate for block
298  inT16 block_non_space_gap_width //estimate for block
299  ) {
300  //iterator
301  BLOBNBOX_IT blob_it = row->blob_list ();
302  STATS all_gap_stats (0, MAXSPACING);
303  STATS cert_space_gap_stats (0, MAXSPACING);
304  STATS all_space_gap_stats (0, MAXSPACING);
305  STATS small_gap_stats (0, MAXSPACING);
306  TBOX blob_box;
307  TBOX prev_blob_box;
308  inT16 gap_width;
309  inT16 real_space_threshold = 0;
310  inT16 max = 0;
311  inT16 index;
312  inT16 large_gap_count = 0;
313  BOOL8 suspected_table;
314  inT32 max_max_nonspace; //upper bound
315  BOOL8 good_block_space_estimate = block_space_gap_width > 0;
316  inT32 end_of_row;
317  inT32 row_length = 0;
318  float sane_space;
319  inT32 sane_threshold;
320 
321  /* Collect first pass stats for row */
322 
323  if (!good_block_space_estimate)
324  block_space_gap_width = inT16 (floor (row->xheight / 2));
325  if (!row->blob_list ()->empty ()) {
326  if (tosp_threshold_bias1 > 0)
327  real_space_threshold =
328  block_non_space_gap_width +
329  inT16 (floor (0.5 +
330  tosp_threshold_bias1 * (block_space_gap_width -
331  block_non_space_gap_width)));
332  else
333  real_space_threshold = //Old TO method
334  (block_space_gap_width + block_non_space_gap_width) / 2;
335  blob_it.set_to_list (row->blob_list ());
336  blob_it.mark_cycle_pt ();
337  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
339  blob_box = box_next_pre_chopped (&blob_it);
340  else if (tosp_stats_use_xht_gaps)
341  blob_box = reduced_box_next (row, &blob_it);
342  else
343  blob_box = box_next (&blob_it);
344  row_length = end_of_row - blob_box.left ();
345  prev_blob_box = blob_box;
346  while (!blob_it.cycled_list ()) {
348  blob_box = box_next_pre_chopped (&blob_it);
349  else if (tosp_stats_use_xht_gaps)
350  blob_box = reduced_box_next (row, &blob_it);
351  else
352  blob_box = box_next (&blob_it);
353  gap_width = blob_box.left () - prev_blob_box.right ();
354  if (ignore_big_gap (row, row_length, gapmap,
355  prev_blob_box.right (), blob_box.left ()))
356  large_gap_count++;
357  else {
358  if (gap_width >= real_space_threshold) {
360  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
361  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
363  || (!narrow_blob (row, prev_blob_box)
364  && !narrow_blob (row, blob_box))))
365  || (wide_blob (row, prev_blob_box)
366  && wide_blob (row, blob_box)))
367  cert_space_gap_stats.add (gap_width, 1);
368  all_space_gap_stats.add (gap_width, 1);
369  }
370  else
371  small_gap_stats.add (gap_width, 1);
372  all_gap_stats.add (gap_width, 1);
373  }
374  prev_blob_box = blob_box;
375  }
376  }
377  suspected_table = (large_gap_count > 1) ||
378  ((large_gap_count > 0) &&
379  (all_gap_stats.get_total () <= tosp_few_samples));
380 
381  /* Now determine row kern size, space size and threshold */
382 
383  if ((cert_space_gap_stats.get_total () >=
385  ((suspected_table ||
386  all_gap_stats.get_total () <= tosp_short_row) &&
387  cert_space_gap_stats.get_total () > 0)) {
388  old_to_method(row,
389  &all_gap_stats,
390  &cert_space_gap_stats,
391  &small_gap_stats,
392  block_space_gap_width,
393  block_non_space_gap_width);
394  } else {
396  !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
397  block_idx, row_idx)) {
399  tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
400  block_idx, row_idx);
401  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
402  //Use block default
403  row->space_size = block_space_gap_width;
404  if (all_gap_stats.get_total () > tosp_redo_kern_limit)
405  row->kern_size = all_gap_stats.median ();
406  else
407  row->kern_size = block_non_space_gap_width;
408  row->space_threshold =
409  inT32 (floor ((row->space_size + row->kern_size) /
411  }
412  else
413  old_to_method(row,
414  &all_gap_stats,
415  &all_space_gap_stats,
416  &small_gap_stats,
417  block_space_gap_width,
418  block_non_space_gap_width);
419  }
420  }
421 
422  if (tosp_improve_thresh && !suspected_table)
423  improve_row_threshold(row, &all_gap_stats);
424 
425  /* Now lets try to be careful not to do anything silly with tables when we
426  are ignoring big gaps*/
427  if (tosp_sanity_method == 0) {
428  if (suspected_table &&
429  (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
430  if (tosp_debug_level > 5)
431  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx,
432  row_idx, row->kern_size, row->space_threshold, row->space_size);
433  row->space_threshold =
435  row->space_size = MAX (row->space_threshold + 1, row->xheight);
436  }
437  }
438  else if (tosp_sanity_method == 1) {
439  sane_space = row->space_size;
440  /* NEVER let space size get too close to kern size */
441  if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
442  || ((row->space_size - row->kern_size) <
443  (tosp_silly_kn_sp_gap * row->xheight))) {
444  if (good_block_space_estimate &&
445  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
446  sane_space = block_space_gap_width;
447  else
448  sane_space =
449  MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
450  row->xheight / 2);
451  if (tosp_debug_level > 5)
452  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
453  block_idx, row_idx, row->kern_size, row->space_threshold,
454  row->space_size, sane_space);
455  row->space_size = sane_space;
456  row->space_threshold =
457  inT32 (floor ((row->space_size + row->kern_size) /
459  }
460  /* NEVER let threshold get VERY far away from kern */
461  sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
462  MAX (row->kern_size, 2.5)));
463  if (row->space_threshold > sane_threshold) {
464  if (tosp_debug_level > 5)
465  tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
466  block_idx, row_idx, row->kern_size, row->space_threshold,
467  row->space_size, sane_threshold);
468  row->space_threshold = sane_threshold;
469  if (row->space_size <= sane_threshold)
470  row->space_size = row->space_threshold + 1.0f;
471  }
472  /* Beware of tables - there may be NO spaces */
473  if (suspected_table) {
474  sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
476  sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
477 
478  if ((row->space_size < sane_space) ||
479  (row->space_threshold < sane_threshold)) {
480  if (tosp_debug_level > 5)
481  tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
482  block_idx, row_idx,
483  row->kern_size,
484  row->space_threshold, row->space_size);
485  //the minimum sane value
486  row->space_threshold = (inT32) sane_space;
487  row->space_size = MAX (row->space_threshold + 1, row->xheight);
488  }
489  }
490  }
491 
492  /* Now lets try to put some error limits on the threshold */
493 
494  if (tosp_old_to_method) {
495  /* Old textord made a space if gap >= threshold */
496  //NO FUZZY SPACES YET
497  row->max_nonspace = row->space_threshold;
498  //NO FUZZY SPACES YET
499  row->min_space = row->space_threshold + 1;
500  }
501  else {
502  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
503  row->min_space =
504  MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
505  inT32 (row->space_size));
506  if (row->min_space <= row->space_threshold)
507  // Don't be silly
508  row->min_space = row->space_threshold + 1;
509  /*
510  Lets try to guess the max certain kern gap by looking at the cluster of
511  kerns for the row. The row is proportional so the kerns should cluster
512  tightly at the bottom of the distribution. We also expect most gaps to be
513  kerns. Find the maximum of the kern piles between 0 and twice the kern
514  estimate. Piles before the first one with less than 1/10 the maximum
515  number of samples can be taken as certain kerns.
516 
517  Of course, there are some cases where the kern peak and space peaks merge,
518  so we will put an UPPER limit on the max certain kern gap of some fraction
519  below the threshold.
520  */
521 
522  max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
523 
524  //default
525  row->max_nonspace = max_max_nonspace;
526  for (index = 0; index <= max_max_nonspace; index++) {
527  if (all_gap_stats.pile_count (index) > max)
528  max = all_gap_stats.pile_count (index);
529  if ((index > row->kern_size) &&
530  (all_gap_stats.pile_count (index) < 0.1 * max)) {
531  row->max_nonspace = index;
532  break;
533  }
534  }
535  }
536 
537  /* Yet another algorithm - simpler this time - just choose a fraction of the
538  threshold to space range */
539 
540  if ((tosp_fuzzy_sp_fraction > 0) &&
541  (row->space_size > row->space_threshold))
542  row->min_space = MAX (row->min_space,
543  (inT32) ceil (row->space_threshold +
545  (row->space_size -
546  row->space_threshold)));
547 
548  /* Ensure that ANY space less than some multiplier times the kern size is
549  fuzzy. In tables there is a risk of erroneously setting a small space size
550  when there are no real spaces. Sometimes tables have text squashed into
551  columns so that the kn->sp ratio is small anyway - this means that we can't
552  use this to force a wider separation - hence we rely on context to join any
553  dubious breaks. */
554 
555  if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
556  (suspected_table || tosp_fuzzy_limit_all))
557  row->min_space = MAX (row->min_space,
559  row->kern_size));
560 
561  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
562  row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
564  (row->space_threshold -
565  row->kern_size));
566  }
567  if (row->max_nonspace > row->space_threshold) {
568  // Don't be silly
569  row->max_nonspace = row->space_threshold;
570  }
571 
572  if (tosp_debug_level > 5)
573  tprintf
574  ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
575  block_idx, row_idx, row_length, block_non_space_gap_width,
576  block_space_gap_width, real_space_threshold, row->kern_size,
577  row->max_nonspace, row->space_threshold, row->min_space,
578  row->space_size);
579  if (tosp_debug_level > 10)
580  tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
581  "row->space_threshold = %d\n",
582  row->kern_size, row->space_size, row->space_threshold);
583 }
584 
585 void Textord::old_to_method(
586  TO_ROW *row,
587  STATS *all_gap_stats,
588  STATS *space_gap_stats,
589  STATS *small_gap_stats,
590  inT16 block_space_gap_width, //estimate for block
591  inT16 block_non_space_gap_width //estimate for block
592  ) {
593  /* First, estimate row space size */
594  /* Old to condition was > 2 */
595  if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
596  //Adequate samples
597  /* Set space size to median of spaces BUT limits it if it seems wildly out */
598  row->space_size = space_gap_stats->median ();
599  if (row->space_size > block_space_gap_width * 1.5) {
601  row->space_size = block_space_gap_width * 1.5;
602  else
603  //BUG??? should be *1.5
604  row->space_size = block_space_gap_width;
605  }
606  if (row->space_size < (block_non_space_gap_width * 2) + 1)
607  row->space_size = (block_non_space_gap_width * 2) + 1;
608  }
609  //Only 1 or 2 samples
610  else if (space_gap_stats->get_total () >= 1) {
611  //hence mean not median
612  row->space_size = space_gap_stats->mean ();
613  if (row->space_size > block_space_gap_width * 1.5) {
615  row->space_size = block_space_gap_width * 1.5;
616  else
617  //BUG??? should be *1.5
618  row->space_size = block_space_gap_width;
619  }
620  if (row->space_size < (block_non_space_gap_width * 3) + 1)
621  row->space_size = (block_non_space_gap_width * 3) + 1;
622  }
623  else {
624  //Use block default
625  row->space_size = block_space_gap_width;
626  }
627 
628  /* Next, estimate row kern size */
630  (small_gap_stats->get_total () > tosp_redo_kern_limit))
631  row->kern_size = small_gap_stats->median ();
632  else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
633  row->kern_size = all_gap_stats->median ();
634  else //old TO -SAME FOR ALL ROWS
635  row->kern_size = block_non_space_gap_width;
636 
637  /* Finally, estimate row space threshold */
638  if (tosp_threshold_bias2 > 0) {
639  row->space_threshold =
640  inT32 (floor (0.5 + row->kern_size +
642  row->kern_size)));
643  } else {
644  /*
645  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
646  and holds this in a float. The use is with a >= test
647  NEW textord uses an integer threshold and a > test
648  It comes to the same thing.
649  (Though there is a difference in that old textord has integer space_size
650  and kern_size.)
651  */
652  row->space_threshold =
653  inT32 (floor ((row->space_size + row->kern_size) / 2));
654  }
655 
656  // Apply the same logic and ratios as in row_spacing_stats to
657  // restrict relative values of the row's space_size, kern_size, and
658  // space_threshold
660  ((row->space_size <
661  tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) ||
662  ((row->space_size - row->kern_size) <
663  tosp_silly_kn_sp_gap * row->xheight))) {
664  if (row->kern_size > 2.5)
666  row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) /
668  }
669 }
670 
671 
672 /*************************************************************************
673  * isolated_row_stats()
674  * Set values for min_space, max_non_space based on row stats only
675  *************************************************************************/
676 BOOL8 Textord::isolated_row_stats(TO_ROW *row,
677  GAPMAP *gapmap,
678  STATS *all_gap_stats,
679  BOOL8 suspected_table,
680  inT16 block_idx,
681  inT16 row_idx) {
682  float kern_estimate;
683  float crude_threshold_estimate;
684  inT16 small_gaps_count;
685  inT16 total;
686  //iterator
687  BLOBNBOX_IT blob_it = row->blob_list ();
688  STATS cert_space_gap_stats (0, MAXSPACING);
689  STATS all_space_gap_stats (0, MAXSPACING);
690  STATS small_gap_stats (0, MAXSPACING);
691  TBOX blob_box;
692  TBOX prev_blob_box;
693  inT16 gap_width;
694  inT32 end_of_row;
695  inT32 row_length;
696 
697  kern_estimate = all_gap_stats->median ();
698  crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
700  small_gaps_count = stats_count_under (all_gap_stats,
701  (inT16)
702  ceil (crude_threshold_estimate));
703  total = all_gap_stats->get_total ();
704 
705  if ((total <= tosp_redo_kern_limit) ||
706  ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
707  (total - small_gaps_count < 1)) {
708  if (tosp_debug_level > 5)
709  tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx,
710  row_idx);
711  return FALSE;
712  }
713  blob_it.set_to_list (row->blob_list ());
714  blob_it.mark_cycle_pt ();
715  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
717  blob_box = box_next_pre_chopped (&blob_it);
718  else if (tosp_stats_use_xht_gaps)
719  blob_box = reduced_box_next (row, &blob_it);
720  else
721  blob_box = box_next (&blob_it);
722  row_length = end_of_row - blob_box.left ();
723  prev_blob_box = blob_box;
724  while (!blob_it.cycled_list ()) {
726  blob_box = box_next_pre_chopped (&blob_it);
727  else if (tosp_stats_use_xht_gaps)
728  blob_box = reduced_box_next (row, &blob_it);
729  else
730  blob_box = box_next (&blob_it);
731  gap_width = blob_box.left () - prev_blob_box.right ();
732  if (!ignore_big_gap (row, row_length, gapmap,
733  prev_blob_box.right (), blob_box.left ()) &&
734  (gap_width > crude_threshold_estimate)) {
735  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
736  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
738  (!narrow_blob (row, prev_blob_box) &&
739  !narrow_blob (row, blob_box)))) ||
740  (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
741  cert_space_gap_stats.add (gap_width, 1);
742  all_space_gap_stats.add (gap_width, 1);
743  }
744  if (gap_width < crude_threshold_estimate)
745  small_gap_stats.add (gap_width, 1);
746 
747  prev_blob_box = blob_box;
748  }
749  if (cert_space_gap_stats.get_total () >=
751  //median
752  row->space_size = cert_space_gap_stats.median ();
753  else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
754  //to avoid spaced
755  row->space_size = cert_space_gap_stats.mean ();
756  // 1's in tables
757  else if (all_space_gap_stats.get_total () >=
759  //median
760  row->space_size = all_space_gap_stats.median ();
761  else
762  row->space_size = all_space_gap_stats.mean ();
763 
765  row->kern_size = small_gap_stats.median ();
766  else
767  row->kern_size = all_gap_stats->median ();
768  row->space_threshold =
769  inT32 (floor ((row->space_size + row->kern_size) / 2));
770  /* Sanity check */
771  if ((row->kern_size >= row->space_threshold) ||
772  (row->space_threshold >= row->space_size) ||
773  (row->space_threshold <= 0)) {
774  if (tosp_debug_level > 5)
775  tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
776  block_idx, row_idx,
777  row->kern_size, row->space_threshold, row->space_size);
778  row->kern_size = 0.0f;
779  row->space_threshold = 0;
780  row->space_size = 0.0f;
781  return FALSE;
782  }
783 
784  if (tosp_debug_level > 5)
785  tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
786  block_idx, row_idx,
787  row->kern_size, row->space_threshold, row->space_size);
788  return TRUE;
789 }
790 
791 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) {
792  inT16 index;
793  inT16 total = 0;
794 
795  for (index = 0; index < threshold; index++)
796  total += stats->pile_count (index);
797  return total;
798 }
799 
800 
801 /*************************************************************************
802  * improve_row_threshold()
803  * Try to recognise a "normal line" -
804  * > 25 gaps
805  * && space > 3 * kn && space > 10
806  * (I.e. reasonably large space and kn:sp ratio)
807  * && > 3/4 # gaps < kn + (sp - kn)/3
808  * (I.e. most gaps are well away from space estimate)
809  * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
810  * somewhere in the histogram between kn and sp
811  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
812  * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
813  * try moving the default threshold to within this band but leave the
814  * fuzzy limit calculation as at present.
815  *************************************************************************/
816 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
817  float sp = row->space_size;
818  float kn = row->kern_size;
819  inT16 reqd_zero_width = 0;
820  inT16 zero_width = 0;
821  inT16 zero_start = 0;
822  inT16 index = 0;
823 
824  if (tosp_debug_level > 10)
825  tprintf ("Improve row threshold 0");
826  if ((all_gap_stats->get_total () <= 25) ||
827  (sp <= 10) ||
828  (sp <= 3 * kn) ||
829  (stats_count_under (all_gap_stats,
830  (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
831  (0.75 * all_gap_stats->get_total ())))
832  return;
833  if (tosp_debug_level > 10)
834  tprintf (" 1");
835  /*
836  Look for the first region of all 0's in the histogram which is wider than
837  max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
838  threshold is not within it, move the threshold so that is is just inside it.
839  */
840  reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
841  if (reqd_zero_width < 3)
842  reqd_zero_width = 3;
843 
844  for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
845  if (all_gap_stats->pile_count (index) == 0) {
846  if (zero_width == 0)
847  zero_start = index;
848  zero_width++;
849  }
850  else {
851  if (zero_width >= reqd_zero_width)
852  break;
853  else {
854  zero_width = 0;
855  }
856  }
857  }
858  index--;
859  if (tosp_debug_level > 10)
860  tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
861  reqd_zero_width, zero_width, zero_start, row->space_threshold);
862  if ((zero_width < reqd_zero_width) ||
863  ((row->space_threshold >= zero_start) &&
864  (row->space_threshold <= index)))
865  return;
866  if (tosp_debug_level > 10)
867  tprintf (" 2");
868  if (row->space_threshold < zero_start) {
869  if (tosp_debug_level > 5)
870  tprintf
871  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
872  kn, sp, zero_start, index, row->space_threshold, zero_start);
873  row->space_threshold = zero_start;
874  }
875  if (row->space_threshold > index) {
876  if (tosp_debug_level > 5)
877  tprintf
878  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
879  kn, sp, zero_start, index, row->space_threshold, index);
880  row->space_threshold = index;
881  }
882 }
883 
884 
885 /**********************************************************************
886  * make_prop_words
887  *
888  * Convert a TO_BLOCK to a BLOCK.
889  **********************************************************************/
891  TO_ROW *row, // row to make
892  FCOORD rotation // for drawing
893  ) {
894  BOOL8 bol; // start of line
895  /* prev_ values are for start of word being built. non prev_ values are for
896  the gap between the word being built and the next one. */
897  BOOL8 prev_fuzzy_sp; // probably space
898  BOOL8 prev_fuzzy_non; // probably not
899  uinT8 prev_blanks; // in front of word
900  BOOL8 fuzzy_sp = false; // probably space
901  BOOL8 fuzzy_non = false; // probably not
902  uinT8 blanks = 0; // in front of word
903  BOOL8 prev_gap_was_a_space = FALSE;
904  BOOL8 break_at_next_gap = FALSE;
905  ROW *real_row; // output row
906  C_OUTLINE_IT cout_it;
907  C_BLOB_LIST cblobs;
908  C_BLOB_IT cblob_it = &cblobs;
909  WERD_LIST words;
910  WERD_IT word_it; // new words
911  WERD *word; // new word
912  WERD_IT rep_char_it; // repeated char words
913  inT32 next_rep_char_word_right = MAX_INT32;
914  float repetition_spacing; // gap between repetitions
915  inT32 xstarts[2]; // row ends
916  inT32 prev_x; // end of prev blob
917  BLOBNBOX *bblob; // current blob
918  TBOX blob_box; // bounding box
919  BLOBNBOX_IT box_it; // iterator
920  TBOX prev_blob_box;
921  TBOX next_blob_box;
922  inT16 prev_gap = MAX_INT16;
923  inT16 current_gap = MAX_INT16;
924  inT16 next_gap = MAX_INT16;
925  inT16 prev_within_xht_gap = MAX_INT16;
926  inT16 current_within_xht_gap = MAX_INT16;
927  inT16 next_within_xht_gap = MAX_INT16;
928  inT16 word_count = 0;
929 
930  rep_char_it.set_to_list (&(row->rep_words));
931  if (!rep_char_it.empty ()) {
932  next_rep_char_word_right =
933  rep_char_it.data ()->bounding_box ().right ();
934  }
935 
936  prev_x = -MAX_INT16;
937  cblob_it.set_to_list (&cblobs);
938  box_it.set_to_list (row->blob_list ());
939  word_it.set_to_list (&words);
940  bol = TRUE;
941  prev_blanks = 0;
942  prev_fuzzy_sp = FALSE;
943  prev_fuzzy_non = FALSE;
944  if (!box_it.empty ()) {
945  xstarts[0] = box_it.data ()->bounding_box ().left ();
946  if (xstarts[0] > next_rep_char_word_right) {
947  /* We need to insert a repeated char word at the start of the row */
948  word = rep_char_it.extract ();
949  word_it.add_after_then_move (word);
950  /* Set spaces before repeated char word */
951  word->set_flag (W_BOL, TRUE);
952  bol = FALSE;
953  word->set_blanks (0);
954  //NO uncertainty
955  word->set_flag (W_FUZZY_SP, FALSE);
956  word->set_flag (W_FUZZY_NON, FALSE);
957  xstarts[0] = word->bounding_box ().left ();
958  /* Set spaces after repeated char word (and leave current word set) */
959  repetition_spacing = find_mean_blob_spacing (word);
960  current_gap = box_it.data ()->bounding_box ().left () -
961  next_rep_char_word_right;
962  current_within_xht_gap = current_gap;
963  if (current_gap > tosp_rep_space * repetition_spacing) {
964  prev_blanks = (uinT8) floor (current_gap / row->space_size);
965  if (prev_blanks < 1)
966  prev_blanks = 1;
967  }
968  else
969  prev_blanks = 0;
970  if (tosp_debug_level > 5)
971  tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
972  box_it.data ()->bounding_box ().left (),
973  box_it.data ()->bounding_box ().bottom (),
974  repetition_spacing, current_gap);
975  prev_fuzzy_sp = FALSE;
976  prev_fuzzy_non = FALSE;
977  if (rep_char_it.empty ()) {
978  next_rep_char_word_right = MAX_INT32;
979  }
980  else {
981  rep_char_it.forward ();
982  next_rep_char_word_right =
983  rep_char_it.data ()->bounding_box ().right ();
984  }
985  }
986 
987  peek_at_next_gap(row,
988  box_it,
989  next_blob_box,
990  next_gap,
991  next_within_xht_gap);
992  do {
993  bblob = box_it.data ();
994  blob_box = bblob->bounding_box ();
995  if (bblob->joined_to_prev ()) {
996  if (bblob->cblob () != NULL) {
997  cout_it.set_to_list (cblob_it.data ()->out_list ());
998  cout_it.move_to_last ();
999  cout_it.add_list_after (bblob->cblob ()->out_list ());
1000  delete bblob->cblob ();
1001  }
1002  } else {
1003  if (bblob->cblob() != NULL)
1004  cblob_it.add_after_then_move (bblob->cblob ());
1005  prev_x = blob_box.right ();
1006  }
1007  box_it.forward (); //next one
1008  bblob = box_it.data ();
1009  blob_box = bblob->bounding_box ();
1010 
1011  if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
1012  /* Real Blob - not multiple outlines or pre-chopped */
1013  prev_gap = current_gap;
1014  prev_within_xht_gap = current_within_xht_gap;
1015  prev_blob_box = next_blob_box;
1016  current_gap = next_gap;
1017  current_within_xht_gap = next_within_xht_gap;
1018  peek_at_next_gap(row,
1019  box_it,
1020  next_blob_box,
1021  next_gap,
1022  next_within_xht_gap);
1023 
1024  inT16 prev_gap_arg = prev_gap;
1025  inT16 next_gap_arg = next_gap;
1026  if (tosp_only_use_xht_gaps) {
1027  prev_gap_arg = prev_within_xht_gap;
1028  next_gap_arg = next_within_xht_gap;
1029  }
1030  // Decide if a word-break should be inserted
1031  if (blob_box.left () > next_rep_char_word_right ||
1032  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1033  current_gap, current_within_xht_gap,
1034  next_blob_box, next_gap_arg,
1035  blanks, fuzzy_sp, fuzzy_non,
1036  prev_gap_was_a_space,
1037  break_at_next_gap) ||
1038  box_it.at_first()) {
1039  /* Form a new word out of the blobs collected */
1040  word = new WERD (&cblobs, prev_blanks, NULL);
1041  word_count++;
1042  word_it.add_after_then_move (word);
1043  if (bol) {
1044  word->set_flag (W_BOL, TRUE);
1045  bol = FALSE;
1046  }
1047  if (prev_fuzzy_sp)
1048  //probably space
1049  word->set_flag (W_FUZZY_SP, TRUE);
1050  else if (prev_fuzzy_non)
1051  word->set_flag (W_FUZZY_NON, TRUE);
1052  //probably not
1053 
1054  if (blob_box.left () > next_rep_char_word_right) {
1055  /* We need to insert a repeated char word */
1056  word = rep_char_it.extract ();
1057  word_it.add_after_then_move (word);
1058 
1059  /* Set spaces before repeated char word */
1060  repetition_spacing = find_mean_blob_spacing (word);
1061  current_gap = word->bounding_box ().left () - prev_x;
1062  current_within_xht_gap = current_gap;
1063  if (current_gap > tosp_rep_space * repetition_spacing) {
1064  blanks =
1065  (uinT8) floor (current_gap / row->space_size);
1066  if (blanks < 1)
1067  blanks = 1;
1068  }
1069  else
1070  blanks = 0;
1071  if (tosp_debug_level > 5)
1072  tprintf
1073  ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1074  word->bounding_box ().left (),
1075  word->bounding_box ().bottom (),
1076  repetition_spacing, current_gap, blanks);
1077  word->set_blanks (blanks);
1078  //NO uncertainty
1079  word->set_flag (W_FUZZY_SP, FALSE);
1080  word->set_flag (W_FUZZY_NON, FALSE);
1081 
1082  /* Set spaces after repeated char word (and leave current word set) */
1083  current_gap =
1084  blob_box.left () - next_rep_char_word_right;
1085  if (current_gap > tosp_rep_space * repetition_spacing) {
1086  blanks = (uinT8) (current_gap / row->space_size);
1087  if (blanks < 1)
1088  blanks = 1;
1089  }
1090  else
1091  blanks = 0;
1092  if (tosp_debug_level > 5)
1093  tprintf (" Rgap:%d (%d blanks)\n",
1094  current_gap, blanks);
1095  fuzzy_sp = FALSE;
1096  fuzzy_non = FALSE;
1097 
1098  if (rep_char_it.empty ()) {
1099  next_rep_char_word_right = MAX_INT32;
1100  }
1101  else {
1102  rep_char_it.forward ();
1103  next_rep_char_word_right =
1104  rep_char_it.data ()->bounding_box ().right ();
1105  }
1106  }
1107 
1108  if (box_it.at_first () && rep_char_it.empty ()) {
1109  //at end of line
1110  word->set_flag (W_EOL, TRUE);
1111  xstarts[1] = prev_x;
1112  }
1113  else {
1114  prev_blanks = blanks;
1115  prev_fuzzy_sp = fuzzy_sp;
1116  prev_fuzzy_non = fuzzy_non;
1117  }
1118  }
1119  }
1120  }
1121  while (!box_it.at_first ()); //until back at start
1122 
1123  /* Insert any further repeated char words */
1124  while (!rep_char_it.empty ()) {
1125  word = rep_char_it.extract ();
1126  word_it.add_after_then_move (word);
1127 
1128  /* Set spaces before repeated char word */
1129  repetition_spacing = find_mean_blob_spacing (word);
1130  current_gap = word->bounding_box ().left () - prev_x;
1131  if (current_gap > tosp_rep_space * repetition_spacing) {
1132  blanks = (uinT8) floor (current_gap / row->space_size);
1133  if (blanks < 1)
1134  blanks = 1;
1135  }
1136  else
1137  blanks = 0;
1138  if (tosp_debug_level > 5)
1139  tprintf(
1140  "Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1141  word->bounding_box().left(), word->bounding_box().bottom(),
1142  repetition_spacing, current_gap, blanks);
1143  word->set_blanks (blanks);
1144  //NO uncertainty
1145  word->set_flag (W_FUZZY_SP, FALSE);
1146  word->set_flag (W_FUZZY_NON, FALSE);
1147  prev_x = word->bounding_box ().right ();
1148  if (rep_char_it.empty ()) {
1149  //at end of line
1150  word->set_flag (W_EOL, TRUE);
1151  xstarts[1] = prev_x;
1152  }
1153  else {
1154  rep_char_it.forward ();
1155  }
1156  }
1157  real_row = new ROW (row,
1158  (inT16) row->kern_size, (inT16) row->space_size);
1159  word_it.set_to_list (real_row->word_list ());
1160  //put words in row
1161  word_it.add_list_after (&words);
1162  real_row->recalc_bounding_box ();
1163 
1164  if (tosp_debug_level > 4) {
1165  tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1166  word_count,
1167  real_row->bounding_box ().left (),
1168  real_row->bounding_box ().bottom (),
1169  real_row->bounding_box ().right (),
1170  real_row->bounding_box ().top ());
1171  }
1172  return real_row;
1173  }
1174  return NULL;
1175 }
1176 
1177 /**********************************************************************
1178  * make_blob_words
1179  *
1180  * Builds a ROW in which each blob (plus any parts joined to it) is its own word.
1181  * Used for chopper test.
1182  **********************************************************************/
1184  TO_ROW *row, // row to make
1185  FCOORD rotation // for drawing
1186  ) {
1187  bool bol; // start of line
1188  ROW *real_row; // output row
1189  C_OUTLINE_IT cout_it;
1190  C_BLOB_LIST cblobs;
1191  C_BLOB_IT cblob_it = &cblobs;
1192  WERD_LIST words;
1193  WERD_IT word_it; // new words
1194  WERD *word; // new word
1195  BLOBNBOX *bblob; // current blob
1196  TBOX blob_box; // bounding box
1197  BLOBNBOX_IT box_it; // iterator
1198  inT16 word_count = 0;
1199 
1200  cblob_it.set_to_list(&cblobs);
1201  box_it.set_to_list(row->blob_list());
1202  word_it.set_to_list(&words);
1203  bol = TRUE;
1204  if (!box_it.empty()) {
1205 
1206  do {
1207  bblob = box_it.data();
1208  blob_box = bblob->bounding_box();
1209  if (bblob->joined_to_prev()) {
1210  if (bblob->cblob() != NULL) {
1211  cout_it.set_to_list(cblob_it.data()->out_list());
1212  cout_it.move_to_last();
1213  cout_it.add_list_after(bblob->cblob()->out_list());
1214  delete bblob->cblob();
1215  }
1216  } else {
1217  if (bblob->cblob() != NULL)
1218  cblob_it.add_after_then_move(bblob->cblob());
1219  }
1220  box_it.forward(); // next one
1221  bblob = box_it.data();
1222  blob_box = bblob->bounding_box();
1223 
1224  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1225  word = new WERD(&cblobs, 1, NULL);
1226  word_count++;
1227  word_it.add_after_then_move(word);
1228  if (bol) {
1229  word->set_flag(W_BOL, TRUE);
1230  bol = FALSE;
1231  }
1232  if (box_it.at_first()) { // at end of line
1233  word->set_flag(W_EOL, TRUE);
1234  }
1235  }
1236  }
1237  while (!box_it.at_first()); // until back at start
1238  /* Setup the row with created words. */
1239  real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
1240  word_it.set_to_list(real_row->word_list());
1241  //put words in row
1242  word_it.add_list_after(&words);
1243  real_row->recalc_bounding_box();
1244  if (tosp_debug_level > 4) {
1245  tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1246  word_count,
1247  real_row->bounding_box().left(),
1248  real_row->bounding_box().bottom(),
1249  real_row->bounding_box().right(),
1250  real_row->bounding_box().top());
1251  }
1252  return real_row;
1253  }
1254  return NULL;
1255 }
1256 
/**********************************************************************
 * make_a_word_break
 *
 * Decide whether the gap in front of blob_box is a word break.
 * Returns TRUE if the word being built should be ended at this gap.
 *
 * Outputs (by reference): blanks, fuzzy_sp and fuzzy_non describe the
 * break and are only meaningful when TRUE is returned.
 * prev_gap_was_a_space and break_at_next_gap carry state from one call to
 * the next: if break_at_next_gap is set on entry it is cleared and TRUE is
 * returned immediately, without touching the other outputs.
 *
 * Two methods are implemented: the "old" one (tosp_old_to_method), which
 * only compares the gap with the row thresholds, and the heuristic one,
 * which starts from the space_threshold test and then lets numbered rules
 * (the ids passed to mark_gap) flip or fuzz the decision.
 *
 * NOTE(review): this is a numbered listing and lines 1286, 1291, 1361,
 * 1390, 1392, 1405, 1426, 1445, 1464 and 1544 are absent from it, so several
 * conditions below are shown incomplete. Consult the real file before
 * changing any of them.
 **********************************************************************/
1257 BOOL8 Textord::make_a_word_break(
1258  TO_ROW *row, // row being made
1259  TBOX blob_box, // for next_blob // how many blanks?
1260  inT16 prev_gap,
1261  TBOX prev_blob_box,
1262  inT16 real_current_gap,
1263  inT16 within_xht_current_gap,
1264  TBOX next_blob_box,
1265  inT16 next_gap,
1266  uinT8 &blanks,
1267  BOOL8 &fuzzy_sp,
1268  BOOL8 &fuzzy_non,
1269  BOOL8& prev_gap_was_a_space,
1270  BOOL8& break_at_next_gap) {
1271  BOOL8 space;
1272  inT16 current_gap;
1273  float fuzzy_sp_to_kn_limit;
1274 
      // A break was requested by the previous call (see the punctuation rule
      // below): honour it once and reset the request.
1275  if (break_at_next_gap) {
1276  break_at_next_gap = FALSE;
1277  return TRUE;
1278  }
1279  /* Inhibit using the reduced gap if
1280  The kerning is large - chars are not kerned and reducing "f"s can cause
1281  erroneous blanks
1282  OR The real gap is less than 0
1283  OR The real gap is less than the kerning estimate
1284  */
1285  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
      // NOTE(review): listing line 1286 (part of this condition) is missing.
1287  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1288  //Ignore the difference
1289  within_xht_current_gap = real_current_gap;
1290 
      // NOTE(review): listing line 1291 (the condition selecting between the
      // two gap measures) is missing.
1292  current_gap = within_xht_current_gap;
1293  else
1294  current_gap = real_current_gap;
1295 
1296  if (tosp_old_to_method) {
1297  //Boring old method
      // A gap above max_nonspace is a break. Between max_nonspace and
      // min_space the break is marked fuzzy, leaning to space or non-space
      // according to space_threshold; at or above min_space it is a definite
      // space of at least one blank.
1298  space = current_gap > row->max_nonspace;
1299  if (space && (current_gap < MAX_INT16)) {
1300  if (current_gap < row->min_space) {
1301  if (current_gap > row->space_threshold) {
1302  blanks = 1;
1303  fuzzy_sp = TRUE;
1304  fuzzy_non = FALSE;
1305  }
1306  else {
1307  blanks = 0;
1308  fuzzy_sp = FALSE;
1309  fuzzy_non = TRUE;
1310  }
1311  }
1312  else {
1313  blanks = (uinT8) (current_gap / row->space_size);
1314  if (blanks < 1)
1315  blanks = 1;
1316  fuzzy_sp = FALSE;
1317  fuzzy_non = FALSE;
1318  }
1319  }
1320  return space;
1321  }
1322  else {
1323  /* New exciting heuristic method */
1324  if (prev_blob_box.null_box ()) // Beginning of row
1325  prev_gap_was_a_space = TRUE;
1326 
1327  //Default as old TO
1328  space = current_gap > row->space_threshold;
1329 
1330  /* Set defaults for the word break in case we find one. Currently there are
1331  no fuzzy spaces. Depending on the reliability of the different heuristics
1332  we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1333  be used if the function returns TRUE - ie the word is to be broken.
1334  */
      // Default blank count: the gap in units of space_size, rounded and
      // clipped to [1, MAX_UINT8]. If space_size is not above 1.0 the raw gap
      // is clipped instead, which avoids dividing by a tiny space size.
1335  int num_blanks = current_gap;
1336  if (row->space_size > 1.0f)
1337  num_blanks = IntCastRounded(current_gap / row->space_size);
1338  blanks = static_cast<uinT8>(ClipToRange(num_blanks, 1, MAX_UINT8));
1339  fuzzy_sp = FALSE;
1340  fuzzy_non = FALSE;
1341  /*
1342  If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1343  despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1344  context.
1345  */
      // Rule 20: the within-xht gap crosses max_nonspace but the real gap
      // does not.
1346  if (tosp_use_xht_gaps &&
1347  (real_current_gap <= row->max_nonspace) &&
1348  (within_xht_current_gap > row->max_nonspace)) {
1349  space = TRUE;
1350  fuzzy_non = TRUE;
1351 #ifndef GRAPHICS_DISABLED
1352  mark_gap (blob_box, 20,
1353  prev_gap, prev_blob_box.width (),
1354  current_gap, next_blob_box.width (), next_gap);
1355 #endif
1356  }
      // Rule 21: the within-xht gap crosses space_threshold but the real gap
      // does not.
1357  else if (tosp_use_xht_gaps &&
1358  (real_current_gap <= row->space_threshold) &&
1359  (within_xht_current_gap > row->space_threshold)) {
1360  space = TRUE;
      // NOTE(review): listing line 1361 (the condition choosing fuzzy_sp over
      // fuzzy_non) is missing.
1362  fuzzy_sp = TRUE;
1363  else
1364  fuzzy_non = TRUE;
1365 #ifndef GRAPHICS_DISABLED
1366  mark_gap (blob_box, 21,
1367  prev_gap, prev_blob_box.width (),
1368  current_gap, next_blob_box.width (), next_gap);
1369 #endif
1370  }
      // Rule 22: the within-xht gap reaches min_space but the real gap does
      // not; treated as a definite space.
1371  else if (tosp_use_xht_gaps &&
1372  (real_current_gap < row->min_space) &&
1373  (within_xht_current_gap >= row->min_space)) {
1374  space = TRUE;
1375 #ifndef GRAPHICS_DISABLED
1376  mark_gap (blob_box, 22,
1377  prev_gap, prev_blob_box.width (),
1378  current_gap, next_blob_box.width (), next_gap);
1379 #endif
1380  }
      // Punctuation rule: when this blob looks like punctuation and the
      // previous one does not, ask for a break at the NEXT gap. The decision
      // for the current gap stays as already computed.
1381  else if (tosp_force_wordbreak_on_punct &&
1382  !suspected_punct_blob(row, prev_blob_box) &&
1383  suspected_punct_blob(row, blob_box)) {
1384  break_at_next_gap = TRUE;
1385  }
1386  /* Now continue with normal heuristics */
1387  else if ((current_gap < row->min_space) &&
1388  (current_gap > row->space_threshold)) {
1389  /* Heuristics to turn dubious spaces to kerns */
      // NOTE(review): listing lines 1390 and 1392 (the condition and part of
      // the expression computing fuzzy_sp_to_kn_limit) are missing.
1391  fuzzy_sp_to_kn_limit = row->kern_size +
1393  (row->space_size - row->kern_size);
1394  else
1395  fuzzy_sp_to_kn_limit = 99999.0f;
1396 
1397  /* If current gap is significantly smaller than the previous space the other
1398  side of a narrow blob then this gap is a kern. */
      // Rules 1-4 share one shape: either keep the space but mark it fuzzy, or
      // flip it to a kern (space = FALSE). NOTE(review): in each rule the
      // listing line choosing between fuzzy_non and fuzzy_sp is missing
      // (1405, 1426, 1445, 1464).
1399  if ((prev_blob_box.width () > 0) &&
1400  narrow_blob (row, prev_blob_box) &&
1401  prev_gap_was_a_space &&
1402  (current_gap <= tosp_gap_factor * prev_gap)) {
1403  if ((tosp_all_flips_fuzzy) ||
1404  (current_gap > fuzzy_sp_to_kn_limit)) {
1406  fuzzy_non = TRUE;
1407  else
1408  fuzzy_sp = TRUE;
1409  }
1410  else
1411  space = FALSE;
1412 #ifndef GRAPHICS_DISABLED
1413  mark_gap (blob_box, 1,
1414  prev_gap, prev_blob_box.width (),
1415  current_gap, next_blob_box.width (), next_gap);
1416 #endif
1417  }
1418  /* If current gap not much bigger than the previous kern the other side of a
1419  narrow blob then this gap is a kern as well */
1420  else if ((prev_blob_box.width () > 0) &&
1421  narrow_blob (row, prev_blob_box) &&
1422  !prev_gap_was_a_space &&
1423  (current_gap * tosp_gap_factor <= prev_gap)) {
1424  if ((tosp_all_flips_fuzzy) ||
1425  (current_gap > fuzzy_sp_to_kn_limit)) {
1427  fuzzy_non = TRUE;
1428  else
1429  fuzzy_sp = TRUE;
1430  }
1431  else
1432  space = FALSE;
1433 #ifndef GRAPHICS_DISABLED
1434  mark_gap (blob_box, 2,
1435  prev_gap, prev_blob_box.width (),
1436  current_gap, next_blob_box.width (), next_gap);
1437 #endif
1438  }
      // Rule 3: same test as rule 1 but looking at the narrow blob and space
      // on the following side.
1439  else if ((next_blob_box.width () > 0) &&
1440  narrow_blob (row, next_blob_box) &&
1441  (next_gap > row->space_threshold) &&
1442  (current_gap <= tosp_gap_factor * next_gap)) {
1443  if ((tosp_all_flips_fuzzy) ||
1444  (current_gap > fuzzy_sp_to_kn_limit)) {
1446  fuzzy_non = TRUE;
1447  else
1448  fuzzy_sp = TRUE;
1449  }
1450  else
1451  space = FALSE;
1452 #ifndef GRAPHICS_DISABLED
1453  mark_gap (blob_box, 3,
1454  prev_gap, prev_blob_box.width (),
1455  current_gap, next_blob_box.width (), next_gap);
1456 #endif
1457  }
      // Rule 4: same test as rule 2 but looking at the narrow blob and kern
      // on the following side.
1458  else if ((next_blob_box.width () > 0) &&
1459  narrow_blob (row, next_blob_box) &&
1460  (next_gap <= row->space_threshold) &&
1461  (current_gap * tosp_gap_factor <= next_gap)) {
1462  if ((tosp_all_flips_fuzzy) ||
1463  (current_gap > fuzzy_sp_to_kn_limit)) {
1465  fuzzy_non = TRUE;
1466  else
1467  fuzzy_sp = TRUE;
1468  }
1469  else
1470  space = FALSE;
1471 #ifndef GRAPHICS_DISABLED
1472  mark_gap (blob_box, 4,
1473  prev_gap, prev_blob_box.width (),
1474  current_gap, next_blob_box.width (), next_gap);
1475 #endif
1476  }
      // Rule 6: a narrow neighbour on either side, with none of rules 1-4
      // applying: keep the space but mark it fuzzy.
1477  else if ((((next_blob_box.width () > 0) &&
1478  narrow_blob (row, next_blob_box)) ||
1479  ((prev_blob_box.width () > 0) &&
1480  narrow_blob (row, prev_blob_box)))) {
1481  fuzzy_sp = TRUE;
1482 #ifndef GRAPHICS_DISABLED
1483  mark_gap (blob_box, 6,
1484  prev_gap, prev_blob_box.width (),
1485  current_gap, next_blob_box.width (), next_gap);
1486 #endif
1487  }
1488  }
1489  else if ((current_gap > row->max_nonspace) &&
1490  (current_gap <= row->space_threshold)) {
1491 
1492  /* Heuristics to turn dubious kerns to spaces */
1493  /* TRIED THIS BUT IT MADE THINGS WORSE
1494  if ( prev_gap == MAX_INT16 )
1495  prev_gap = 0; // start of row
1496  if ( next_gap == MAX_INT16 )
1497  next_gap = 0; // end of row
1498  */
      // Rule 7: gap large relative to both neighbouring gaps and sitting
      // between two wide blobs.
1499  if ((prev_blob_box.width () > 0) &&
1500  (next_blob_box.width () > 0) &&
1501  (current_gap >=
1502  tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
1503  wide_blob (row, prev_blob_box) &&
1504  wide_blob (row, next_blob_box)) {
1505 
1506  space = TRUE;
1507  /*
1508  tosp_flip_caution is an attempt to stop the default changing in cases
1509  where there is a large difference between the kern and space estimates.
1510  See problem in 'chiefs' where "have" gets split in the quotation.
1511  */
1512  if ((tosp_flip_fuzz_kn_to_sp) &&
1513  ((tosp_flip_caution <= 0) ||
1514  (tosp_flip_caution * row->kern_size > row->space_size)))
1515  fuzzy_sp = TRUE;
1516  else
1517  fuzzy_non = TRUE;
1518 #ifndef GRAPHICS_DISABLED
1519  mark_gap (blob_box, 7,
1520  prev_gap, prev_blob_box.width (),
1521  current_gap, next_blob_box.width (), next_gap);
1522 #endif
      // Rule 8: gap above 5 and large relative to both neighbouring gaps
      // (factor2), with neither neighbour narrow or punctuation-like.
1523  } else if (prev_blob_box.width() > 0 &&
1524  next_blob_box.width() > 0 &&
1525  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1526  current_gap >=
1527  tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) &&
1528  !(narrow_blob(row, prev_blob_box) ||
1529  suspected_punct_blob(row, prev_blob_box)) &&
1530  !(narrow_blob(row, next_blob_box) ||
1531  suspected_punct_blob(row, next_blob_box))) {
1532  space = TRUE;
1533  fuzzy_non = TRUE;
1534 #ifndef GRAPHICS_DISABLED
1535  mark_gap (blob_box, 8,
1536  prev_gap, prev_blob_box.width (),
1537  current_gap, next_blob_box.width (), next_gap);
1538 #endif
1539  }
      // Rule 9: only active when tosp_kern_gap_factor3 > 0.
      // NOTE(review): listing line 1544 (part of this condition) is missing.
1540  else if ((tosp_kern_gap_factor3 > 0) &&
1541  (prev_blob_box.width () > 0) &&
1542  (next_blob_box.width () > 0) &&
1543  (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
1545  (!suspected_punct_blob (row, prev_blob_box) &&
1546  !suspected_punct_blob (row, next_blob_box)))) {
1547  space = TRUE;
1548  fuzzy_non = TRUE;
1549 #ifndef GRAPHICS_DISABLED
1550  mark_gap (blob_box, 9,
1551  prev_gap, prev_blob_box.width (),
1552  current_gap, next_blob_box.width (), next_gap);
1553 #endif
1554  }
1555  }
1556  if (tosp_debug_level > 10)
1557  tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1558  "next_gap = %d\n", space ? 1 : 0, current_gap,
1559  prev_gap, next_gap);
      // Remember for the next call whether this gap was a space; a break
      // flagged fuzzy_non does not count as one.
1560  prev_gap_was_a_space = space && !(fuzzy_non);
1561  return space;
1562  }
1563 }
1564 
// Returns TRUE when blob_box counts as a narrow blob for this row: either its
// width is at most tosp_narrow_fraction times the row x-height, or its
// width/height ratio is at most some limit.
// NOTE(review): listing line 1569, which holds the right-hand side of the
// ratio test and ends the statement, is missing from this listing.
1565 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1566  BOOL8 result;
1567  result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1568  (((float) blob_box.width () / blob_box.height ()) <=
1570  return result;
1571 }
1572 
// Returns TRUE when blob_box counts as a wide blob for this row.
// If tosp_wide_fraction > 0 the width must be at least tosp_wide_fraction
// times the row x-height, and when tosp_wide_aspect_ratio > 0 a width/height
// ratio test is applied as well. If tosp_wide_fraction is not positive, a
// blob is wide exactly when narrow_blob() says it is not narrow.
// NOTE(review): listing line 1579, which holds the right-hand side of the
// ratio test and ends the statement, is missing from this listing.
1573 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1574  BOOL8 result;
1575  if (tosp_wide_fraction > 0) {
1576  if (tosp_wide_aspect_ratio > 0)
1577  result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1578  (((float) blob_box.width () / blob_box.height ()) >
1580  else
1581  result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1582  }
1583  else
1584  result = !narrow_blob (row, blob_box);
1585  return result;
1586 }
1587 
1588 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1589  BOOL8 result;
1590  float baseline;
1591  float blob_x_centre;
1592  /* Find baseline of centre of blob */
1593  blob_x_centre = (box.right () + box.left ()) / 2.0;
1594  baseline = row->baseline.y (blob_x_centre);
1595 
1596  result = (box.height () <= 0.66 * row->xheight) ||
1597  (box.top () < baseline + row->xheight / 2.0) ||
1598  (box.bottom () > baseline + row->xheight / 2.0);
1599  return result;
1600 }
1601 
1602 
1603 void Textord::peek_at_next_gap(TO_ROW *row,
1604  BLOBNBOX_IT box_it,
1605  TBOX &next_blob_box,
1606  inT16 &next_gap,
1607  inT16 &next_within_xht_gap) {
1608  TBOX next_reduced_blob_box;
1609  TBOX bit_beyond;
1610  BLOBNBOX_IT reduced_box_it = box_it;
1611 
1612  next_blob_box = box_next (&box_it);
1613  next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1614  if (box_it.at_first ()) {
1615  next_gap = MAX_INT16;
1616  next_within_xht_gap = MAX_INT16;
1617  }
1618  else {
1619  bit_beyond = box_it.data ()->bounding_box ();
1620  next_gap = bit_beyond.left () - next_blob_box.right ();
1621  bit_beyond = reduced_box_next (row, &reduced_box_it);
1622  next_within_xht_gap =
1623  bit_beyond.left () - next_reduced_blob_box.right ();
1624  }
1625 }
1626 
1627 
1628 #ifndef GRAPHICS_DISABLED
// Debug aid, compiled only when graphics are enabled: record that heuristic
// number `rule` acted on the gap in front of `blob`.
// The rule id selects a pen colour; unknown ids fall back to BLACK. Some ids
// share a colour: 2 and 20 are CYAN, 3 and 21 GREEN, 5 and 22 MAGENTA, 4 and 9
// BLACK. An ellipse is drawn in to_win spanning the gap: x radius
// current_gap/2, y radius half the blob height, centred half a gap to the
// left of the blob's left edge at the blob's mid-height.
// When tosp_debug_level > 5 the rule, gaps and neighbouring blob widths are
// also printed.
// NOTE(review): listing line 1681 is missing; it evidently opens the block
// closed at listing line 1694 that guards the drawing calls.
1629 void Textord::mark_gap(
1630  TBOX blob, // blob following gap
1631  inT16 rule, // heuristic id
1632  inT16 prev_gap,
1633  inT16 prev_blob_width,
1634  inT16 current_gap,
1635  inT16 next_blob_width,
1636  inT16 next_gap) {
1637  ScrollView::Color col; //of ellipse marking flipped gap
1638 
1639  switch (rule) {
1640  case 1:
1641  col = ScrollView::RED;
1642  break;
1643  case 2:
1644  col = ScrollView::CYAN;
1645  break;
1646  case 3:
1647  col = ScrollView::GREEN;
1648  break;
1649  case 4:
1650  col = ScrollView::BLACK;
1651  break;
1652  case 5:
1653  col = ScrollView::MAGENTA;
1654  break;
1655  case 6:
1656  col = ScrollView::BLUE;
1657  break;
1658 
1659  case 7:
1660  col = ScrollView::WHITE;
1661  break;
1662  case 8:
1663  col = ScrollView::YELLOW;
1664  break;
1665  case 9:
1666  col = ScrollView::BLACK;
1667  break;
1668 
1669  case 20:
1670  col = ScrollView::CYAN;
1671  break;
1672  case 21:
1673  col = ScrollView::GREEN;
1674  break;
1675  case 22:
1676  col = ScrollView::MAGENTA;
1677  break;
1678  default:
1679  col = ScrollView::BLACK;
1680  }
1682  to_win->Pen(col);
1683  /* if (rule < 20)
1684  //interior_style(to_win, INT_SOLID, FALSE);
1685  else
1686  //interior_style(to_win, INT_HOLLOW, TRUE);*/
1687  //x radius
1688  to_win->Ellipse (current_gap / 2.0f,
1689  blob.height () / 2.0f, //y radius
1690  //x centre
1691  blob.left () - current_gap / 2.0f,
1692  //y centre
1693  blob.bottom () + blob.height () / 2.0f);
1694  }
1695  if (tosp_debug_level > 5)
1696  tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n",
1697  blob.left() - current_gap / 2, blob.bottom(), rule, prev_gap,
1698  prev_blob_width, current_gap, next_blob_width, next_gap);
1699 }
1700 #endif
1701 
1702 float Textord::find_mean_blob_spacing(WERD *word) {
1703  C_BLOB_IT cblob_it;
1704  TBOX blob_box;
1705  inT32 gap_sum = 0;
1706  inT16 gap_count = 0;
1707  inT16 prev_right;
1708 
1709  cblob_it.set_to_list (word->cblob_list ());
1710  if (!cblob_it.empty ()) {
1711  cblob_it.mark_cycle_pt ();
1712  prev_right = cblob_it.data ()->bounding_box ().right ();
1713  //first blob
1714  cblob_it.forward ();
1715  for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1716  blob_box = cblob_it.data ()->bounding_box ();
1717  gap_sum += blob_box.left () - prev_right;
1718  gap_count++;
1719  prev_right = blob_box.right ();
1720  }
1721  }
1722  if (gap_count > 0)
1723  return (gap_sum / (float) gap_count);
1724  else
1725  return 0.0f;
1726 }
1727 
1728 
1729 BOOL8 Textord::ignore_big_gap(TO_ROW *row,
1730  inT32 row_length,
1731  GAPMAP *gapmap,
1732  inT16 left,
1733  inT16 right) {
1734  inT16 gap = right - left + 1;
1735 
1736  if (tosp_ignore_big_gaps > 999) return FALSE; // Don't ignore
1737  if (tosp_ignore_big_gaps > 0)
1738  return (gap > tosp_ignore_big_gaps * row->xheight);
1739  if (gap > tosp_ignore_very_big_gaps * row->xheight)
1740  return TRUE;
1741  if (tosp_ignore_big_gaps == 0) {
1742  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1743  return TRUE;
1744  if ((gap > 1.75 * row->xheight) &&
1745  ((row_length > 35 * row->xheight) ||
1746  gapmap->table_gap (left, right)))
1747  return TRUE;
1748  }
1749  else {
1750  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1751  if ((gap > gapmap_big_gaps * row->xheight) &&
1752  gapmap->table_gap (left, right))
1753  return TRUE;
1754  }
1755  return FALSE;
1756 }
1757 
1758 /**********************************************************************
1759  * reduced_box_next
1760  *
1761  * Compute the bounding box of this blob with merging of x overlaps
1762  * but no pre-chopping.
1763  * Then move the iterator on to the start of the next blob.
1764  * DON'T reduce the box for small things - eg punctuation.
1765  **********************************************************************/
// Returns the reduced bounding box for the blob at *it and advances *it to the
// next blob that has a cblob and is not joined to its predecessor.
// The result is cached on the head blob via set_reduced_box(), so a later
// call on the same blob only has to advance the iterator.
// NOTE(review): listing lines 1809-1810, inside the GRAPHICS_DISABLED guard
// near the end, are missing from this listing.
1766 TBOX Textord::reduced_box_next(
1767  TO_ROW *row, // current row
1768  BLOBNBOX_IT *it // iterator to blobs
1769  ) {
1770  BLOBNBOX *blob; //current blob
1771  BLOBNBOX *head_blob; //place to store box
1772  TBOX full_box; //full blob boundg box
1773  TBOX reduced_box; //box of significant part
1774  inT16 left_above_xht; //ABOVE xht left limit
1775  inT16 new_left_above_xht; //ABOVE xht left limit
1776 
1777  blob = it->data ();
      // Fast path: the reduced box was already computed for this blob, so
      // just skip forward past any pre-chopped or joined parts.
1778  if (blob->red_box_set ()) {
1779  reduced_box = blob->reduced_box ();
1780  do {
1781  it->forward();
1782  blob = it->data();
1783  }
1784  while (blob->cblob() == NULL || blob->joined_to_prev());
1785  return reduced_box;
1786  }
1787  head_blob = blob;
1788  full_box = blob->bounding_box ();
1789  reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
      // Absorb the parts that follow: blobs without a cblob extend the full
      // box only; joined blobs extend the reduced box and may lower
      // left_above_xht.
1790  do {
1791  it->forward ();
1792  blob = it->data ();
1793  if (blob->cblob() == NULL)
1794  //was pre-chopped
1795  full_box += blob->bounding_box ();
1796  else if (blob->joined_to_prev ()) {
1797  reduced_box +=
1798  reduced_box_for_blob(blob, row, &new_left_above_xht);
1799  left_above_xht = MIN (left_above_xht, new_left_above_xht);
1800  }
1801  }
1802  //until next real blob
1803  while (blob->cblob() == NULL || blob->joined_to_prev());
1804 
      // Keep the reduced box only if it is non-empty, taller than 0.7 of the
      // x-height, and the part above the x-height starts more than
      // tosp_near_lh_edge of the reduced width to the right of the reduced
      // left edge. Otherwise fall back to the full box.
1805  if ((reduced_box.width () > 0) &&
1806  ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1807  < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1808 #ifndef GRAPHICS_DISABLED
1811 #endif
1812  }
1813  else
1814  reduced_box = full_box;
      // Cache the result on the first blob for later calls.
1815  head_blob->set_reduced_box (reduced_box);
1816  return reduced_box;
1817 }
1818 
1819 
1820 /*************************************************************************
1821  * reduced_box_for_blob()
1822  * Find box for blob which is the same height and y position as the whole blob,
1823  * but whose left limit is the left most position of the blob ABOVE the
1824  * baseline and whose right limit is the right most position of the blob BELOW
1825  * the xheight.
1826  *
1827  *
1828  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1829  * "home". Perhaps we need something which says that if the width ABOVE the
1830  * xht alone includes the whole of the reduced width, then use the full
1831  * blob box - Might still fail on italic F
1832  *
1833  * Alternatively we could be a little less severe and only reduce the
1834  * left and right edges by half the difference between the full box and
1835  * the reduced box.
1836  *
1837  * NOTE that we need to rotate all the coordinates as
1838  * find_blob_limits finds the y min and max within a specified x band
1839  *************************************************************************/
1840 TBOX Textord::reduced_box_for_blob(
1841  BLOBNBOX *blob,
1842  TO_ROW *row,
1843  inT16 *left_above_xht) {
1844  float baseline;
1845  float blob_x_centre;
1846  float left_limit;
1847  float right_limit;
1848  float junk;
1849  TBOX blob_box;
1850 
1851  /* Find baseline of centre of blob */
1852 
1853  blob_box = blob->bounding_box ();
1854  blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1855  baseline = row->baseline.y (blob_x_centre);
1856 
1857  /*
1858  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1859  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1860  */
1861  left_limit = (float) MAX_INT32;
1862  junk = (float) -MAX_INT32;
1863  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1864  static_cast<float>(MAX_INT16), left_limit, junk);
1865  if (left_limit > junk)
1866  *left_above_xht = MAX_INT16; //No area above xht
1867  else
1868  *left_above_xht = (inT16) floor (left_limit);
1869  /*
1870  Find reduced LH limit of blob - the left extent of the region ABOVE the
1871  baseline.
1872  */
1873  left_limit = (float) MAX_INT32;
1874  junk = (float) -MAX_INT32;
1875  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16),
1876  left_limit, junk);
1877 
1878  if (left_limit > junk)
1879  return TBOX (); //no area within xht so return empty box
1880  /*
1881  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1882  */
1883  junk = (float) MAX_INT32;
1884  right_limit = (float) -MAX_INT32;
1885  find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16),
1886  (baseline + row->xheight), junk, right_limit);
1887  if (junk > right_limit)
1888  return TBOX (); //no area within xht so return empty box
1889 
1890  return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
1891  ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
1892 }
1893 } // namespace tesseract
double tosp_rep_space
Definition: textord.h:341
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
double tosp_fuzzy_space_factor
Definition: textord.h:327
double tosp_silly_kn_sp_gap
Definition: textord.h:369
bool tosp_improve_thresh
Definition: textord.h:301
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:660
double tosp_ignore_big_gaps
Definition: textord.h:339
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
void plot(ScrollView *fd) const
Definition: rect.h:278
bool tosp_old_to_method
Definition: textord.h:263
#define TRUE
Definition: capi.h:45
Definition: points.h:189
inT32 get_total() const
Definition: statistc.h:86
int32_t inT32
Definition: host.h:38
inT32 max_nonspace
Definition: blobbox.h:660
double tosp_threshold_bias2
Definition: textord.h:318
PITCH_TYPE pitch_decision
Definition: blobbox.h:646
float kern_size
Definition: blobbox.h:662
float xheight
Definition: blobbox.h:653
bool tosp_row_use_cert_spaces1
Definition: textord.h:283
bool tosp_old_to_constrain_sp_kn
Definition: textord.h:266
double tosp_fuzzy_space_factor1
Definition: textord.h:329
#define MAX_INT32
Definition: host.h:62
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:571
#define MAX_UINT8
Definition: host.h:63
double tosp_min_sane_kn_sp
Definition: textord.h:353
float space_size
Definition: blobbox.h:663
bool tosp_row_use_cert_spaces
Definition: textord.h:279
bool red_box_set() const
Definition: blobbox.h:244
#define MAX_INT16
Definition: host.h:61
Definition: werd.h:36
const TBOX & reduced_box() const
Definition: blobbox.h:231
bool tosp_recovery_isolated_row_stats
Definition: textord.h:285
bool tosp_flip_fuzz_kn_to_sp
Definition: textord.h:298
inT32 space_threshold
Definition: blobbox.h:661
double tosp_table_kn_sp_ratio
Definition: textord.h:345
double tosp_pass_wide_fuzz_sp_to_context
Definition: textord.h:371
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
#define tprintf(...)
Definition: tprintf.h:31
float fixed_pitch
Definition: blobbox.h:647
double tosp_table_fuzzy_kn_sp_ratio
Definition: textord.h:349
EXTERN double gapmap_big_gaps
Definition: gap_map.cpp:19
bool joined_to_prev() const
Definition: blobbox.h:241
WERD_LIST * word_list()
Definition: ocrrow.h:52
C_BLOB * cblob() const
Definition: blobbox.h:253
#define MAXSPACING
Definition: tospace.cpp:39
int IntCastRounded(double x)
Definition: helpers.h:179
bool tosp_fuzzy_limit_all
Definition: textord.h:289
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
bool tosp_block_use_cert_spaces
Definition: textord.h:277
bool tosp_old_to_bug_fix
Definition: textord.h:275
TBOX bounding_box() const
Definition: werd.cpp:160
int16_t inT16
Definition: host.h:36
double tosp_wide_aspect_ratio
Definition: textord.h:325
inT16 left() const
Definition: rect.h:68
BOOL8 table_gap(inT16 left, inT16 right)
Definition: gap_map.cpp:159
bool tosp_use_pre_chopping
Definition: textord.h:273
double tosp_old_sp_kn_th_factor
Definition: textord.h:314
inT32 min_space
Definition: blobbox.h:659
bool tosp_rule_9_test_punct
Definition: textord.h:297
double tosp_enough_small_gaps
Definition: textord.h:343
QSPLINE baseline
Definition: blobbox.h:666
bool tosp_all_flips_fuzzy
Definition: textord.h:287
WERD_LIST rep_words
Definition: blobbox.h:664
bool tosp_use_xht_gaps
Definition: textord.h:293
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:631
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:42
double tosp_table_xht_sp_ratio
Definition: textord.h:347
unsigned char BOOL8
Definition: host.h:44
double median() const
Definition: statistc.cpp:239
#define FALSE
Definition: capi.h:46
double y(double x) const
Definition: quspline.cpp:217
double tosp_kern_gap_factor2
Definition: textord.h:336
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:234
bool null_box() const
Definition: rect.h:46
void plot_word_decisions(ScrollView *win, inT16 pitch, TO_ROW *row)
Definition: drawtord.cpp:250
Definition: werd.h:35
bool tosp_only_use_prop_rows
Definition: textord.h:268
TBOX bounding_box() const
Definition: ocrrow.h:85
bool tosp_stats_use_xht_gaps
Definition: textord.h:291
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
double tosp_init_guess_xht_mult
Definition: textord.h:357
inT16 top() const
Definition: rect.h:54
int tosp_redo_kern_limit
Definition: textord.h:306
#define MAX(x, y)
Definition: ndminx.h:24
int tosp_enough_space_samples_for_median
Definition: textord.h:304
bool tosp_only_use_xht_gaps
Definition: textord.h:295
int tosp_sanity_method
Definition: textord.h:311
double tosp_threshold_bias1
Definition: textord.h:316
double tosp_flip_caution
Definition: textord.h:361
Definition: rect.h:30
double tosp_gap_factor
Definition: textord.h:332
#define MIN(x, y)
Definition: ndminx.h:28
double tosp_dont_fool_with_small_kerns
Definition: textord.h:365
double tosp_fuzzy_space_factor2
Definition: textord.h:331
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:615
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:890
inT16 height() const
Definition: rect.h:104
uint8_t uinT8
Definition: host.h:35
inT16 right() const
Definition: rect.h:75
double tosp_wide_fraction
Definition: textord.h:323
bool tosp_only_small_gaps_for_kern
Definition: textord.h:286
inT16 width() const
Definition: rect.h:111
Definition: statistc.h:33
double tosp_kern_gap_factor3
Definition: textord.h:338
bool tosp_force_wordbreak_on_punct
Definition: textord.h:271
double tosp_ignore_very_big_gaps
Definition: textord.h:340
double tosp_init_guess_kn_mult
Definition: textord.h:355
const int max
double ile(double frac) const
Definition: statistc.cpp:174
inT16 bottom() const
Definition: rect.h:61
bool tosp_flip_fuzz_sp_to_kn
Definition: textord.h:299
double tosp_near_lh_edge
Definition: textord.h:367
double tosp_narrow_fraction
Definition: textord.h:320
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
double tosp_kern_gap_factor1
Definition: textord.h:334
double mean() const
Definition: statistc.cpp:135
double tosp_max_sane_kn_thresh
Definition: textord.h:359
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
double tosp_narrow_aspect_ratio
Definition: textord.h:322
Definition: werd.h:60
void recalc_bounding_box()
Definition: ocrrow.cpp:101
Definition: ocrrow.h:32
const TBOX & bounding_box() const
Definition: blobbox.h:215
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1183
double tosp_fuzzy_sp_fraction
Definition: textord.h:351
bool tosp_narrow_blobs_not_cert
Definition: textord.h:281
Definition: gap_map.h:15
void Pen(Color color)
Definition: scrollview.cpp:726
integer coordinate
Definition: points.h:30
double tosp_large_kerning
Definition: textord.h:363
double tosp_fuzzy_kn_fraction
Definition: textord.h:350