tesseract  4.00.00dev
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  * Created: Mon May 9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <ctype.h>
25 #include "docqual.h"
26 #include "reject.h"
27 #include "tesscallback.h"
28 #include "tessvars.h"
29 #include "globals.h"
30 #include "tesseractclass.h"
31 
32 namespace tesseract{
33 
34 // A little class to provide the callbacks as we have no pre-bound args.
36  explicit DocQualCallbacks(WERD_RES* word0)
37  : word(word0), match_count(0), accepted_match_count(0) {}
38 
39  void CountMatchingBlobs(int index) {
40  ++match_count;
41  }
42 
43  void CountAcceptedBlobs(int index) {
44  if (word->reject_map[index].accepted())
46  ++match_count;
47  }
48 
49  void AcceptIfGoodQuality(int index) {
50  if (word->reject_map[index].accept_if_good_quality())
51  word->reject_map[index].setrej_quality_accept();
52  }
53 
57 };
58 
59 /*************************************************************************
60  * word_blob_quality()
61  * How many blobs in the box_word are identical to those of the inword?
62  * ASSUME blobs in both initial word and box_word are in ascending order of
63  * left hand blob edge.
64  *************************************************************************/
66  if (word->bln_boxes == NULL ||
67  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
68  return 0;
69 
70  DocQualCallbacks cb(word);
72  *word->rebuild_word,
74  return cb.match_count;
75 }
76 
// NOTE(review): the definition line of this function (listing line 77) is
// missing from this copy, so its name, return type and parameter list are
// not visible here. `word` is used below as a pointer with rebuild_word and
// best_choice members.
//
// Sums count_outline_errs() over every blob of word->rebuild_word and
// returns the total; returns 0 when rebuild_word is NULL.
78  inT16 i = 0;
79  inT16 err_count = 0;
80 
81  if (word->rebuild_word != NULL) {
82  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
83  TBLOB* blob = word->rebuild_word->blobs[b];
// NOTE(review): blob b is paired with byte i of unichar_string(); this
// presumably assumes one byte per blob and a string at least NumBlobs()
// long - TODO confirm for multi-byte unichars.
84  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
85  blob->NumOutlines());
86  i++;
87  }
88  }
89  return err_count;
90 }
91 
92 /*************************************************************************
93  * word_char_quality()
94  * Combination of blob quality and outline quality - how many good chars are
95  * there? - I.e chars which pass the blob AND outline tests.
96  *************************************************************************/
98  ROW *row,
101  if (word->bln_boxes == NULL || word->rebuild_word == NULL ||
102  word->rebuild_word->blobs.empty()) {
103  *match_count = 0;
104  *accepted_match_count = 0;
105  return;
106  }
107 
108  DocQualCallbacks cb(word);
110  *word->rebuild_word,
112  *match_count = cb.match_count;
113  *accepted_match_count = cb.accepted_match_count;
114 }
115 
116 /*************************************************************************
117  * unrej_good_chs()
118  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
119  *************************************************************************/
121  if (word->bln_boxes == NULL ||
122  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
123  return;
124 
125  DocQualCallbacks cb(word);
127  *word->rebuild_word,
129 }
130 
131 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
132  int expected_outline_count;
133 
134  if (STRING (outlines_odd).contains (c))
135  return 0; // Don't use this char
136  else if (STRING (outlines_2).contains (c))
137  expected_outline_count = 2;
138  else
139  expected_outline_count = 1;
140  return abs (outline_count - expected_outline_count);
141 }
142 
144  BOOL8 good_quality_doc) {
145  if ((tessedit_good_quality_unrej && good_quality_doc))
146  unrej_good_quality_words(page_res_it);
147  doc_and_block_rejection(page_res_it, good_quality_doc);
148  if (unlv_tilde_crunching) {
149  tilde_crunch(page_res_it);
150  tilde_delete(page_res_it);
151  }
152 }
153 
154 /*************************************************************************
155  * unrej_good_quality_words()
156  * Accept potential rejects in words which pass the following checks:
157  * - Contains a potential reject
158  * - Word looks like a sensible alpha word.
159  * - Word segmentation is the same as the original image
160  * - All characters have the expected number of outlines
161  * NOTE - the rejection counts are recalculated after unrejection
162  * - CAN'T do it in a single pass without a bit of fiddling
163  * - keep it simple but inefficient
164  *************************************************************************/
// Two passes over the page.
// Pass 1 visits every word: with bland_unrej set, every reject_map entry
// that allows it is quality-accepted; otherwise words on rows whose
// rej_count/char_count ratio is at most quality_rowrej_pc are candidates for
// unrej_good_chs(), and the remaining rows are skipped wholesale.
// Pass 2 zeroes the page, block and row counters and rebuilds them by
// calling rej_stat_word() on every word.
165 void Tesseract::unrej_good_quality_words( //unreject potential
166  PAGE_RES_IT &page_res_it) {
167  WERD_RES *word;
168  ROW_RES *current_row;
169  BLOCK_RES *current_block;
170  int i;
171 
172  page_res_it.restart_page ();
173  while (page_res_it.word () != NULL) {
174  check_debug_pt (page_res_it.word (), 100);
175  if (bland_unrej) {
176  word = page_res_it.word ();
177  for (i = 0; i < word->reject_map.length (); i++) {
178  if (word->reject_map[i].accept_if_good_quality ())
179  word->reject_map[i].setrej_quality_accept ();
180  }
181  page_res_it.forward ();
182  }
183  else if ((page_res_it.row ()->char_count > 0) &&
184  ((page_res_it.row ()->rej_count /
185  (float) page_res_it.row ()->char_count) <=
186  quality_rowrej_pc)) {
187  word = page_res_it.word ();
// NOTE(review): listing line 188 (the start of the `if` whose condition
// continues below) is missing from this copy - restore it from the upstream
// file before compiling.
189  (tessedit_unrej_any_wd ||
190  acceptable_word_string(*word->uch_set,
191  word->best_choice->unichar_string().string(),
// NOTE(review): listing line 192 (the final argument of
// acceptable_word_string) is also missing here.
193  != AC_UNACCEPTABLE)) {
194  unrej_good_chs(word, page_res_it.row ()->row);
195  }
196  page_res_it.forward ();
197  }
198  else {
199  /* Skip to end of dodgy row */
200  current_row = page_res_it.row ();
201  while ((page_res_it.word () != NULL) &&
202  (page_res_it.row () == current_row))
203  page_res_it.forward ();
204  }
205  check_debug_pt (page_res_it.word (), 110);
206  }
// Second pass: reset all counters, then recount word by word. Block and row
// counters are cleared the first time the iterator enters each block/row.
207  page_res_it.restart_page ();
208  page_res_it.page_res->char_count = 0;
209  page_res_it.page_res->rej_count = 0;
210  current_block = NULL;
211  current_row = NULL;
212  while (page_res_it.word () != NULL) {
213  if (current_block != page_res_it.block ()) {
214  current_block = page_res_it.block ();
215  current_block->char_count = 0;
216  current_block->rej_count = 0;
217  }
218  if (current_row != page_res_it.row ()) {
219  current_row = page_res_it.row ();
220  current_row->char_count = 0;
221  current_row->rej_count = 0;
222  current_row->whole_word_rej_count = 0;
223  }
224  page_res_it.rej_stat_word ();
225  page_res_it.forward ();
226  }
227 }
228 
229 
230 /*************************************************************************
231  * doc_and_block_rejection()
232  *
233  * If the page has too many rejects - reject all of it.
234  * If any block has too many rejects - reject all words in the block
235  *************************************************************************/
236 
// Rejects at three granularities, coarsest first:
//  - page: if the page reject percentage exceeds tessedit_reject_doc_percent
//    the whole page is rejected via reject_whole_page() and nothing else
//    is examined;
//  - block: otherwise each block whose reject percentage exceeds
//    tessedit_reject_block_percent has its words considered for rejection;
//  - row: within blocks that are kept, each row that exceeds
//    tessedit_reject_row_percent while its whole-word-reject share is below
//    tessedit_whole_wd_rej_row_percent has its words considered.
// good_quality_doc only influences the row-level word test.
237 void Tesseract::doc_and_block_rejection( //reject big chunks
238  PAGE_RES_IT &page_res_it,
239  BOOL8 good_quality_doc) {
240  inT16 block_no = 0;
241  inT16 row_no = 0;
242  BLOCK_RES *current_block;
243  ROW_RES *current_row;
244 
245  BOOL8 rej_word;
246  BOOL8 prev_word_rejected;
247  inT16 char_quality = 0;
248  inT16 accepted_char_quality;
249 
// NOTE(review): unlike the block and row tests below, this division is not
// guarded by char_count > 0 - confirm a page with no characters cannot
// reach this point.
250  if (page_res_it.page_res->rej_count * 100.0 /
251  page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
252  reject_whole_page(page_res_it);
253  if (tessedit_debug_doc_rejection) {
254  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
255  page_res_it.page_res->char_count,
256  page_res_it.page_res->rej_count);
257  }
258  } else {
259  if (tessedit_debug_doc_rejection) {
260  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
261  page_res_it.page_res->char_count,
262  page_res_it.page_res->rej_count);
263  }
264 
265  /* Walk blocks testing for block rejection */
266 
267  page_res_it.restart_page();
268  WERD_RES* word;
// Each iteration of this loop consumes one whole block: both branches
// below advance the iterator until the block changes.
269  while ((word = page_res_it.word()) != NULL) {
270  current_block = page_res_it.block();
271  block_no = current_block->block->index();
272  if (current_block->char_count > 0 &&
273  (current_block->rej_count * 100.0 / current_block->char_count) >
274  tessedit_reject_block_percent) {
275  if (tessedit_debug_block_rejection) {
276  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
277  block_no, current_block->char_count,
278  current_block->rej_count);
279  }
280  prev_word_rejected = FALSE;
281  while ((word = page_res_it.word()) != NULL &&
282  (page_res_it.block() == current_block)) {
// Decide whether this word is rejected with the block. With
// tessedit_preserve_blk_rej_perfect_wds, words with no rejects and at least
// tessedit_preserve_min_wd_len characters survive; optionally acceptable
// words survive when every character passes word_char_quality().
283  if (tessedit_preserve_blk_rej_perfect_wds) {
284  rej_word = word->reject_map.reject_count() > 0 ||
285  word->reject_map.length () < tessedit_preserve_min_wd_len;
286  if (rej_word && tessedit_dont_blkrej_good_wds &&
287  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
288  acceptable_word_string(
289  *word->uch_set,
290  word->best_choice->unichar_string().string(),
291  word->best_choice->unichar_lengths().string()) !=
292  AC_UNACCEPTABLE) {
293  word_char_quality(word, page_res_it.row()->row,
294  &char_quality,
295  &accepted_char_quality);
296  rej_word = char_quality != word->reject_map.length();
297  }
298  } else {
299  rej_word = TRUE;
300  }
301  if (rej_word) {
302  /*
303  Reject spacing if both current and prev words are rejected.
304  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
305  generated more space errors.
306  */
307  if (tessedit_use_reject_spaces &&
308  prev_word_rejected &&
309  page_res_it.prev_row() == page_res_it.row() &&
310  word->word->space() == 1)
311  word->reject_spaces = TRUE;
// NOTE(review): listing line 312 is missing from this copy; as shown, a
// rejected word only has its spacing flag updated - restore the missing
// statement from the upstream file.
313  }
314  prev_word_rejected = rej_word;
315  page_res_it.forward();
316  }
317  } else {
318  if (tessedit_debug_block_rejection) {
319  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
320  block_no, page_res_it.block()->char_count,
321  page_res_it.block()->rej_count);
322  }
323 
324  /* Walk rows in block testing for row rejection */
325  row_no = 0;
326  while (page_res_it.word() != NULL &&
327  page_res_it.block() == current_block) {
328  current_row = page_res_it.row();
329  row_no++;
330  /* Reject whole row if:
331  fraction of chars on row which are rejected exceed a limit AND
332  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
333  limit
334  */
335  if (current_row->char_count > 0 &&
336  (current_row->rej_count * 100.0 / current_row->char_count) >
337  tessedit_reject_row_percent &&
338  (current_row->whole_word_rej_count * 100.0 /
339  current_row->rej_count) <
340  tessedit_whole_wd_rej_row_percent) {
341  if (tessedit_debug_block_rejection) {
342  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
343  row_no, current_row->char_count,
344  current_row->rej_count);
345  }
346  prev_word_rejected = FALSE;
347  while ((word = page_res_it.word()) != NULL &&
348  page_res_it.row () == current_row) {
349  /* Preserve words on good docs unless they are mostly rejected*/
350  if (!tessedit_row_rej_good_docs && good_quality_doc) {
351  rej_word = word->reject_map.reject_count() /
352  static_cast<float>(word->reject_map.length()) >
353  tessedit_good_doc_still_rowrej_wd;
354  } else if (tessedit_preserve_row_rej_perfect_wds) {
355  /* Preserve perfect words anyway */
356  rej_word = word->reject_map.reject_count() > 0 ||
357  word->reject_map.length () < tessedit_preserve_min_wd_len;
358  if (rej_word && tessedit_dont_rowrej_good_wds &&
359  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
360  acceptable_word_string(*word->uch_set,
361  word->best_choice->unichar_string().string(),
362  word->best_choice->unichar_lengths().string()) !=
363  AC_UNACCEPTABLE) {
364  word_char_quality(word, page_res_it.row()->row,
365  &char_quality,
366  &accepted_char_quality);
367  rej_word = char_quality != word->reject_map.length();
368  }
369  } else {
370  rej_word = TRUE;
371  }
372  if (rej_word) {
373  /*
374  Reject spacing if both current and prev words are rejected.
375  NOTE - this is NOT restricted to FUZZY spaces. - When tried
376  this generated more space errors.
377  */
378  if (tessedit_use_reject_spaces &&
379  prev_word_rejected &&
380  page_res_it.prev_row() == page_res_it.row() &&
381  word->word->space () == 1)
382  word->reject_spaces = TRUE;
// NOTE(review): listing line 383 is missing from this copy (same gap as in
// the block-rejection branch) - restore it from the upstream file.
384  }
385  prev_word_rejected = rej_word;
386  page_res_it.forward();
387  }
388  } else {
389  if (tessedit_debug_block_rejection) {
390  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
391  row_no, current_row->char_count, current_row->rej_count);
392  }
// Row kept: skip the iterator past the rest of its words.
393  while (page_res_it.word() != NULL &&
394  page_res_it.row() == current_row)
395  page_res_it.forward();
396  }
397  }
398  }
399  }
400  }
401 }
402 
403 } // namespace tesseract
404 
405 /*************************************************************************
406  * reject_whole_page()
407  * Don't believe any of it - set the reject map to 00..00 in all words
408  *
409  *************************************************************************/
410 
411 void reject_whole_page(PAGE_RES_IT &page_res_it) {
412  page_res_it.restart_page ();
413  while (page_res_it.word () != NULL) {
414  page_res_it.word ()->reject_map.rej_word_doc_rej ();
415  page_res_it.forward ();
416  }
417  //whole page is rejected
418  page_res_it.page_res->rejected = TRUE;
419 }
420 
421 namespace tesseract {
423  WERD_RES *word;
424  GARBAGE_LEVEL garbage_level;
425  PAGE_RES_IT copy_it;
426  BOOL8 prev_potential_marked = FALSE;
427  BOOL8 found_terrible_word = FALSE;
428  BOOL8 ok_dict_word;
429 
430  page_res_it.restart_page();
431  while (page_res_it.word() != NULL) {
432  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
433  if (pb != NULL && !pb->IsText()) {
434  page_res_it.forward();
435  continue;
436  }
437  word = page_res_it.word();
438 
439  if (crunch_early_convert_bad_unlv_chs)
440  convert_bad_unlv_chs(word);
441 
442  if (crunch_early_merge_tess_fails)
443  word->merge_tess_fails();
444 
445  if (word->reject_map.accept_count () != 0) {
446  found_terrible_word = FALSE;
447  //Forget earlier potential crunches
448  prev_potential_marked = FALSE;
449  }
450  else {
451  ok_dict_word = safe_dict_word(word);
452  garbage_level = garbage_word (word, ok_dict_word);
453 
454  if ((garbage_level != G_NEVER_CRUNCH) &&
455  (terrible_word_crunch (word, garbage_level))) {
456  if (crunch_debug > 0) {
457  tprintf ("T CRUNCHING: \"%s\"\n",
458  word->best_choice->unichar_string().string());
459  }
461  if (prev_potential_marked) {
462  while (copy_it.word () != word) {
463  if (crunch_debug > 0) {
464  tprintf ("P1 CRUNCHING: \"%s\"\n",
465  copy_it.word()->best_choice->unichar_string().string());
466  }
467  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
468  copy_it.forward ();
469  }
470  prev_potential_marked = FALSE;
471  }
472  found_terrible_word = TRUE;
473  }
474  else if ((garbage_level != G_NEVER_CRUNCH) &&
475  (potential_word_crunch (word,
476  garbage_level, ok_dict_word))) {
477  if (found_terrible_word) {
478  if (crunch_debug > 0) {
479  tprintf ("P2 CRUNCHING: \"%s\"\n",
480  word->best_choice->unichar_string().string());
481  }
483  }
484  else if (!prev_potential_marked) {
485  copy_it = page_res_it;
486  prev_potential_marked = TRUE;
487  if (crunch_debug > 1) {
488  tprintf ("P3 CRUNCHING: \"%s\"\n",
489  word->best_choice->unichar_string().string());
490  }
491  }
492  }
493  else {
494  found_terrible_word = FALSE;
495  //Forget earlier potential crunches
496  prev_potential_marked = FALSE;
497  if (crunch_debug > 2) {
498  tprintf ("NO CRUNCH: \"%s\"\n",
499  word->best_choice->unichar_string().string());
500  }
501  }
502  }
503  page_res_it.forward ();
504  }
505 }
506 
507 
509  GARBAGE_LEVEL garbage_level) {
510  float rating_per_ch;
511  int adjusted_len;
512  int crunch_mode = 0;
513 
514  if ((word->best_choice->unichar_string().length () == 0) ||
515  (strspn (word->best_choice->unichar_string().string(), " ") ==
517  crunch_mode = 1;
518  else {
519  adjusted_len = word->reject_map.length ();
520  if (adjusted_len > crunch_rating_max)
521  adjusted_len = crunch_rating_max;
522  rating_per_ch = word->best_choice->rating () / adjusted_len;
523 
524  if (rating_per_ch > crunch_terrible_rating)
525  crunch_mode = 2;
526  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
527  crunch_mode = 3;
528  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
529  (garbage_level != G_OK))
530  crunch_mode = 4;
531  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
532  (garbage_level != G_OK))
533  crunch_mode = 5;
534  }
535  if (crunch_mode > 0) {
536  if (crunch_debug > 2) {
537  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
538  crunch_mode, word->best_choice->unichar_string().string());
539  }
540  return TRUE;
541  }
542  else
543  return FALSE;
544 }
545 
547  GARBAGE_LEVEL garbage_level,
548  BOOL8 ok_dict_word) {
549  float rating_per_ch;
550  int adjusted_len;
551  const char *str = word->best_choice->unichar_string().string();
552  const char *lengths = word->best_choice->unichar_lengths().string();
553  BOOL8 word_crunchable;
554  int poor_indicator_count = 0;
555 
556  word_crunchable = !crunch_leave_accept_strings ||
557  word->reject_map.length() < 3 ||
558  (acceptable_word_string(*word->uch_set,
559  str, lengths) == AC_UNACCEPTABLE &&
560  !ok_dict_word);
561 
562  adjusted_len = word->reject_map.length();
563  if (adjusted_len > 10)
564  adjusted_len = 10;
565  rating_per_ch = word->best_choice->rating() / adjusted_len;
566 
567  if (rating_per_ch > crunch_pot_poor_rate) {
568  if (crunch_debug > 2) {
569  tprintf("Potential poor rating on \"%s\"\n",
570  word->best_choice->unichar_string().string());
571  }
572  poor_indicator_count++;
573  }
574 
575  if (word_crunchable &&
576  word->best_choice->certainty() < crunch_pot_poor_cert) {
577  if (crunch_debug > 2) {
578  tprintf("Potential poor cert on \"%s\"\n",
579  word->best_choice->unichar_string().string());
580  }
581  poor_indicator_count++;
582  }
583 
584  if (garbage_level != G_OK) {
585  if (crunch_debug > 2) {
586  tprintf("Potential garbage on \"%s\"\n",
587  word->best_choice->unichar_string().string());
588  }
589  poor_indicator_count++;
590  }
591  return poor_indicator_count >= crunch_pot_indicators;
592 }
593 
595  WERD_RES *word;
596  PAGE_RES_IT copy_it;
597  BOOL8 deleting_from_bol = FALSE;
598  BOOL8 marked_delete_point = FALSE;
599  inT16 debug_delete_mode;
600  CRUNCH_MODE delete_mode;
601  inT16 x_debug_delete_mode;
602  CRUNCH_MODE x_delete_mode;
603 
604  page_res_it.restart_page();
605  while (page_res_it.word() != NULL) {
606  word = page_res_it.word();
607 
608  delete_mode = word_deletable (word, debug_delete_mode);
609  if (delete_mode != CR_NONE) {
610  if (word->word->flag (W_BOL) || deleting_from_bol) {
611  if (crunch_debug > 0) {
612  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
613  debug_delete_mode,
614  word->best_choice->unichar_string().string());
615  }
616  word->unlv_crunch_mode = delete_mode;
617  deleting_from_bol = TRUE;
618  } else if (word->word->flag(W_EOL)) {
619  if (marked_delete_point) {
620  while (copy_it.word() != word) {
621  x_delete_mode = word_deletable (copy_it.word (),
622  x_debug_delete_mode);
623  if (crunch_debug > 0) {
624  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
625  x_debug_delete_mode,
626  copy_it.word()->best_choice->unichar_string().string());
627  }
628  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
629  copy_it.forward ();
630  }
631  }
632  if (crunch_debug > 0) {
633  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
634  debug_delete_mode,
635  word->best_choice->unichar_string().string());
636  }
637  word->unlv_crunch_mode = delete_mode;
638  deleting_from_bol = FALSE;
639  marked_delete_point = FALSE;
640  }
641  else {
642  if (!marked_delete_point) {
643  copy_it = page_res_it;
644  marked_delete_point = TRUE;
645  }
646  }
647  }
648  else {
649  deleting_from_bol = FALSE;
650  //Forget earlier potential crunches
651  marked_delete_point = FALSE;
652  }
653  /*
654  The following step has been left till now as the tess fails are used to
655  determine if the word is deletable.
656  */
657  if (!crunch_early_merge_tess_fails)
658  word->merge_tess_fails();
659  page_res_it.forward ();
660  }
661 }
662 
663 
665  int i;
666  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
667  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
668  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
669  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
670  for (i = 0; i < word_res->reject_map.length(); ++i) {
671  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
672  word_res->best_choice->set_unichar_id(unichar_dash, i);
673  if (word_res->reject_map[i].accepted ())
674  word_res->reject_map[i].setrej_unlv_rej ();
675  }
676  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
677  word_res->best_choice->set_unichar_id(unichar_space, i);
678  if (word_res->reject_map[i].accepted ())
679  word_res->reject_map[i].setrej_unlv_rej ();
680  }
681  }
682 }
683 
685  enum STATES
686  {
687  JUNK,
688  FIRST_UPPER,
689  FIRST_LOWER,
690  FIRST_NUM,
691  SUBSEQUENT_UPPER,
692  SUBSEQUENT_LOWER,
693  SUBSEQUENT_NUM
694  };
695  const char *str = word->best_choice->unichar_string().string();
696  const char *lengths = word->best_choice->unichar_lengths().string();
697  STATES state = JUNK;
698  int len = 0;
699  int isolated_digits = 0;
700  int isolated_alphas = 0;
701  int bad_char_count = 0;
702  int tess_rejs = 0;
703  int dodgy_chars = 0;
704  int ok_chars;
705  UNICHAR_ID last_char = -1;
706  int alpha_repetition_count = 0;
707  int longest_alpha_repetition_count = 0;
708  int longest_lower_run_len = 0;
709  int lower_string_count = 0;
710  int longest_upper_run_len = 0;
711  int upper_string_count = 0;
712  int total_alpha_count = 0;
713  int total_digit_count = 0;
714 
715  for (; *str != '\0'; str += *(lengths++)) {
716  len++;
717  if (word->uch_set->get_isupper (str, *lengths)) {
718  total_alpha_count++;
719  switch (state) {
720  case SUBSEQUENT_UPPER:
721  case FIRST_UPPER:
722  state = SUBSEQUENT_UPPER;
723  upper_string_count++;
724  if (longest_upper_run_len < upper_string_count)
725  longest_upper_run_len = upper_string_count;
726  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
727  alpha_repetition_count++;
728  if (longest_alpha_repetition_count < alpha_repetition_count) {
729  longest_alpha_repetition_count = alpha_repetition_count;
730  }
731  }
732  else {
733  last_char = word->uch_set->unichar_to_id(str, *lengths);
734  alpha_repetition_count = 1;
735  }
736  break;
737  case FIRST_NUM:
738  isolated_digits++;
739  default:
740  state = FIRST_UPPER;
741  last_char = word->uch_set->unichar_to_id(str, *lengths);
742  alpha_repetition_count = 1;
743  upper_string_count = 1;
744  break;
745  }
746  }
747  else if (word->uch_set->get_islower (str, *lengths)) {
748  total_alpha_count++;
749  switch (state) {
750  case SUBSEQUENT_LOWER:
751  case FIRST_LOWER:
752  state = SUBSEQUENT_LOWER;
753  lower_string_count++;
754  if (longest_lower_run_len < lower_string_count)
755  longest_lower_run_len = lower_string_count;
756  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
757  alpha_repetition_count++;
758  if (longest_alpha_repetition_count < alpha_repetition_count) {
759  longest_alpha_repetition_count = alpha_repetition_count;
760  }
761  }
762  else {
763  last_char = word->uch_set->unichar_to_id(str, *lengths);
764  alpha_repetition_count = 1;
765  }
766  break;
767  case FIRST_NUM:
768  isolated_digits++;
769  default:
770  state = FIRST_LOWER;
771  last_char = word->uch_set->unichar_to_id(str, *lengths);
772  alpha_repetition_count = 1;
773  lower_string_count = 1;
774  break;
775  }
776  }
777  else if (word->uch_set->get_isdigit (str, *lengths)) {
778  total_digit_count++;
779  switch (state) {
780  case FIRST_NUM:
781  state = SUBSEQUENT_NUM;
782  case SUBSEQUENT_NUM:
783  break;
784  case FIRST_UPPER:
785  case FIRST_LOWER:
786  isolated_alphas++;
787  default:
788  state = FIRST_NUM;
789  break;
790  }
791  }
792  else {
793  if (*lengths == 1 && *str == ' ')
794  tess_rejs++;
795  else
796  bad_char_count++;
797  switch (state) {
798  case FIRST_NUM:
799  isolated_digits++;
800  break;
801  case FIRST_UPPER:
802  case FIRST_LOWER:
803  isolated_alphas++;
804  default:
805  break;
806  }
807  state = JUNK;
808  }
809  }
810 
811  switch (state) {
812  case FIRST_NUM:
813  isolated_digits++;
814  break;
815  case FIRST_UPPER:
816  case FIRST_LOWER:
817  isolated_alphas++;
818  default:
819  break;
820  }
821 
822  if (crunch_include_numerals) {
823  total_alpha_count += total_digit_count - isolated_digits;
824  }
825 
826  if (crunch_leave_ok_strings && len >= 4 &&
827  2 * (total_alpha_count - isolated_alphas) > len &&
828  longest_alpha_repetition_count < crunch_long_repetitions) {
829  if ((crunch_accept_ok &&
830  acceptable_word_string(*word->uch_set, str, lengths) !=
831  AC_UNACCEPTABLE) ||
832  longest_lower_run_len > crunch_leave_lc_strings ||
833  longest_upper_run_len > crunch_leave_uc_strings)
834  return G_NEVER_CRUNCH;
835  }
836  if (word->reject_map.length() > 1 &&
837  strpbrk(str, " ") == NULL &&
838  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
839  word->best_choice->permuter() == FREQ_DAWG_PERM ||
840  word->best_choice->permuter() == USER_DAWG_PERM ||
841  word->best_choice->permuter() == NUMBER_PERM ||
842  acceptable_word_string(*word->uch_set, str, lengths) !=
843  AC_UNACCEPTABLE || ok_dict_word))
844  return G_OK;
845 
846  ok_chars = len - bad_char_count - isolated_digits -
847  isolated_alphas - tess_rejs;
848 
849  if (crunch_debug > 3) {
850  tprintf("garbage_word: \"%s\"\n",
851  word->best_choice->unichar_string().string());
852  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
853  len,
854  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
855  }
856  if (bad_char_count == 0 &&
857  tess_rejs == 0 &&
858  (len > isolated_digits + isolated_alphas || len <= 2))
859  return G_OK;
860 
861  if (tess_rejs > ok_chars ||
862  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
863  return G_TERRIBLE;
864 
865  if (len > 4) {
866  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
867  isolated_alphas;
868  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
869  return G_DODGY;
870  else
871  return G_OK;
872  } else {
873  dodgy_chars = 2 * tess_rejs + bad_char_count;
874  if ((len == 4 && dodgy_chars > 2) ||
875  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
876  return G_DODGY;
877  else
878  return G_OK;
879  }
880 }
881 
882 
883 /*************************************************************************
884  * word_deletable()
885  * DELETE WERDS AT ENDS OF ROWS IF
886  * Word is crunched &&
887  * ( string length = 0 OR
888  * > 50% of chars are "|" (before merging) OR
889  * certainty < -10 OR
890  * rating /char > 60 OR
891  * TOP of word is more than 0.5 xht BELOW baseline OR
892  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
893  * length of word < 3xht OR
894  * height of word < 0.7 xht OR
895  * height of word > 3.0 xht OR
896  * >75% of the outline BBs have longest dimension < 0.5xht
897  *************************************************************************/
898 
900  int word_len = word->reject_map.length ();
901  float rating_per_ch;
902  TBOX box; //BB of word
903 
904  if (word->unlv_crunch_mode == CR_NONE) {
905  delete_mode = 0;
906  return CR_NONE;
907  }
908 
909  if (word_len == 0) {
910  delete_mode = 1;
911  return CR_DELETE;
912  }
913 
914  if (word->rebuild_word != NULL) {
915  // Cube leaves rebuild_word NULL.
916  box = word->rebuild_word->bounding_box();
917  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
918  delete_mode = 4;
919  return CR_DELETE;
920  }
921 
922  if (noise_outlines(word->rebuild_word)) {
923  delete_mode = 5;
924  return CR_DELETE;
925  }
926  }
927 
928  if ((failure_count (word) * 1.5) > word_len) {
929  delete_mode = 2;
930  return CR_LOOSE_SPACE;
931  }
932 
933  if (word->best_choice->certainty () < crunch_del_cert) {
934  delete_mode = 7;
935  return CR_LOOSE_SPACE;
936  }
937 
938  rating_per_ch = word->best_choice->rating () / word_len;
939 
940  if (rating_per_ch > crunch_del_rating) {
941  delete_mode = 8;
942  return CR_LOOSE_SPACE;
943  }
944 
945  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
946  delete_mode = 9;
947  return CR_LOOSE_SPACE;
948  }
949 
950  if (box.bottom () >
951  kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
952  delete_mode = 10;
953  return CR_LOOSE_SPACE;
954  }
955 
956  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
957  delete_mode = 11;
958  return CR_LOOSE_SPACE;
959  }
960 
961  if (box.width () < crunch_del_min_width * kBlnXHeight) {
962  delete_mode = 3;
963  return CR_LOOSE_SPACE;
964  }
965 
966  delete_mode = 0;
967  return CR_NONE;
968 }
969 
// NOTE(review): the definition line of this function (listing line 970) is
// missing from this copy; it is called elsewhere in this file as
// failure_count(word), but the return type is not visible here.
//
// Returns the number of ' ' bytes in the word's best-choice unichar string.
971  const char *str = word->best_choice->unichar_string().string();
972  int tess_rejs = 0;
973 
974  for (; *str != '\0'; str++) {
975  if (*str == ' ')
976  tess_rejs++;
977  }
978  return tess_rejs;
979 }
980 
981 
// NOTE(review): the definition line of this function (listing line 982) is
// missing from this copy; it is called elsewhere in this file as
// noise_outlines(word->rebuild_word), but the return type is not visible.
//
// Counts all outlines of all blobs in `word` and how many of them are
// "small", i.e. whose larger bounding-box dimension is below
// kBlnXHeight * crunch_small_outlines_size. The result is true only when
// every outline is small (this includes a word with no outlines at all).
983  TBOX box; // BB of outline
984  inT16 outline_count = 0;
985  inT16 small_outline_count = 0;
986  inT16 max_dimension;
987  float small_limit = kBlnXHeight * crunch_small_outlines_size;
988 
989  for (int b = 0; b < word->NumBlobs(); ++b) {
990  TBLOB* blob = word->blobs[b];
991  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
992  outline_count++;
993  box = ol->bounding_box();
994  if (box.height() > box.width())
995  max_dimension = box.height();
996  else
997  max_dimension = box.width();
998  if (max_dimension < small_limit)
999  small_outline_count++;
1000  }
1001  }
// NOTE(review): the word_deletable() banner comment speaks of ">75% of the
// outline BBs"; the test here requires all of them - confirm which is
// intended.
1002  return small_outline_count >= outline_count;
1003 }
1004 
1005 } // namespace tesseract
TESSLINE * next
Definition: blobs.h:258
inT32 rej_count
Definition: pageres.h:129
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49
inT32 char_count
Definition: pageres.h:60
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:664
Unacceptable word.
Definition: control.h:36
#define TRUE
Definition: capi.h:45
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void rej_word_block_rej()
Definition: rejctmap.cpp:503
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int index() const
Definition: pdblock.h:67
int UNICHAR_ID
Definition: unichar.h:33
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
WERD_CHOICE * best_choice
Definition: pageres.h:219
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:120
const STRING & unichar_lengths() const
Definition: ratngs.h:546
ROW_RES * prev_row() const
Definition: pageres.h:730
Definition: werd.h:36
TESSLINE * outlines
Definition: blobs.h:377
const int kBlnXHeight
Definition: normalis.h:28
inT32 rej_count
Definition: pageres.h:61
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
int NumOutlines() const
Definition: blobs.cpp:469
BLOCK * block
Definition: pageres.h:99
void rej_stat_word()
Definition: pageres.cpp:1675
const char * string() const
Definition: strngs.cpp:198
const int kBlnBaselineOffset
Definition: normalis.h:29
ROW * row
Definition: pageres.h:127
bool empty() const
Definition: genericvector.h:90
inT32 length() const
Definition: strngs.cpp:193
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:237
int16_t inT16
Definition: host.h:36
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:546
ROW_RES * row() const
Definition: pageres.h:739
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT16 accept_count()
Definition: rejctmap.cpp:329
BOOL8 reject_spaces
Definition: pageres.h:320
Definition: blobs.h:395
inT32 length() const
Definition: rejctmap.h:235
uinT8 permuter() const
Definition: ratngs.h:344
WERD_RES * restart_page()
Definition: pageres.h:683
bool IsText() const
Definition: polyblk.h:52
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:899
inT32 whole_word_rej_count
Definition: pageres.h:130
PAGE_RES * page_res
Definition: pageres.h:661
CRUNCH_MODE
Definition: pageres.h:145
WERD_RES * forward()
Definition: pageres.h:716
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
inT16 reject_count()
Definition: rejctmap.h:241
inT32 rej_count
Definition: pageres.h:101
BOOL8 rejected
Definition: pageres.h:63
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
Definition: docqual.h:28
inT32 char_count
Definition: pageres.h:128
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:684
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:508
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:422
float certainty() const
Definition: ratngs.h:328
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
Definition: werd.h:35
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:352
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:594
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
const STRING & unichar_string() const
Definition: ratngs.h:539
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
Definition: blobs.h:261
void merge_tess_fails()
Definition: pageres.cpp:1062
inT16 height() const
Definition: rect.h:104
WERD * word
Definition: pageres.h:175
inT16 width() const
Definition: rect.h:111
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:36
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:131
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:982
WERD_RES * word() const
Definition: pageres.h:736
inT16 bottom() const
Definition: rect.h:61
uinT32 unsigned_size() const
Definition: strngs.h:71
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39
const UNICHARSET * uch_set
Definition: pageres.h:192
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
void rej_word_row_rej()
Definition: rejctmap.cpp:512
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 char_count
Definition: pageres.h:100
void rej_word_doc_rej()
Definition: rejctmap.cpp:494
REJMAP reject_map
Definition: pageres.h:271
GARBAGE_LEVEL
Definition: docqual.h:25
uinT8 space()
Definition: werd.h:104
Definition: ocrrow.h:32
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:970
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:165
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43
TBOX bounding_box() const
Definition: blobs.cpp:879
BLOCK_RES * block() const
Definition: pageres.h:742
float rating() const
Definition: ratngs.h:325
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:411