tesseract  4.00.00dev
reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 48 of file reject.cpp.

56  {
57 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
    // Decides whether `word` may be treated as finished and stores the
    // verdict in word->done. `pass` only matters for the one_ell_conflict()
    // check below, which is applied when pass == 1.
    //
    // Initial verdict: the word was accepted by Tesseract and its best-choice
    // string contains no space character.
58  word->done = word->tess_accepted &&
59  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
60  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
    // Dictionary origin is derived from the best choice's permuter value.
61  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
62  word->best_choice->permuter() == FREQ_DAWG_PERM ||
    // NOTE(review): listing line 63 is absent from this extract (numbering
    // jumps 62 -> 64), so the expression above is truncated here. Restore the
    // final operand from reject.cpp before relying on this listing.
    // Pass 1 only: a non-dictionary or ambiguous word for which
    // one_ell_conflict(word, FALSE) returns true is not done.
64  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65  one_ell_conflict(word, FALSE)) {
66  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
67  word->done = FALSE;
68  }
    // Any pass: clear done when the word is ambiguous, or when it is neither
    // from a dictionary nor produced by the number permuter.
69  if (word->done && ((!word_from_dict &&
70  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
71  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
72  word->done = FALSE;
73  }
    // Optional trace of the final verdict and the best choice.
74  if (tessedit_rejection_debug) {
75  tprintf("set_done(): done=%d\n", word->done);
76  word->best_choice->print("");
77  }
78 }
79 
80 
81 /*************************************************************************
82  * make_reject_map()
83  *
84  * Sets the done flag to indicate whether the result is acceptable.
85  *
86  * Sets a reject map for the word.
87  *************************************************************************/
88 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
    // `word` is updated in place; `pass` is forwarded to set_done().
    // `row` is not referenced anywhere in the visible listing.
    // Only tessedit_reject_mode values 0 and 5 are handled; any other value
    // prints an error and calls err_exit().
89  int i;
90  int offset;
91 
92  flip_0O(word);
93  check_debug_pt(word, -1); // For trap only
94  set_done(word, pass); // Set acceptance
    // NOTE(review): listing line 95 is absent from this extract (numbering
    // jumps 94 -> 96); the statement that stood here is not visible.
96  reject_blanks(word);
97  /*
98  0: Rays original heuristic - the baseline
99  */
100  if (tessedit_reject_mode == 0) {
     // Mode 0: only words that set_done() did not accept get per-blob
     // poor-match rejection.
101  if (!word->done)
102  reject_poor_matches(word);
103  } else if (tessedit_reject_mode == 5) {
104  /*
105  5: Reject I/1/l from words where there is no strong contextual confirmation;
106  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
107  and the whole of any words which are very small
108  */
     // Very small words: x-height in pixels (kBlnXHeight divided by the
     // denorm y scale) at or below min_sane_x_ht_pixels.
109  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
     // NOTE(review): listing line 110 is absent from this extract; the body
     // of this branch is not visible here.
111  } else {
112  one_ell_conflict(word, TRUE);
113  /*
114  Originally the code here just used the done flag. Now I have duplicated
115  and unpacked the conditions for setting the done flag so that each
116  mechanism can be turned on or off independently. This works WITHOUT
117  affecting the done flag setting.
118  */
119  if (rej_use_tess_accepted && !word->tess_accepted)
     // NOTE(review): listing line 120 is absent from this extract; the
     // statement controlled by the `if` above is not visible here.
121 
122  if (rej_use_tess_blanks &&
123  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
     // NOTE(review): listing line 124 is absent from this extract; the
     // statement controlled by the `if` above is not visible here.
125 
126  WERD_CHOICE* best_choice = word->best_choice;
127  if (rej_use_good_perm) {
     // A DAWG-permuter word passes when rej_use_sensible_wd is off or
     // acceptable_word_string() does not return AC_UNACCEPTABLE.
128  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
129  best_choice->permuter() == FREQ_DAWG_PERM ||
130  best_choice->permuter() == USER_DAWG_PERM) &&
131  (!rej_use_sensible_wd ||
132  acceptable_word_string(*word->uch_set,
133  best_choice->unichar_string().string(),
134  best_choice->unichar_lengths().string()) !=
135  AC_UNACCEPTABLE)) {
136  // PASSED TEST
137  } else if (best_choice->permuter() == NUMBER_PERM) {
138  if (rej_alphas_in_number_perm) {
     // Walk the best-choice string one unichar at a time: `offset` is the
     // position in unichar_string(), `i` the unichar index, and
     // unichar_lengths()[i] the step. Still-accepted alphabetic unichars
     // are marked as bad-permuter rejects.
139  for (i = 0, offset = 0;
140  best_choice->unichar_string()[offset] != '\0';
141  offset += best_choice->unichar_lengths()[i++]) {
142  if (word->reject_map[i].accepted() &&
143  word->uch_set->get_isalpha(
144  best_choice->unichar_string().string() + offset,
145  best_choice->unichar_lengths()[i]))
146  word->reject_map[i].setrej_bad_permuter();
147  // rej alpha
148  }
149  }
150  } else {
     // NOTE(review): listing line 151 is absent from this extract; the body
     // of this final `else` is not visible here.
152  }
153  }
154  /* Ambig word rejection was here once !!*/
155  }
156  } else {
157  tprintf("BAD tessedit_reject_mode\n");
158  err_exit();
159  }
160 
     // Edge-blob rejection runs only when tessedit_image_border is above -1.
161  if (tessedit_image_border > -1)
162  reject_edge_blobs(word);
163 
164  check_debug_pt (word, 10);
165  if (tessedit_rejection_debug) {
166  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
167  tprintf("Certainty: %f Rating: %f\n",
168  word->best_choice->certainty (), word->best_choice->rating ());
169  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
170  }
171 
172  flip_hyphens(word);
173  check_debug_pt(word, 20);
174 }
175 } // namespace tesseract
void rej_word_small_xht()
Definition: rejctmap.cpp:413
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
void flip_hyphens(WERD_RES *word)
Unacceptable word.
Definition: control.h:36
#define TRUE
Definition: capi.h:45
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
void print() const
Definition: ratngs.h:578
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
void flip_0O(WERD_RES *word)
const int kBlnXHeight
Definition: normalis.h:28
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
voidpf uLong offset
Definition: ioapi.h:42
float y_scale() const
Definition: normalis.h:272
void err_exit()
Definition: globaloc.cpp:74
inT32 length() const
Definition: strngs.cpp:193
int16_t inT16
Definition: host.h:36
uinT8 permuter() const
Definition: ratngs.h:344
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
#define FALSE
Definition: capi.h:46
bool dangerous_ambig_found() const
Definition: ratngs.h:361
BOOL8 tess_accepted
Definition: pageres.h:280
float certainty() const
Definition: ratngs.h:328
const STRING & unichar_string() const
Definition: ratngs.h:539
DENORM denorm
Definition: pageres.h:190
void rej_word_bad_permuter()
Definition: rejctmap.cpp:449
const UNICHARSET * uch_set
Definition: pageres.h:192
void initialise(inT16 length)
Definition: rejctmap.cpp:318
REJMAP reject_map
Definition: pageres.h:271
Definition: ocrrow.h:32
void rej_word_contains_blanks()
Definition: rejctmap.cpp:440
BOOL8 done
Definition: pageres.h:282
float rating() const
Definition: ratngs.h:325
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:431

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word )

Definition at line 226 of file reject.cpp.

226  {
     // Derives a certainty cut-off for `word` from its per-blob certainties.
     // The certainties are copied into `ratings` and sorted. For words of
     // three or more blobs the widest gap between neighbouring sorted values
     // is located and the threshold is set to the midpoint of that gap.
     // For shorter words no gap is searched, so the result is ratings[0] - 1.
     // NOTE(review): ratings.sort() is presumably ascending - confirm against
     // GenericVector.
227  float threshold; // rejection threshold
228  float bestgap = 0.0f; // biggest gap
229  float gapstart; // bottom of gap
230  // super iterator
     // NOTE(review): choice_it is declared but never used in this body.
231  BLOB_CHOICE_IT choice_it; // real iterator
232 
233  int blob_count = word->length();
234  GenericVector<float> ratings;
235  ratings.resize_no_init(blob_count);
236  for (int i = 0; i < blob_count; ++i) {
237  ratings[i] = word->certainty(i);
238  }
239  ratings.sort();
     // NOTE(review): ratings[0] is read unconditionally, including when
     // blob_count is 0 - confirm callers never pass an empty word.
240  gapstart = ratings[0] - 1; // all reject if none better
241  if (blob_count >= 3) {
     // Strict '>' keeps the first of several equally wide gaps.
242  for (int index = 0; index < blob_count - 1; index++) {
243  if (ratings[index + 1] - ratings[index] > bestgap) {
244  bestgap = ratings[index + 1] - ratings[index];
245  // find biggest
246  gapstart = ratings[index];
247  }
248  }
249  }
     // Midpoint of the chosen gap (bestgap stays 0 when no gap was searched
     // or found).
250  threshold = gapstart + bestgap / 2;
251 
252  return threshold;
253 }
int length() const
Definition: ratngs.h:301
void resize_no_init(int size)
Definition: genericvector.h:66
float certainty() const
Definition: ratngs.h:328

◆ reject_blanks()

void reject_blanks ( WERD_RES * word )

Definition at line 178 of file reject.cpp.

178  {
     // Marks every unichar of word->best_choice that begins with a space
     // character as a tess-failure reject in word->reject_map.
     // `i` is the unichar index (also the reject_map index); `offset` is the
     // matching position in unichar_string().
179  inT16 i;
180  inT16 offset;
181 
     // Advance by unichar_lengths()[i] per step until the string terminator.
182  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
183  offset += word->best_choice->unichar_lengths()[i], i += 1) {
184  if (word->best_choice->unichar_string()[offset] == ' ')
185  //rej unrecognised blobs
186  word->reject_map[i].setrej_tess_failure ();
187  }
188 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
const STRING & unichar_string() const
Definition: ratngs.h:539
REJMAP reject_map
Definition: pageres.h:271

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word )

Definition at line 207 of file reject.cpp.

207  {
     // Rejects individual positions of word->best_choice in word->reject_map.
     // The cut-off comes from compute_reject_threshold() on the best choice.
208  float threshold = compute_reject_threshold(word->best_choice);
209  for (int i = 0; i < word->best_choice->length(); ++i) {
     // Space unichars are tess-failure rejects regardless of certainty.
210  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
211  word->reject_map[i].setrej_tess_failure();
     // Everything else is rejected as a poor match when its certainty is
     // strictly below the threshold.
212  else if (word->best_choice->certainty(i) < threshold)
213  word->reject_map[i].setrej_poor_match();
214  }
215 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
float certainty() const
Definition: ratngs.h:328
REJMAP reject_map
Definition: pageres.h:271
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226