tesseract  4.00.00dev
permdawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permdawg.c (Formerly permdawg.c)
5  * Description: Scale word choices by a dictionary
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "cutil.h"
30 #include "dawg.h"
31 #include "globals.h"
32 #include "ndminx.h"
33 #include "stopper.h"
34 #include "tprintf.h"
35 #include "params.h"
36 
37 #include <ctype.h>
38 #include "dict.h"
39 
40 /*----------------------------------------------------------------------
41  F u n c t i o n s
42 ----------------------------------------------------------------------*/
43 namespace tesseract {
44 
52  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
53  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
54  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
55  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
56  DawgArgs *more_args = static_cast<DawgArgs*>(void_more_args);
57  word_ending = (char_choice_index == char_choices.size()-1);
58  int word_index = word->length() - 1;
59  if (best_choice->rating() < *limit) return;
60  // Look up char in DAWG
61 
62  // If the current unichar is an ngram first try calling
63  // letter_is_okay() for each unigram it contains separately.
64  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
65  bool checked_unigrams = false;
66  if (getUnicharset().get_isngram(orig_uch_id)) {
67  if (dawg_debug_level) {
68  tprintf("checking unigrams in an ngram %s\n",
69  getUnicharset().debug_str(orig_uch_id).string());
70  }
71  int num_unigrams = 0;
72  word->remove_last_unichar_id();
74  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
75  // Since the string came out of the unicharset, failure is impossible.
76  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL,
77  NULL));
78  bool unigrams_ok = true;
79  // Construct DawgArgs that reflect the current state.
80  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
81  DawgPositionVector unigram_updated_dawgs;
82  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
83  &unigram_updated_dawgs,
84  more_args->permuter);
85  // Check unigrams in the ngram with letter_is_okay().
86  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
87  UNICHAR_ID uch_id = encoding[i];
88  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
89  ++num_unigrams;
90  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
91  unigrams_ok = (this->*letter_is_okay_)(
92  &unigram_dawg_args,
93  word->unichar_id(word_index+num_unigrams-1),
94  word_ending && i == encoding.size() - 1);
95  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
96  if (dawg_debug_level) {
97  tprintf("unigram %s is %s\n",
98  getUnicharset().debug_str(uch_id).string(),
99  unigrams_ok ? "OK" : "not OK");
100  }
101  }
102  // Restore the word and copy the updated dawg state if needed.
103  while (num_unigrams-- > 0) word->remove_last_unichar_id();
104  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
105  if (unigrams_ok) {
106  checked_unigrams = true;
107  more_args->permuter = unigram_dawg_args.permuter;
108  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
109  }
110  }
111 
112  // Check which dawgs from the dawgs_ vector contain the word
113  // up to and including the current unichar.
114  if (checked_unigrams || (this->*letter_is_okay_)(
115  more_args, word->unichar_id(word_index), word_ending)) {
116  // Add a new word choice
117  if (word_ending) {
118  if (dawg_debug_level) {
119  tprintf("found word = %s\n", word->debug_string().string());
120  }
121  if (strcmp(output_ambig_words_file.string(), "") != 0) {
122  if (output_ambig_words_file_ == NULL) {
123  output_ambig_words_file_ =
124  fopen(output_ambig_words_file.string(), "wb+");
125  if (output_ambig_words_file_ == NULL) {
126  tprintf("Failed to open output_ambig_words_file %s\n",
127  output_ambig_words_file.string());
128  exit(1);
129  }
130  STRING word_str;
131  word->string_and_lengths(&word_str, NULL);
132  word_str += " ";
133  fprintf(output_ambig_words_file_, "%s", word_str.string());
134  }
135  STRING word_str;
136  word->string_and_lengths(&word_str, NULL);
137  word_str += " ";
138  fprintf(output_ambig_words_file_, "%s", word_str.string());
139  }
140  WERD_CHOICE *adjusted_word = word;
141  adjusted_word->set_permuter(more_args->permuter);
142  update_best_choice(*adjusted_word, best_choice);
143  } else { // search the next letter
144  // Make updated_* point to the next entries in the DawgPositionVector
145  // arrays (that were originally created in dawg_permute_and_select)
146  ++(more_args->updated_dawgs);
147  // Make active_dawgs and constraints point to the updated ones.
148  ++(more_args->active_dawgs);
149  permute_choices(debug, char_choices, char_choice_index + 1,
150  prev_char_frag_info, word, certainties, limit,
151  best_choice, attempts_left, more_args);
152  // Restore previous state to explore another letter in this position.
153  --(more_args->updated_dawgs);
154  --(more_args->active_dawgs);
155  }
156  } else {
157  if (dawg_debug_level) {
158  tprintf("last unichar not OK at index %d in %s\n",
159  word_index, word->debug_string().string());
160  }
161  }
162 }
163 
164 
175  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
176  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
177  best_choice->make_bad();
178  best_choice->set_rating(rating_limit);
179  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
180  return best_choice;
181  DawgPositionVector *active_dawgs =
182  new DawgPositionVector[char_choices.length() + 1];
183  init_active_dawgs(&(active_dawgs[0]), true);
184  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
186 
187  float certainties[MAX_WERD_LENGTH];
189  int attempts_left = max_permuter_attempts;
190  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL,
191  char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice,
192  &attempts_left, &dawg_args);
193  delete[] active_dawgs;
194  return best_choice;
195 }
196 
204  const char *debug,
205  const BLOB_CHOICE_LIST_VECTOR &char_choices,
206  int char_choice_index,
207  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
208  WERD_CHOICE *word,
209  float certainties[],
210  float *limit,
211  WERD_CHOICE *best_choice,
212  int *attempts_left,
213  void *more_args) {
214  if (debug) {
215  tprintf("%s permute_choices: char_choice_index=%d"
216  " limit=%g rating=%g, certainty=%g word=%s\n",
217  debug, char_choice_index, *limit, word->rating(),
218  word->certainty(), word->debug_string().string());
219  }
220  if (char_choice_index < char_choices.length()) {
221  BLOB_CHOICE_IT blob_choice_it;
222  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
223  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
224  blob_choice_it.forward()) {
225  (*attempts_left)--;
226  append_choices(debug, char_choices, *(blob_choice_it.data()),
227  char_choice_index, prev_char_frag_info, word,
228  certainties, limit, best_choice, attempts_left, more_args);
229  if (*attempts_left <= 0) {
230  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
231  break;
232  }
233  }
234  }
235 }
236 
246  const char *debug,
247  const BLOB_CHOICE_LIST_VECTOR &char_choices,
248  const BLOB_CHOICE &blob_choice,
249  int char_choice_index,
250  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
251  WERD_CHOICE *word,
252  float certainties[],
253  float *limit,
254  WERD_CHOICE *best_choice,
255  int *attempts_left,
256  void *more_args) {
257  int word_ending =
258  (char_choice_index == char_choices.length() - 1) ? true : false;
259 
260  // Deal with fragments.
261  CHAR_FRAGMENT_INFO char_frag_info;
262  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
263  blob_choice.certainty(), prev_char_frag_info, debug,
264  word_ending, &char_frag_info)) {
265  return; // blob_choice must be an invalid fragment
266  }
267  // Search the next letter if this character is a fragment.
268  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
269  permute_choices(debug, char_choices, char_choice_index + 1,
270  &char_frag_info, word, certainties, limit,
271  best_choice, attempts_left, more_args);
272  return;
273  }
274 
275  // Add the next unichar.
276  float old_rating = word->rating();
277  float old_certainty = word->certainty();
278  uinT8 old_permuter = word->permuter();
279  certainties[word->length()] = char_frag_info.certainty;
281  char_frag_info.unichar_id, char_frag_info.num_fragments,
282  char_frag_info.rating, char_frag_info.certainty);
283 
284  // Explore the next unichar.
285  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
286  &char_frag_info, word_ending, word, certainties,
287  limit, best_choice, attempts_left, more_args);
288 
289  // Remove the unichar we added to explore other choices in it's place.
290  word->remove_last_unichar_id();
291  word->set_rating(old_rating);
292  word->set_certainty(old_certainty);
293  word->set_permuter(old_permuter);
294 }
295 
322  float curr_rating, float curr_certainty,
323  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
324  const char *debug, int word_ending,
325  CHAR_FRAGMENT_INFO *char_frag_info) {
326  const CHAR_FRAGMENT *this_fragment =
327  getUnicharset().get_fragment(curr_unichar_id);
328  const CHAR_FRAGMENT *prev_fragment =
329  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
330 
331  // Print debug info for fragments.
332  if (debug && (prev_fragment || this_fragment)) {
333  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
334  getUnicharset().debug_str(curr_unichar_id).string(),
335  word_ending);
336  if (prev_fragment) {
337  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
338  }
339  if (this_fragment) {
340  tprintf("this_fragment %s\n", this_fragment->to_string().string());
341  }
342  }
343 
344  char_frag_info->unichar_id = curr_unichar_id;
345  char_frag_info->fragment = this_fragment;
346  char_frag_info->rating = curr_rating;
347  char_frag_info->certainty = curr_certainty;
348  char_frag_info->num_fragments = 1;
349  if (prev_fragment && !this_fragment) {
350  if (debug) tprintf("Skip choice with incomplete fragment\n");
351  return false;
352  }
353  if (this_fragment) {
354  // We are dealing with a fragment.
355  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
356  if (prev_fragment) {
357  if (!this_fragment->is_continuation_of(prev_fragment)) {
358  if (debug) tprintf("Non-matching fragment piece\n");
359  return false;
360  }
361  if (this_fragment->is_ending()) {
362  char_frag_info->unichar_id =
363  getUnicharset().unichar_to_id(this_fragment->get_unichar());
364  char_frag_info->fragment = NULL;
365  if (debug) {
366  tprintf("Built character %s from fragments\n",
367  getUnicharset().debug_str(
368  char_frag_info->unichar_id).string());
369  }
370  } else {
371  if (debug) tprintf("Record fragment continuation\n");
372  char_frag_info->fragment = this_fragment;
373  }
374  // Update certainty and rating.
375  char_frag_info->rating =
376  prev_char_frag_info->rating + curr_rating;
377  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
378  char_frag_info->certainty =
379  MIN(curr_certainty, prev_char_frag_info->certainty);
380  } else {
381  if (this_fragment->is_beginning()) {
382  if (debug) tprintf("Record fragment beginning\n");
383  } else {
384  if (debug) {
385  tprintf("Non-starting fragment piece with no prev_fragment\n");
386  }
387  return false;
388  }
389  }
390  }
391  if (word_ending && char_frag_info->fragment) {
392  if (debug) tprintf("Word can not end with a fragment\n");
393  return false;
394  }
395  return true;
396 }
397 
398 } // namespace tesseract
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:170
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void set_certainty(float new_val)
Definition: ratngs.h:370
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
bool is_ending() const
Definition: unicharset.h:102
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
float certainty
Definition: dict.h:44
int UNICHAR_ID
Definition: unichar.h:33
bool is_beginning() const
Definition: unicharset.h:99
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
int length() const
Definition: ratngs.h:301
const char * get_unichar() const
Definition: unicharset.h:64
int dawg_debug_level
Definition: dict.h:605
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:568
float rating() const
Definition: ratngs.h:79
const STRING debug_string() const
Definition: ratngs.h:503
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int size() const
Definition: genericvector.h:72
const CHAR_FRAGMENT * fragment
Definition: dict.h:41
float certainty() const
Definition: ratngs.h:82
static STRING to_string(const char *unichar, int pos, int total, bool natural)
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:245
#define ASSERT_HOST(x)
Definition: errcode.h:84
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
int num_fragments
Definition: dict.h:42
uinT8 permuter() const
Definition: ratngs.h:344
void remove_last_unichar_id()
Definition: ratngs.h:481
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:204
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:321
float rating
Definition: dict.h:43
Definition: strngs.h:45
DawgPositionVector * updated_dawgs
Definition: dict.h:81
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:92
T & get(int index) const
float certainty() const
Definition: ratngs.h:328
int max_permuter_attempts
Definition: dict.h:647
int length() const
Definition: genericvector.h:85
char * output_ambig_words_file
Definition: dict.h:603
void set_rating(float new_val)
Definition: ratngs.h:367
#define MIN(x, y)
Definition: ndminx.h:28
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450
DawgPositionVector * active_dawgs
Definition: dict.h:80
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:441
uint8_t uinT8
Definition: host.h:35
PermuterType permuter
Definition: dict.h:82
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
UNICHAR_ID unichar_id
Definition: dict.h:40
float rating() const
Definition: ratngs.h:325
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:51
#define MAX_WERD_LENGTH
Definition: dict.h:35