tesseract  4.00.00dev
language_model.h
Go to the documentation of this file.
1 // File: language_model.h
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help segmentation
5 // search.
6 // Author: Daria Antonova
7 // Created: Mon Nov 11 11:26:43 PST 2009
8 //
9 // (C) Copyright 2009, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
24 
25 #include "associate.h"
26 #include "dawg.h"
27 #include "dict.h"
28 #include "fontinfo.h"
29 #include "intproto.h"
30 #include "lm_consistency.h"
31 #include "lm_pain_points.h"
32 #include "lm_state.h"
33 #include "matrix.h"
34 #include "params.h"
35 #include "pageres.h"
36 #include "params_model.h"
37 
38 namespace tesseract {
39 
40 // This class contains the data structures and functions necessary
41 // to represent and use the knowledge about the language.
43  public:
44  // Masks for keeping track of top choices that should not be pruned out.
48  static const LanguageModelFlagsType kDigitFlag = 0x8;
50 
51  // Denominator for normalizing per-letter ngram cost when deriving
52  // penalty adjustments.
53  static const float kMaxAvgNgramCost;
54 
55  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
57 
58  // Fills the given floats array with features extracted from path represented
59  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
60  // for feature information.
61  // Note: the function assumes that features points to an array of size
62  // PTRAIN_NUM_FEATURE_TYPES.
63  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
64  float features[]);
65 
66  // Updates data structures that are used for the duration of the segmentation
67  // search on the current word;
68  void InitForWord(const WERD_CHOICE *prev_word,
69  bool fixed_pitch, float max_char_wh_ratio,
70  float rating_cert_scale);
71 
72  // Updates language model state of the given BLOB_CHOICE_LIST (from
73  // the ratings matrix) and its parent. Updates pain_points if new
74  // problematic points are found in the segmentation graph.
75  //
76  // At most language_model_viterbi_list_size are kept in each
77  // LanguageModelState.viterbi_state_entries list.
78  // At most language_model_viterbi_list_max_num_prunable of those are prunable
79  // (non-dictionary) paths.
80  // The entries that represent dictionary word paths are kept at the front
81  // of the list.
82  // The list is ordered by a cost that is computed collectively by several
83  // language model components (currently dawg and ngram components).
84  bool UpdateState(
85  bool just_classified,
86  int curr_col, int curr_row,
87  BLOB_CHOICE_LIST *curr_list,
88  LanguageModelState *parent_node,
89  LMPainPoints *pain_points,
90  WERD_RES *word_res,
91  BestChoiceBundle *best_choice_bundle,
92  BlamerBundle *blamer_bundle);
93 
94  // Returns true if an acceptable best choice was discovered.
96  inline void SetAcceptableChoiceFound(bool val) {
98  }
99  // Returns the reference to ParamsModel.
101 
102  protected:
103 
104  inline float CertaintyScore(float cert) {
106  // cert is assumed to be between 0 and -dict_->certainty_scale.
107  // If you enable language_model_use_sigmoidal_certainty, you
108  // need to adjust language_model_ngram_nonmatch_score as well.
109  cert = -cert / dict_->certainty_scale;
110  return 1.0f / (1.0f + exp(10.0f * cert));
111  } else {
112  return (-1.0f / cert);
113  }
114  }
115 
116  inline float ComputeAdjustment(int num_problems, float penalty) {
117  if (num_problems == 0) return 0.0f;
118  if (num_problems == 1) return penalty;
119  return (penalty + (language_model_penalty_increment *
120  static_cast<float>(num_problems-1)));
121  }
122 
123  // Computes the adjustment to the ratings sum based on the given
124  // consistency_info. The paths with invalid punctuation, inconsistent
125  // case and character type are penalized proportionally to the number
126  // of inconsistencies on the path.
128  const LanguageModelDawgInfo *dawg_info,
129  const LMConsistencyInfo &consistency_info) {
130  if (dawg_info != NULL) {
131  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
133  (consistency_info.inconsistent_script ?
135  }
136  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
138  ComputeAdjustment(consistency_info.NumInconsistentCase(),
140  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
142  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
144  (consistency_info.inconsistent_script ?
146  (consistency_info.inconsistent_font ?
148  }
149 
150  // Returns an adjusted ratings sum that includes inconsistency penalties,
151  // penalties for non-dictionary paths and paths with dips in ngram
152  // probability.
154 
155  // Finds the first lower and upper case letter and first digit in curr_list.
156  // Uses the first character in the list in place of empty results.
157  // Returns true if both alpha and digits are found.
158  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
159  BLOB_CHOICE **first_lower,
160  BLOB_CHOICE **first_upper,
161  BLOB_CHOICE **first_digit) const;
162  // Forces there to be at least one entry in the overall set of the
163  // viterbi_state_entries of each element of parent_node that has the
164  // top_choice_flag set for lower, upper and digit using the same rules as
165  // GetTopLowerUpperDigit, setting the flag on the first found suitable
166  // candidate, whether or not the flag is set on some other parent.
167  // Returns 1 if both alpha and digits are found among the parents, -1 if no
168  // parents are found at all (a legitimate case), and 0 otherwise.
169  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
170 
171  // Finds the next ViterbiStateEntry with which the given unichar_id can
172  // combine sensibly, taking into account any mixed alnum/mixed case
173  // situation, and whether this combination has been inspected before.
175  bool just_classified, bool mixed_alnum,
176  const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
177  const UNICHARSET& unicharset, WERD_RES* word_res,
178  ViterbiStateEntry_IT* vse_it,
179  LanguageModelFlagsType* top_choice_flags) const;
180  // Helper function that computes the cost of the path composed of the
181  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
182  // If the new path looks good enough, adds a new ViterbiStateEntry to the
183  // list of viterbi entries in the given BLOB_CHOICE and returns true.
185  LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
186  int curr_col, int curr_row, BLOB_CHOICE *b,
187  LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
188  LMPainPoints *pain_points, WERD_RES *word_res,
189  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
190 
191  // Determines whether a potential entry is a true top choice and
192  // updates changed accordingly.
193  //
194  // Note: The function assumes that b, top_choice_flags and changed
195  // are not NULL.
197  const ViterbiStateEntry *parent_vse,
198  LanguageModelState *lms);
199 
200  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
201  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
202  // with updated active dawgs, constraints and permuter.
203  //
204  // Note: the caller is responsible for deleting the returned pointer.
206  int curr_col, int curr_row,
207  const BLOB_CHOICE &b,
208  const ViterbiStateEntry *parent_vse);
209 
210  // Computes p(unichar | parent context) and records it in ngram_cost.
211  // If b.unichar_id() is an unlikely continuation of the parent context
212  // sets found_small_prob to true and returns NULL.
213  // Otherwise creates a new LanguageModelNgramInfo entry containing the
214  // updated context (that includes b.unichar_id() at the end) and returns it.
215  //
216  // Note: the caller is responsible for deleting the returned pointer.
218  const char *unichar, float certainty, float denom,
219  int curr_col, int curr_row, float outline_length,
220  const ViterbiStateEntry *parent_vse);
221 
222  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
223  // for the given unichar in the given context. If there are multiple
224  // unichars at one position - takes the average of their probabilities.
225  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
226  // since probability_in_context() can only handle one at a time (while
227  // unicharset might contain ngrams and glyphs composed from multiple UTF8
228  // characters).
229  float ComputeNgramCost(const char *unichar, float certainty, float denom,
230  const char *context, int *unichar_step_len,
231  bool *found_small_prob, float *ngram_prob);
232 
233  // Computes the normalization factors for the classifier confidences
234  // (used by ComputeNgramCost()).
235  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
236 
237  // Fills the given consistency_info based on parent_vse.consistency_info
238  // and on the consistency of the given unichar_id with parent_vse.
239  void FillConsistencyInfo(
240  int curr_col, bool word_end, BLOB_CHOICE *b,
241  ViterbiStateEntry *parent_vse,
242  WERD_RES *word_res,
243  LMConsistencyInfo *consistency_info);
244 
245  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
246  // on the path represented by the given BLOB_CHOICE and language model
247  // state entries (lmse, dse). The path is re-constructed by following
248  // the parent pointers in the lang model state entries). If the
249  // constructed WERD_CHOICE is better than the best/raw choice recorded
250  // in the best_choice_bundle, this function updates the corresponding
251  // fields and sets best_choice_bundle->updated to true.
253  LMPainPoints *pain_points,
254  WERD_RES *word_res,
255  BestChoiceBundle *best_choice_bundle,
256  BlamerBundle *blamer_bundle);
257 
258  // Constructs a WERD_CHOICE by tracing parent pointers starting with
259  // the given LanguageModelStateEntry. Returns the constructed word.
260  // Updates best_char_choices, certainties and state if they are not
261  // NULL (best_char_choices and certainties are assumed to have the
262  // length equal to lmse->length).
263  // The caller is responsible for freeing memory associated with the
264  // returned WERD_CHOICE.
266  WERD_RES *word_res,
267  DANGERR *fixpt,
268  BlamerBundle *blamer_bundle,
269  bool *truth_path);
270 
271  // Wrapper around AssociateUtils::ComputeStats().
272  inline void ComputeAssociateStats(int col, int row,
273  float max_char_wh_ratio,
274  ViterbiStateEntry *parent_vse,
275  WERD_RES *word_res,
276  AssociateStats *associate_stats) {
278  col, row,
279  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
280  (parent_vse != NULL) ? parent_vse->length : 0,
281  fixed_pitch_, max_char_wh_ratio,
282  word_res, language_model_debug_level > 2, associate_stats);
283  }
284 
285  // Returns true if the path with such top_choice_flags and dawg_info
286  // could be pruned out (i.e. is neither a system/user/frequent dictionary
287  // nor a top choice path).
288  // In non-space delimited languages all paths can be "somewhat" dictionary
289  // words. In such languages we can not do dictionary-driven path pruning,
290  // so paths with non-empty dawg_info are considered prunable.
291  inline bool PrunablePath(const ViterbiStateEntry &vse) {
292  if (vse.top_choice_flags) return false;
293  if (vse.dawg_info != NULL &&
296  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
297  return true;
298  }
299 
300  // Returns true if the given ViterbiStateEntry represents an acceptable path.
301  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
302  return (vse.dawg_info != NULL || vse.Consistent() ||
303  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
304  }
305 
306  public:
307  // Parameters.
308  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
310  "Turn on/off the use of character ngram model");
312  "Maximum order of the character ngram model");
314  "Maximum number of prunable (those for which PrunablePath() is"
315  " true) entries in each viterbi list recorded in BLOB_CHOICEs");
317  "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
319  "To avoid overly small denominators use this as the floor"
320  " of the probability returned by the ngram model");
322  "Average classifier score of a non-matching unichar");
324  "Use only the first UTF8 step of the given string"
325  " when computing log probabilities");
327  "Strength of the character ngram model relative to the"
328  " character classifier ");
330  "Factor to bring log-probs into the same range as ratings"
331  " when multiplied by outline length ");
333  "Words are delimited by space");
335  "Minimum length of compound words");
336  // Penalties used for adjusting path costs and final word rating.
338  "Penalty for words not in the frequent word dictionary");
340  "Penalty for non-dictionary words");
342  "Penalty for inconsistent punctuation");
344  "Penalty for inconsistent case");
346  "Penalty for inconsistent script");
348  "Penalty for inconsistent character type");
350  "Penalty for inconsistent font");
352  "Penalty for inconsistent spacing");
353  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
354  INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
356  "Use sigmoidal score for certainty");
357 
358 
359  protected:
360  // Member Variables.
361 
362  // Temporary DawgArgs struct that is re-used across different words to
363  // avoid dynamic memory re-allocation (should be cleared before each use).
365  // Scaling for recovering blob outline length from rating and certainty.
367 
368  // The following variables are set at construction time.
369 
370  // Pointer to fontinfo table (not owned by LanguageModel).
372 
373  // Pointer to Dict class, that is used for querying the dictionaries
374  // (the pointer is not owned by LanguageModel).
376 
377  // TODO(daria): the following variables should become LanguageModel params
378  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
379  //
380  // Set to true if we are dealing with fixed pitch text
381  // (set to assume_fixed_pitch_char_segment).
383  // Max char width-to-height ratio allowed
384  // (set to segsearch_max_char_wh_ratio).
386 
387  // The following variables are initialized with InitForWord().
388 
389  // String representation of the classification of the previous word
390  // (since this is only used by the character ngram model component,
391  // only the last language_model_ngram_order of the word are stored).
394  // Active dawg vector.
397  // Set to true if acceptable choice was discovered.
398  // Note: it would be nice to use this to terminate the search once an
399  // acceptable choices is found. However we do not do that and once an
400  // acceptable choice is found we finish looking for alternative choices
401  // in the current segmentation graph and then exit the search (no more
402  // classifications are done after an acceptable choice is found).
403  // This is needed in order to let the search find the words very close to
404  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
405  // choices. This way the stopper will know that the best choice is not
406  // ambiguous (i.e. there are best choices in the best choice list that have
407  // ratings close to the very best one) and will be less likely to mis-adapt.
409  // Set to true if a choice representing correct segmentation was explored.
411 
412  // Params model containing weights for computing ViterbiStateEntry costs.
414 };
415 
416 } // namespace tesseract
417 
418 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
double certainty_scale
Definition: dict.h:611
bool AcceptablePath(const ViterbiStateEntry &vse)
int language_model_viterbi_list_max_num_prunable
float ComputeAdjustment(int num_problems, float penalty)
static const LanguageModelFlagsType kUpperCaseFlag
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
static const LanguageModelFlagsType kDigitFlag
void SetAcceptableChoiceFound(bool val)
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
bool language_model_ngram_space_delimited_language
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
static const LanguageModelFlagsType kLowerCaseFlag
double language_model_penalty_non_dict_word
bool language_model_ngram_use_only_first_uft8_step
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
const UnicityTable< FontInfo > * fontinfo_table_
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
DawgPositionVector very_beginning_active_dawgs_
Definition: strngs.h:45
float CertaintyScore(float cert)
#define double_VAR_H(name, val, comment)
Definition: params.h:273
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:215
ParamsModel & getParamsModel()
static const float kMaxAvgNgramCost
double language_model_penalty_non_freq_dict_word
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:37
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
static const LanguageModelFlagsType kXhtConsistentFlag
Struct to store information maintained by various language model components.
Definition: lm_state.h:193
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:182
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double language_model_ngram_nonmatch_score
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
bool PrunablePath(const ViterbiStateEntry &vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:174
#define INT_VAR_H(name, val, comment)
Definition: params.h:264
DawgPositionVector beginning_active_dawgs_
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:178
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
static const LanguageModelFlagsType kSmallestRatingFlag
AssociateStats associate_stats
Definition: lm_state.h:170
const char features[]
Definition: feature_tests.c:2
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)