tesseract  4.00.00dev
blamer.h
Go to the documentation of this file.
1 // File: blamer.h
3 // Description: Module allowing precise error causes to be allocated.
4 // Author: Rike Antonova
5 // Refactored: Ray Smith
6 // Created: Mon Feb 04 14:37:01 PST 2013
7 //
8 // (C) Copyright 2013, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
22 #define TESSERACT_CCSTRUCT_BLAMER_H_
23 
24 #include <stdio.h>
25 #include "boxword.h"
26 #include "genericvector.h"
27 #include "matrix.h"
29 #include "ratngs.h"
30 #include "strngs.h"
31 #include "tesscallback.h"
32 
33 static const inT16 kBlamerBoxTolerance = 5;
34 
35 // Enum for expressing the source of error.
36 // Note: Please update kIncorrectResultReasonNames when modifying this enum.
38  // The text recorded in best choice == truth text
40  // Either: Top choice is incorrect and is a dictionary word (language model
41  // is unlikely to help correct such errors, so blame the classifier).
42  // Or: the correct unichar was not included in shortlist produced by the
43  // classifier at all.
45  // Chopper has not found one or more splits that correspond to the correct
46  // character bounding boxes recorded in BlamerBundle::truth_word.
48  // Classifier did include correct unichars for each blob in the correct
49  // segmentation, however its rating could have been too bad to allow the
50  // language model to pull out the correct choice. On the other hand the
51  // strength of the language model might have been too weak to favor the
52  // correct answer; we call this case a classifier-language model
53  // tradeoff error.
55  // Page layout failed to produce the correct bounding box. Blame page layout
56  // if the truth was not found for the word, which implies that the bounding
57  // box of the word was incorrect (no truth word had a similar bounding box).
59  // SegSearch heuristic prevented one or more blobs from the correct
60  // segmentation state to be classified (e.g. the blob was too wide).
62  // The correct segmentation state was not explored because of poor SegSearch
63  // pain point prioritization. We blame SegSearch pain point prioritization
64  // if the best rating of a choice constructed from correct segmentation is
65  // better than that of the best choice (i.e. if we got to explore the correct
66  // segmentation state, language model would have picked the correct choice).
68  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
69  // and thus use the old language model (permuters).
70  // TODO(antonova): integrate the new language model with chopper
72  // If there is an incorrect adaptive template match with a better score than
73  // a correct one (either pre-trained or adapted), mark this as adaption error.
75  // split_and_recog_word() failed to find a suitable split in truth.
77  // Truth is not available for this word (e.g. when words in corrected content
78  // file are turned into ~~~~ because an appropriate alignment was not found).
80  // The text recorded in best choice != truth text, but none of the above
81  // reasons are set.
83 
85 };
86 
87 // Blamer-related information to determine the source of errors.
88 struct BlamerBundle {
89  static const char *IncorrectReasonName(IncorrectResultReason irr);
90  BlamerBundle() : truth_has_char_boxes_(false),
91  incorrect_result_reason_(IRR_CORRECT),
92  lattice_data_(NULL) { ClearResults(); }
// Copy constructor: copies the truth-related fields with CopyTruth() and then
// the result-related fields with CopyResults().
// NOTE(review): there is no member-initializer list here, so lattice_data_
// and lattice_size_ receive their values only inside CopyResults(). When
// other.lattice_data_ is NULL, CopyResults() assigns lattice_data_ but leaves
// lattice_size_ unassigned. debug_, misadaption_debug_ and
// params_training_bundle_ are not assigned by either helper, so they are not
// copied from `other`.
93  BlamerBundle(const BlamerBundle &other) {
94  this->CopyTruth(other);
95  this->CopyResults(other);
96  }
97  ~BlamerBundle() { delete[] lattice_data_; }
98 
99  // Accessors.
100  STRING TruthString() const {
101  STRING truth_str;
102  for (int i = 0; i < truth_text_.length(); ++i)
103  truth_str += truth_text_[i];
104  return truth_str;
105  }
107  return incorrect_result_reason_;
108  }
109  bool NoTruth() const {
110  return incorrect_result_reason_ == IRR_NO_TRUTH ||
111  incorrect_result_reason_ == IRR_PAGE_LAYOUT;
112  }
113  bool HasDebugInfo() const {
114  return debug_.length() > 0 || misadaption_debug_.length() > 0;
115  }
116  const STRING& debug() const {
117  return debug_;
118  }
119  const STRING& misadaption_debug() const {
120  return misadaption_debug_;
121  }
122  void UpdateBestRating(float rating) {
123  if (rating < best_correctly_segmented_rating_)
124  best_correctly_segmented_rating_ = rating;
125  }
127  return correct_segmentation_cols_.length();
128  }
129  // Returns true if the given ratings matrix col,row position is included
130  // in the correct segmentation path at the given index.
131  bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
132  return correct_segmentation_cols_[index] == coord.col &&
133  correct_segmentation_rows_[index] == coord.row;
134  }
136  best_choice_is_dict_and_top_choice_ = value;
137  }
138  const char* lattice_data() const {
139  return lattice_data_;
140  }
141  int lattice_size() const {
142  return lattice_size_; // size of lattice_data in bytes
143  }
// Replaces the stored lattice with a private copy of the first `size` bytes
// of `data`. Any buffer held previously is freed before the new one is
// allocated, and lattice_size_ is updated to `size`.
// NOTE(review): `data` and `size` are not validated here; presumably callers
// pass a non-NULL buffer and a non-negative size - confirm against callers.
144  void set_lattice_data(const char* data, int size) {
145  lattice_size_ = size;
146  delete [] lattice_data_;  // release the old buffer before reallocating
147  lattice_data_ = new char[lattice_size_];
148  memcpy(lattice_data_, data, lattice_size_);
149  }
151  return params_training_bundle_;
152  }
153  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
155  params_training_bundle_.AddHypothesis(hypo);
156  }
157 
158  // Functions to setup the blamer.
159  // Whole word string, whole word bounding box.
160  void SetWordTruth(const UNICHARSET& unicharset,
161  const char* truth_str, const TBOX& word_box);
162  // Single "character" string, "character" bounding box.
163  // May be called multiple times to indicate the characters in a word.
164  void SetSymbolTruth(const UNICHARSET& unicharset,
165  const char* char_str, const TBOX& char_box);
166  // Marks that there is something wrong with the truth text, like it contains
167  // reject characters.
168  void SetRejectedTruth();
169 
170  // Returns true if the provided word_choice is correct.
171  bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
172 
// Resets the per-recognition result state while leaving the truth data
// (truth_word_, truth_text_, truth_has_char_boxes_) untouched.
// The blame reason is reset to IRR_CORRECT unless NoTruth() holds, i.e. an
// IRR_NO_TRUTH or IRR_PAGE_LAYOUT reason is preserved.
// NOTE(review): debug_ is cleared but misadaption_debug_ is not; confirm
// whether that is intentional.
173  void ClearResults() {
174  norm_truth_word_.DeleteAllBoxes();
175  norm_box_tolerance_ = 0;
176  if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
177  debug_ = "";
178  segsearch_is_looking_for_blame_ = false;
179  best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
180  correct_segmentation_cols_.clear();
181  correct_segmentation_rows_.clear();
182  best_choice_is_dict_and_top_choice_ = false;
// Free the serialized lattice and mark it as empty.
183  delete[] lattice_data_;
184  lattice_data_ = NULL;
185  lattice_size_ = 0;
186  }
187  void CopyTruth(const BlamerBundle &other) {
188  truth_has_char_boxes_ = other.truth_has_char_boxes_;
189  truth_word_ = other.truth_word_;
190  truth_text_ = other.truth_text_;
191  incorrect_result_reason_ =
192  (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
193  }
// Copies the result-related fields (normalized truth word, tolerance, blame
// reason, segmentation-search state and the serialized lattice) from `other`.
// The lattice is deep-copied into a newly allocated buffer.
// NOTE(review): the current lattice_data_ is overwritten without delete[].
// That is required when called from the copy constructor (the pointer is not
// yet initialized there), but it would leak if this is called on a bundle
// that already owns a lattice - confirm against callers.
// NOTE(review): debug_, misadaption_debug_ and params_training_bundle_ are
// not copied here.
194  void CopyResults(const BlamerBundle &other) {
195  norm_truth_word_ = other.norm_truth_word_;
196  norm_box_tolerance_ = other.norm_box_tolerance_;
197  incorrect_result_reason_ = other.incorrect_result_reason_;
198  segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
199  best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
200  correct_segmentation_cols_ = other.correct_segmentation_cols_;
201  correct_segmentation_rows_ = other.correct_segmentation_rows_;
202  best_choice_is_dict_and_top_choice_ =
203  other.best_choice_is_dict_and_top_choice_;
204  if (other.lattice_data_ != NULL) {
205  lattice_data_ = new char[other.lattice_size_];
206  memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
207  lattice_size_ = other.lattice_size_;
208  } else {
// NOTE(review): lattice_size_ is not assigned on this path.
209  lattice_data_ = NULL;
210  }
211  }
212  const char *IncorrectReason() const;
213 
214  // Appends choice and truth details to the given debug string.
215  void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
216  STRING *debug);
217 
218  // Sets up the norm_truth_word from truth_word using the given DENORM.
219  void SetupNormTruthWord(const DENORM& denorm);
220 
221  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
222  // bundles) where the right edge of the left-hand word is word1_right,
223  // and the left edge of the right-hand word is word2_left.
224  void SplitBundle(int word1_right, int word2_left, bool debug,
225  BlamerBundle* bundle1, BlamerBundle* bundle2) const;
226  // "Joins" the blames from bundle1 and bundle2 into *this.
227  void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
228  bool debug);
229 
230  // If a blob with the same bounding box as one of the truth character
231  // bounding boxes is not classified as the corresponding truth character,
232  // blames the character classifier for the incorrect answer.
233  void BlameClassifier(const UNICHARSET& unicharset,
234  const TBOX& blob_box,
235  const BLOB_CHOICE_LIST& choices,
236  bool debug);
237 
238 
239  // Checks whether chops were made at all the character bounding box
240  // boundaries in word->truth_word. If not - blames the chopper for an
241  // incorrect answer.
242  void SetChopperBlame(const WERD_RES* word, bool debug);
243  // Blames the classifier or the language model if, after running only the
244  // chopper, best_choice is incorrect and no blame has been yet set.
245  // Blames the classifier if best_choice is classifier's top choice and is a
246  // dictionary word (i.e. language model could not have helped).
247  // Otherwise, blames the language model (formerly permuter word adjustment).
249  const WERD_RES* word,
250  const UNICHARSET& unicharset, bool valid_permuter, bool debug);
251  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
252  void SetupCorrectSegmentation(const TWERD* word, bool debug);
253 
254  // Returns true if a guided segmentation search is needed.
255  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
256  // Setup ready to guide the segmentation search to the correct segmentation.
257  // The callback pp_cb is used to avoid a cyclic dependency.
258  // It calls into LMPainPoints::GenerateForBlamer by pre-binding the
259  // WERD_RES, and the LMPainPoints itself.
260  // pp_cb must be a permanent callback, and should be deleted by the caller.
261  void InitForSegSearch(const WERD_CHOICE *best_choice,
262  MATRIX* ratings, UNICHAR_ID wildcard_id,
263  bool debug, STRING *debug_str,
265  // Returns true if the guided segsearch is in progress.
266  bool GuidedSegsearchStillGoing() const;
267  // The segmentation search has ended. Sets the blame appropriately.
268  void FinishSegSearch(const WERD_CHOICE *best_choice,
269  bool debug, STRING *debug_str);
270 
271  // If the bundle is null or still does not indicate the correct result,
272  // fix it and use some backup reason for the blame.
273  static void LastChanceBlame(bool debug, WERD_RES* word);
274 
275  // Sets the misadaption debug if this word is incorrect, as this word is
276  // being adapted to.
277  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
278 
279  private:
// Records `irr` as the blame reason and rebuilds debug_ from scratch:
// the text returned by IncorrectReason(), followed by " to blame: ", followed
// by whatever FillDebugString() appends for `msg` and `choice`.
// When `debug` is true the resulting string is also printed via tprintf.
280  void SetBlame(IncorrectResultReason irr, const STRING &msg,
281  const WERD_CHOICE *choice, bool debug) {
282  incorrect_result_reason_ = irr;
// IncorrectReason() is called after the assignment above, so the text starts
// from the newly stored reason.
283  debug_ = IncorrectReason();
284  debug_ += " to blame: ";
285  FillDebugString(msg, choice, &debug_);
286  if (debug) tprintf("SetBlame(): %s", debug_.string());
287  }
288 
289  private:
290  // Set to true when bounding boxes for individual unichars are recorded.
291  bool truth_has_char_boxes_;
292  // The truth_word (in the original image coordinate space) contains ground
293  // truth bounding boxes for this WERD_RES.
294  tesseract::BoxWord truth_word_;
295  // Same as above, but in normalized coordinates
296  // (filled in by WERD_RES::SetupForRecognition()).
297  tesseract::BoxWord norm_truth_word_;
298  // Tolerance for bounding box comparisons in normalized space.
299  int norm_box_tolerance_;
300  // Contains ground truth unichar for each of the bounding boxes in truth_word.
301  GenericVector<STRING> truth_text_;
302  // The reason for incorrect OCR result.
303  IncorrectResultReason incorrect_result_reason_;
304  // Debug text associated with the blame.
305  STRING debug_;
306  // Misadaption debug information (filled in if this word was misadapted to).
307  STRING misadaption_debug_;
308  // Variables used by the segmentation search when looking for the blame.
309  // Set to true while segmentation search is continued after the usual
310  // termination condition in order to look for the blame.
311  bool segsearch_is_looking_for_blame_;
312  // Best rating for correctly segmented path
313  // (set and used by SegSearch when looking for blame).
314  float best_correctly_segmented_rating_;
315  // Vectors populated by SegSearch to indicate column and row indices that
316  // correspond to blobs with correct bounding boxes.
317  GenericVector<int> correct_segmentation_cols_;
318  GenericVector<int> correct_segmentation_rows_;
319  // Set to true if best choice is a dictionary word and
320  // classifier's top choice.
321  bool best_choice_is_dict_and_top_choice_;
322  // Serialized segmentation search lattice.
323  char *lattice_data_;
324  int lattice_size_; // size of lattice_data in bytes
325  // Information about hypotheses (paths) explored by the segmentation search.
326  tesseract::ParamsTrainingBundle params_training_bundle_;
327 };
328 
329 
330 #endif // TESSERACT_CCSTRUCT_BLAMER_H_
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:194
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:187
void SetRejectedTruth()
Definition: blamer.cpp:105
int UNICHAR_ID
Definition: unichar.h:33
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:257
const char * lattice_data() const
Definition: blamer.h:138
int lattice_size() const
Definition: blamer.h:141
voidpf void uLong size
Definition: ioapi.h:39
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:169
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
Definition: blamer.cpp:123
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:506
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:310
const STRING & misadaption_debug() const
Definition: blamer.h:119
inT32 length() const
Definition: strngs.cpp:193
int16_t inT16
Definition: host.h:36
~BlamerBundle()
Definition: blamer.h:97
const char * IncorrectReason() const
Definition: blamer.cpp:60
const tesseract::ParamsTrainingBundle & params_training_bundle() const
Definition: blamer.h:150
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:369
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
Definition: blobs.h:395
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
Definition: blamer.cpp:66
void ClearResults()
Definition: blamer.h:173
const STRING & debug() const
Definition: blamer.h:116
void set_lattice_data(const char *data, int size)
Definition: blamer.h:144
Definition: strngs.h:45
static const float kBadRating
Definition: ratngs.h:273
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
BlamerBundle(const BlamerBundle &other)
Definition: blamer.h:93
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
BlamerBundle()
Definition: blamer.h:90
int length() const
Definition: genericvector.h:85
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
bool NoTruth() const
Definition: blamer.h:109
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
bool HasDebugInfo() const
Definition: blamer.h:113
Definition: rect.h:30
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:145
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:225
Definition: matrix.h:563
IncorrectResultReason
Definition: blamer.h:37
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:407
void UpdateBestRating(float rating)
Definition: blamer.h:122
int correct_segmentation_length() const
Definition: blamer.h:126
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
Definition: blamer.cpp:86
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
STRING TruthString() const
Definition: blamer.h:100
void DeleteAllBoxes()
Definition: boxword.cpp:177
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:473