tesseract  4.00.00dev
segsearch.cpp
Go to the documentation of this file.
1 // File: segsearch.h
3 // Description: Segmentation search functions.
4 // Author: Daria Antonova
5 // Created: Mon Jun 23 11:26:43 PDT 2008
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "wordrec.h"
21 
22 #include "associate.h"
23 #include "language_model.h"
24 #include "matrix.h"
25 #include "params.h"
26 #include "lm_pain_points.h"
27 #include "ratngs.h"
28 
29 namespace tesseract {
30 
31 void Wordrec::DoSegSearch(WERD_RES* word_res) {
32  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
33  // Run Segmentation Search.
34  SegSearch(word_res, &best_choice_bundle, NULL);
35 }
36 
38  BestChoiceBundle* best_choice_bundle,
39  BlamerBundle* blamer_bundle) {
44  // Compute scaling factor that will help us recover blob outline length
45  // from classifier rating and certainty for the blob.
46  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
48  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
49  blamer_bundle);
50 
51  if (!SegSearchDone(0)) { // find a better choice
52  if (chop_enable && word_res->chopped_word != NULL) {
53  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
54  blamer_bundle, &pain_points, &pending);
55  }
56  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
57 
58  if (blamer_bundle != NULL &&
59  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
60  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
61  }
62  }
63  // Keep trying to find a better path by fixing the "pain points".
64 
65  MATRIX_COORD pain_point;
66  float pain_point_priority;
67  int num_futile_classifications = 0;
68  STRING blamer_debug;
69  while (wordrec_enable_assoc &&
70  (!SegSearchDone(num_futile_classifications) ||
71  (blamer_bundle != NULL &&
72  blamer_bundle->GuidedSegsearchStillGoing()))) {
73  // Get the next valid "pain point".
74  bool found_nothing = true;
75  LMPainPointsType pp_type;
76  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
77  LM_PPTYPE_NUM) {
78  if (!pain_point.Valid(*word_res->ratings)) {
79  word_res->ratings->IncreaseBandSize(
80  pain_point.row - pain_point.col + 1);
81  }
82  if (pain_point.Valid(*word_res->ratings) &&
83  !word_res->ratings->Classified(pain_point.col, pain_point.row,
84  getDict().WildcardID())) {
85  found_nothing = false;
86  break;
87  }
88  }
89  if (found_nothing) {
90  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
91  break;
92  }
93  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
95  &pending, word_res, &pain_points, blamer_bundle);
96 
97  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
98  word_res, &pain_points, best_choice_bundle,
99  blamer_bundle);
100  if (!best_choice_bundle->updated) ++num_futile_classifications;
101 
102  if (segsearch_debug_level > 0) {
103  tprintf("num_futile_classifications %d\n", num_futile_classifications);
104  }
105 
106  best_choice_bundle->updated = false; // reset updated
107 
108  // See if it's time to terminate SegSearch or time for starting a guided
109  // search for the true path to find the blame for the incorrect best_choice.
110  if (SegSearchDone(num_futile_classifications) &&
111  blamer_bundle != NULL &&
112  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
113  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
114  &blamer_debug);
115  }
116  } // end while loop exploring alternative paths
117  if (blamer_bundle != NULL) {
118  blamer_bundle->FinishSegSearch(word_res->best_choice,
119  wordrec_debug_blamer, &blamer_debug);
120  }
121 
122  if (segsearch_debug_level > 0) {
123  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
125  }
126 }
127 
128 // Setup and run just the initial segsearch on an established matrix,
129 // without doing any additional chopping or joining.
130 void Wordrec::WordSearch(WERD_RES* word_res) {
136  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
137  // Run Segmentation Search.
138  InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
139  if (segsearch_debug_level > 0) {
140  tprintf("Ending ratings matrix%s:\n",
141  wordrec_enable_assoc ? " (with assoc)" : "");
142  word_res->ratings->print(getDict().getUnicharset());
143  }
144 }
145 
146 
147 // Setup and run just the initial segsearch on an established matrix,
148 // without doing any additional chopping or joining.
149 // (Internal factored version that can be used as part of the main SegSearch.)
150 void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
152  BestChoiceBundle* best_choice_bundle,
153  BlamerBundle* blamer_bundle) {
154  if (segsearch_debug_level > 0) {
155  tprintf("Starting SegSearch on ratings matrix%s:\n",
156  wordrec_enable_assoc ? " (with assoc)" : "");
157  word_res->ratings->print(getDict().getUnicharset());
158  }
159 
160  pain_points->GenerateInitial(word_res);
161 
162  // Compute scaling factor that will help us recover blob outline length
163  // from classifier rating and certainty for the blob.
164  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
165 
168  segsearch_max_char_wh_ratio, rating_cert_scale);
169 
170  // Initialize blamer-related information: map character boxes recorded in
171  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
172  // ratings matrix. We expect this step to succeed, since when running the
173  // chopper we checked that the correct chops are present.
174  if (blamer_bundle != NULL) {
175  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
177  }
178 
179  // pending[col] tells whether there is update work to do to combine
180  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
181  // As the language model state is updated, pending entries are modified to
182  // minimize duplication of work. It is important that during the update the
183  // children are considered in the non-decreasing order of their column, since
184  // this guarantees that all the parents would be up to date before an update
185  // of a child is done.
186  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
187 
188  // Search the ratings matrix for the initial best path.
189  (*pending)[0].SetColumnClassified();
190  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
191  pain_points, best_choice_bundle, blamer_bundle);
192 }
193 
195  float rating_cert_scale,
196  int starting_col,
198  WERD_RES *word_res,
199  LMPainPoints *pain_points,
200  BestChoiceBundle *best_choice_bundle,
201  BlamerBundle *blamer_bundle) {
202  MATRIX *ratings = word_res->ratings;
203  ASSERT_HOST(ratings->dimension() == pending->size());
204  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
205  for (int col = starting_col; col < ratings->dimension(); ++col) {
206  if (!(*pending)[col].WorkToDo()) continue;
207  int first_row = col;
208  int last_row = MIN(ratings->dimension() - 1,
209  col + ratings->bandwidth() - 1);
210  if ((*pending)[col].SingleRow() >= 0) {
211  first_row = last_row = (*pending)[col].SingleRow();
212  }
213  if (segsearch_debug_level > 0) {
214  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
215  col, first_row, last_row,
216  (*pending)[col].IsRowJustClassified(MAX_INT32));
217  }
218  // Iterate over the pending list for this column.
219  for (int row = first_row; row <= last_row; ++row) {
220  // Update language model state of this child+parent pair.
221  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
222  LanguageModelState *parent_node =
223  col == 0 ? NULL : best_choice_bundle->beam[col - 1];
224  if (current_node != NULL &&
225  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
226  col, row, current_node, parent_node,
227  pain_points, word_res,
228  best_choice_bundle, blamer_bundle) &&
229  row + 1 < ratings->dimension()) {
230  // Since the language model state of this entry changed, process all
231  // the child column.
232  (*pending)[row + 1].RevisitWholeColumn();
233  if (segsearch_debug_level > 0) {
234  tprintf("Added child col=%d to pending\n", row + 1);
235  }
236  } // end if UpdateState.
237  } // end for row.
238  } // end for col.
239  if (best_choice_bundle->best_vse != NULL) {
240  ASSERT_HOST(word_res->StatesAllValid());
241  if (best_choice_bundle->best_vse->updated) {
242  pain_points->GenerateFromPath(rating_cert_scale,
243  best_choice_bundle->best_vse, word_res);
244  if (!best_choice_bundle->fixpt.empty()) {
245  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
246  best_choice_bundle->best_vse, word_res);
247  }
248  }
249  }
250  // The segsearch is completed. Reset all updated flags on all VSEs and reset
251  // all pendings.
252  for (int col = 0; col < pending->size(); ++col) {
253  (*pending)[col].Clear();
254  ViterbiStateEntry_IT
255  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
256  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
257  vse_it.data()->updated = false;
258  }
259  }
260 }
261 
263  float pain_point_priority,
264  const MATRIX_COORD &pain_point, const char* pain_point_type,
265  GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
266  LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
267  if (segsearch_debug_level > 0) {
268  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
269  pain_point_type, pain_point_priority,
270  pain_point.col, pain_point.row);
271  }
272  ASSERT_HOST(pain_points != NULL);
273  MATRIX *ratings = word_res->ratings;
274  // Classify blob [pain_point.col pain_point.row]
275  if (!pain_point.Valid(*ratings)) {
276  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
277  }
278  ASSERT_HOST(pain_point.Valid(*ratings));
279  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
280  pain_point.col, pain_point.row,
281  pain_point_type,
282  word_res->chopped_word,
283  blamer_bundle);
284  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
285  if (lst == NULL) {
286  ratings->put(pain_point.col, pain_point.row, classified);
287  } else {
288  // We can not delete old BLOB_CHOICEs, since they might contain
289  // ViterbiStateEntries that are parents of other "active" entries.
290  // Thus if the matrix cell already contains classifications we add
291  // the new ones to the beginning of the list.
292  BLOB_CHOICE_IT it(lst);
293  it.add_list_before(classified);
294  delete classified; // safe to delete, since empty after add_list_before()
295  classified = NULL;
296  }
297 
298  if (segsearch_debug_level > 0) {
299  print_ratings_list("Updated ratings matrix with a new entry:",
300  ratings->get(pain_point.col, pain_point.row),
301  getDict().getUnicharset());
302  ratings->print(getDict().getUnicharset());
303  }
304 
305  // Insert initial "pain points" to join the newly classified blob
306  // with its left and right neighbors.
307  if (classified != NULL && !classified->empty()) {
308  if (pain_point.col > 0) {
309  pain_points->GeneratePainPoint(
310  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
311  true, segsearch_max_char_wh_ratio, word_res);
312  }
313  if (pain_point.row + 1 < ratings->dimension()) {
314  pain_points->GeneratePainPoint(
315  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
316  true, segsearch_max_char_wh_ratio, word_res);
317  }
318  }
319  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
320 }
321 
322 // Resets enough of the results so that the Viterbi search is re-run.
323 // Needed when the n-gram model is enabled, as the multi-length comparison
324 // implementation will re-value existing paths to worse values.
326  BestChoiceBundle* best_choice_bundle,
328  // TODO(rays) More refactoring required here.
329  // Delete existing viterbi states.
330  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
331  best_choice_bundle->beam[col]->Clear();
332  }
333  // Reset best_choice_bundle.
334  word_res->ClearWordChoices();
335  best_choice_bundle->best_vse = NULL;
336  // Clear out all existing pendings and add a new one for the first column.
337  (*pending)[0].SetColumnClassified();
338  for (int i = 1; i < pending->size(); ++i)
339  (*pending)[i].Clear();
340 }
341 
343  LMPainPoints *pain_points,
344  BlamerBundle *blamer_bundle,
345  STRING *blamer_debug) {
346  pain_points->Clear(); // Clear pain points heap.
348  pain_points, &LMPainPoints::GenerateForBlamer,
349  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
350  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
351  getDict().WildcardID(), wordrec_debug_blamer,
352  blamer_debug, pp_cb);
353  delete pp_cb;
354 }
355 
356 } // namespace tesseract
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:425
void DoSegSearch(WERD_RES *word_res)
Definition: segsearch.cpp:31
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double certainty_scale
Definition: dict.h:611
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void init_to_size(int size, T t)
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:233
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
#define MAX_INT32
Definition: host.h:62
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
bool wordrec_enable_assoc
Definition: wordrec.h:130
void GenerateInitial(WERD_RES *word_res)
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
T get(ICOORD pos) const
Definition: matrix.h:223
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:506
#define tprintf(...)
Definition: tprintf.h:31
bool StatesAllValid()
Definition: pageres.cpp:450
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:310
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
PointerVector< LanguageModelState > beam
Definition: lm_state.h:231
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:325
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:342
#define ASSERT_HOST(x)
Definition: errcode.h:84
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:225
Definition: strngs.h:45
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:215
int segsearch_max_pain_points
Definition: wordrec.h:171
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
int bandwidth() const
Definition: matrix.h:523
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:227
void ClearWordChoices()
Definition: pageres.cpp:1175
Struct to store information maintained by various language model components.
Definition: lm_state.h:193
int dimension() const
Definition: matrix.h:521
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
void put(ICOORD pos, const T &thing)
Definition: matrix.h:215
#define MIN(x, y)
Definition: ndminx.h:28
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
Definition: matrix.h:563
int segsearch_debug_level
Definition: wordrec.h:169
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:456
bool Valid(const MATRIX &m) const
Definition: matrix.h:601
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:407
static void PrintSeams(const char *label, const GenericVector< SEAM *> &seams)
Definition: seam.cpp:173
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
TWERD * chopped_word
Definition: pageres.h:201
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:150
void WordSearch(WERD_RES *word_res)
Definition: segsearch.cpp:130
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
static const char * PainPointDescription(LMPainPointsType type)
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:473