tesseract  4.00.00dev
chopper.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: chopper.c (Formerly chopper.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  **************************************************************************/
25 
26 /*----------------------------------------------------------------------
27  I n c l u d e s
28 ----------------------------------------------------------------------*/
29 
30 #include <math.h>
31 
32 #include "chopper.h"
33 
34 #include "assert.h"
35 #include "associate.h"
36 #include "blobs.h"
37 #include "callcpp.h"
38 #include "const.h"
39 #include "findseam.h"
40 #include "globals.h"
41 #include "render.h"
42 #include "pageres.h"
43 #include "seam.h"
44 #include "stopper.h"
45 #include "structures.h"
46 #include "unicharset.h"
47 #include "wordrec.h"
48 
49 // Include automatically generated configuration file if running autoconf.
50 #ifdef HAVE_CONFIG_H
51 #include "config_auto.h"
52 #endif
53 
54 // Even though the limit on the number of chunks may now be removed, keep
55 // the same limit for repeatable behavior, and it may be a speed advantage.
56 static const int kMaxNumChunks = 64;
57 
58 /*----------------------------------------------------------------------
59  F u n c t i o n s
60 ----------------------------------------------------------------------*/
66 void preserve_outline(EDGEPT *start) {
67  EDGEPT *srcpt;
68 
69  if (start == NULL)
70  return;
71  srcpt = start;
72  do {
73  srcpt->flags[1] = 1;
74  srcpt = srcpt->next;
75  }
76  while (srcpt != start);
77  srcpt->flags[1] = 2;
78 }
79 
80 
81 /**************************************************************************/
83  TESSLINE *outline;
84 
85  for (outline = srcline; outline != NULL; outline = outline->next) {
86  preserve_outline (outline->loop);
87  }
88 }
89 
90 
97  EDGEPT *srcpt;
98  EDGEPT *real_start;
99 
100  if (start == NULL)
101  return NULL;
102  srcpt = start;
103  do {
104  if (srcpt->flags[1] == 2)
105  break;
106  srcpt = srcpt->next;
107  }
108  while (srcpt != start);
109  real_start = srcpt;
110  do {
111  srcpt = srcpt->next;
112  if (srcpt->prev->flags[1] == 0) {
113  remove_edgept(srcpt->prev);
114  }
115  }
116  while (srcpt != real_start);
117  return real_start;
118 }
119 
120 
121 /******************************************************************************/
123  TESSLINE *outline;
124 
125  for (outline = srcline; outline != NULL; outline = outline->next) {
126  outline->loop = restore_outline (outline->loop);
127  outline->start = outline->loop->pos;
128  }
129 }
130 
131 // Helper runs all the checks on a seam to make sure it is valid.
132 // Returns the seam if OK, otherwise deletes the seam and returns NULL.
133 static SEAM* CheckSeam(int debug_level, inT32 blob_number, TWERD* word,
134  TBLOB* blob, TBLOB* other_blob,
135  const GenericVector<SEAM*>& seams, SEAM* seam) {
136  if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL ||
137  total_containment(blob, other_blob) || check_blob(other_blob) ||
138  !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
139  any_shared_split_points(seams, seam) ||
140  !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
141  word->blobs.remove(blob_number + 1);
142  if (seam) {
143  seam->UndoSeam(blob, other_blob);
144  delete seam;
145  seam = NULL;
146 #ifndef GRAPHICS_DISABLED
147  if (debug_level) {
148  if (debug_level >2)
149  display_blob(blob, Red);
150  tprintf("\n** seam being removed ** \n");
151  }
152 #endif
153  } else {
154  delete other_blob;
155  }
156  return NULL;
157  }
158  return seam;
159 }
160 
161 
168 namespace tesseract {
169 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
170  bool italic_blob,
171  const GenericVector<SEAM*>& seams) {
174  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
175  // Insert it into the word.
176  word->blobs.insert(other_blob, blob_number + 1);
177 
178  SEAM *seam = NULL;
179  if (prioritize_division) {
180  TPOINT location;
181  if (divisible_blob(blob, italic_blob, &location)) {
182  seam = new SEAM(0.0f, location);
183  }
184  }
185  if (seam == NULL)
186  seam = pick_good_seam(blob);
187  if (chop_debug) {
188  if (seam != NULL)
189  seam->Print("Good seam picked=");
190  else
191  tprintf("\n** no seam picked *** \n");
192  }
193  if (seam) {
194  seam->ApplySeam(italic_blob, blob, other_blob);
195  }
196 
197  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
198  seams, seam);
199  if (seam == NULL) {
203  // If the blob can simply be divided into outlines, then do that.
204  TPOINT location;
205  if (divisible_blob(blob, italic_blob, &location)) {
206  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
207  word->blobs.insert(other_blob, blob_number + 1);
208  seam = new SEAM(0.0f, location);
209  seam->ApplySeam(italic_blob, blob, other_blob);
210  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
211  seams, seam);
212  }
213  }
214  }
215  if (seam != NULL) {
216  // Make sure this seam doesn't get chopped again.
217  seam->Finalize();
218  }
219  return seam;
220 }
221 
222 
224  bool italic_blob,
225  const GenericVector<SEAM*>& seams) {
226  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
227  italic_blob, seams);
228 }
229 
230 
232  bool italic_blob, WERD_RES *word_res,
233  int *blob_number) {
234  TWERD *word = word_res->chopped_word;
235  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
236  TBLOB *blob = word->blobs[*blob_number];
237  TPOINT topleft, botright;
238  topleft.x = blob->bounding_box().left();
239  topleft.y = blob->bounding_box().top();
240  botright.x = blob->bounding_box().right();
241  botright.y = blob->bounding_box().bottom();
242 
243  TPOINT original_topleft, original_botright;
244  word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
245  word_res->denorm.DenormTransform(NULL, botright, &original_botright);
246 
247  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
248  original_botright.x, original_topleft.y);
249 
250  bool almost_equal_box = false;
251  int num_overlap = 0;
252  for (int i = 0; i < boxes.size(); i++) {
253  if (original_box.overlap_fraction(boxes[i]) > 0.125)
254  num_overlap++;
255  if (original_box.almost_equal(boxes[i], 3))
256  almost_equal_box = true;
257  }
258 
259  TPOINT location;
260  if (divisible_blob(blob, italic_blob, &location) ||
261  (!almost_equal_box && num_overlap > 1)) {
262  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
263  italic_blob, word_res->seam_array);
264  if (seam != NULL)
265  return seam;
266  }
267  }
268 
269  *blob_number = -1;
270  return NULL;
271 }
272 
273 } // namespace tesseract
274 
275 
282  int length;
283  int index;
284 
285  length = seams.size();
286  for (index = 0; index < length; index++)
287  if (seam->SharesPosition(*seams[index])) return TRUE;
288  return FALSE;
289 }
290 
291 
297 int check_blob(TBLOB *blob) {
298  TESSLINE *outline;
299  EDGEPT *edgept;
300 
301  for (outline = blob->outlines; outline != NULL; outline = outline->next) {
302  edgept = outline->loop;
303  do {
304  if (edgept == NULL)
305  break;
306  edgept = edgept->next;
307  }
308  while (edgept != outline->loop);
309  if (edgept == NULL)
310  return 1;
311  }
312  return 0;
313 }
314 
315 
316 namespace tesseract {
330  DANGERR *fixpt,
331  bool split_next_to_fragment,
332  bool italic_blob,
333  WERD_RES* word,
334  int* blob_number) {
335  float rating_ceiling = MAX_FLOAT32;
336  SEAM *seam = NULL;
337  do {
338  *blob_number = select_blob_to_split_from_fixpt(fixpt);
339  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
340  bool split_point_from_dict = (*blob_number != -1);
341  if (split_point_from_dict) {
342  fixpt->clear();
343  } else {
344  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
345  split_next_to_fragment);
346  }
347  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
348  if (*blob_number == -1)
349  return NULL;
350 
351  // TODO(rays) it may eventually help to allow italic_blob to be true,
352  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
353  word->seam_array);
354  if (seam != NULL)
355  return seam; // Success!
356  if (blob_choices[*blob_number] == NULL)
357  return NULL;
358  if (!split_point_from_dict) {
359  // We chopped the worst rated blob, try something else next time.
360  rating_ceiling = blob_choices[*blob_number]->rating();
361  }
362  } while (true);
363  return seam;
364 }
365 
374  const GenericVector<BLOB_CHOICE*>& blob_choices,
375  WERD_RES* word_res,
376  int* blob_number) {
377  if (prioritize_division) {
378  return chop_overlapping_blob(boxes, true, word_res, blob_number);
379  } else {
380  return improve_one_blob(blob_choices, NULL, false, true, word_res,
381  blob_number);
382  }
383 }
384 
394  int num_blobs = word->chopped_word->NumBlobs();
395  if (word->ratings == NULL) {
396  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
397  }
398  if (word->ratings->get(0, 0) == NULL) {
399  // Run initial classification.
400  for (int b = 0; b < num_blobs; ++b) {
401  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
402  "Initial:", word->chopped_word,
403  word->blamer_bundle);
404  word->ratings->put(b, b, choices);
405  }
406  } else {
407  // Blobs have been pre-classified. Set matrix cell for all blob choices
408  for (int col = 0; col < word->ratings->dimension(); ++col) {
409  for (int row = col; row < word->ratings->dimension() &&
410  row < col + word->ratings->bandwidth(); ++row) {
411  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
412  if (choices != NULL) {
413  BLOB_CHOICE_IT bc_it(choices);
414  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
415  bc_it.data()->set_matrix_cell(col, row);
416  }
417  }
418  }
419  }
420  }
421 
422  // Run Segmentation Search.
423  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
424  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
425 
426  if (word->best_choice == NULL) {
427  // SegSearch found no valid paths, so just use the leading diagonal.
429  }
430  word->RebuildBestState();
431  // If we finished without a hyphen at the end of the word, let the next word
432  // be found in the dictionary.
433  if (word->word->flag(W_EOL) &&
434  !getDict().has_hyphen_end(*word->best_choice)) {
435  getDict().reset_hyphen_vars(true);
436  }
437 
438  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
439  CallFillLattice(*word->ratings, word->best_choices,
440  *word->uch_set, word->blamer_bundle);
441  }
442  if (wordrec_debug_level > 0) {
443  tprintf("Final Ratings Matrix:\n");
444  word->ratings->print(getDict().getUnicharset());
445  }
446  word->FilterWordChoices(getDict().stopper_debug_level);
447 }
448 
456 void Wordrec::improve_by_chopping(float rating_cert_scale,
457  WERD_RES* word,
458  BestChoiceBundle* best_choice_bundle,
459  BlamerBundle* blamer_bundle,
460  LMPainPoints* pain_points,
462  int blob_number;
463  do { // improvement loop.
464  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
465  // one to chop.
466  GenericVector<BLOB_CHOICE*> blob_choices;
467  int num_blobs = word->ratings->dimension();
468  for (int i = 0; i < num_blobs; ++i) {
469  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
470  if (choices == NULL || choices->empty()) {
471  blob_choices.push_back(NULL);
472  } else {
473  BLOB_CHOICE_IT bc_it(choices);
474  blob_choices.push_back(bc_it.data());
475  }
476  }
477  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
478  false, false, word, &blob_number);
479  if (seam == NULL) break;
480  // A chop has been made. We have to correct all the data structures to
481  // take into account the extra bottom-level blob.
482  // Put the seam into the seam_array and correct everything else on the
483  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
484  // states in WERD_CHOICEs, and blob widths.
485  word->InsertSeam(blob_number, seam);
486  // Insert a new entry in the beam array.
487  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
488  // Fixpts are outdated, but will get recalculated.
489  best_choice_bundle->fixpt.clear();
490  // Remap existing pain points.
491  pain_points->RemapForSplit(blob_number);
492  // Insert a new pending at the chop point.
493  pending->insert(SegSearchPending(), blob_number);
494 
495  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
496  // as that updates the pending correctly and adds new pain points.
497  MATRIX_COORD pain_point(blob_number, blob_number);
498  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
499  pain_points, blamer_bundle);
500  pain_point.col = blob_number + 1;
501  pain_point.row = blob_number + 1;
502  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
503  pain_points, blamer_bundle);
505  // N-gram evaluation depends on the number of blobs in a chunk, so we
506  // have to re-evaluate everything in the word.
507  ResetNGramSearch(word, best_choice_bundle, pending);
508  blob_number = 0;
509  }
510  // Run language model incrementally. (Except with the n-gram model on.)
511  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
512  word, pain_points, best_choice_bundle, blamer_bundle);
513  } while (!language_model_->AcceptableChoiceFound() &&
514  word->ratings->dimension() < kMaxNumChunks);
515 
516  // If after running only the chopper best_choice is incorrect and no blame
517  // has been yet set, blame the classifier if best_choice is classifier's
518  // top choice and is a dictionary word (i.e. language model could not have
519  // helped). Otherwise blame the tradeoff between the classifier and
520  // the old language model (permuters).
521  if (word->blamer_bundle != NULL &&
523  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
524  bool valid_permuter = word->best_choice != NULL &&
527  getDict().getUnicharset(),
528  valid_permuter,
530  }
531 }
532 
533 
534 /**********************************************************************
535  * select_blob_to_split
536  *
537  * These are the results of the last classification. Find a likely
538  * place to apply splits. If none, return -1.
539  **********************************************************************/
541  const GenericVector<BLOB_CHOICE*>& blob_choices,
542  float rating_ceiling, bool split_next_to_fragment) {
543  BLOB_CHOICE *blob_choice;
544  int x;
545  float worst = -MAX_FLOAT32;
546  int worst_index = -1;
547  float worst_near_fragment = -MAX_FLOAT32;
548  int worst_index_near_fragment = -1;
549  const CHAR_FRAGMENT **fragments = NULL;
550 
551  if (chop_debug) {
552  if (rating_ceiling < MAX_FLOAT32)
553  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
554  else
555  tprintf("rating_ceiling = No Limit\n");
556  }
557 
558  if (split_next_to_fragment && blob_choices.size() > 0) {
559  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
560  if (blob_choices[0] != NULL) {
561  fragments[0] = getDict().getUnicharset().get_fragment(
562  blob_choices[0]->unichar_id());
563  } else {
564  fragments[0] = NULL;
565  }
566  }
567 
568  for (x = 0; x < blob_choices.size(); ++x) {
569  if (blob_choices[x] == NULL) {
570  delete[] fragments;
571  return x;
572  } else {
573  blob_choice = blob_choices[x];
574  // Populate fragments for the following position.
575  if (split_next_to_fragment && x+1 < blob_choices.size()) {
576  if (blob_choices[x + 1] != NULL) {
577  fragments[x + 1] = getDict().getUnicharset().get_fragment(
578  blob_choices[x + 1]->unichar_id());
579  } else {
580  fragments[x + 1] = NULL;
581  }
582  }
583  if (blob_choice->rating() < rating_ceiling &&
584  blob_choice->certainty() < tessedit_certainty_threshold) {
585  // Update worst and worst_index.
586  if (blob_choice->rating() > worst) {
587  worst_index = x;
588  worst = blob_choice->rating();
589  }
590  if (split_next_to_fragment) {
591  // Update worst_near_fragment and worst_index_near_fragment.
592  bool expand_following_fragment =
593  (x + 1 < blob_choices.size() &&
594  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
595  bool expand_preceding_fragment =
596  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
597  if ((expand_following_fragment || expand_preceding_fragment) &&
598  blob_choice->rating() > worst_near_fragment) {
599  worst_index_near_fragment = x;
600  worst_near_fragment = blob_choice->rating();
601  if (chop_debug) {
602  tprintf("worst_index_near_fragment=%d"
603  " expand_following_fragment=%d"
604  " expand_preceding_fragment=%d\n",
605  worst_index_near_fragment,
606  expand_following_fragment,
607  expand_preceding_fragment);
608  }
609  }
610  }
611  }
612  }
613  }
614  delete[] fragments;
615  // TODO(daria): maybe a threshold of badness for
616  // worst_near_fragment would be useful.
617  return worst_index_near_fragment != -1 ?
618  worst_index_near_fragment : worst_index;
619 }
620 
621 /**********************************************************************
622  * select_blob_to_split_from_fixpt
623  *
624  * Given the fix point from a dictionary search, if there is a single
625  * dangerous blob that maps to multiple characters, return that blob
626  * index as a place we need to split. If none, return -1.
627  **********************************************************************/
629  if (!fixpt)
630  return -1;
631  for (int i = 0; i < fixpt->size(); i++) {
632  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
633  (*fixpt)[i].dangerous &&
634  (*fixpt)[i].correct_is_ngram) {
635  return (*fixpt)[i].begin;
636  }
637  }
638  return -1;
639 }
640 
641 
642 } // namespace tesseract
643 
644 
645 /**********************************************************************
646  * total_containment
647  *
648  * Check to see if one of these outlines is totally contained within
649  * the bounding box of the other.
650  **********************************************************************/
652  TBOX box1 = blob1->bounding_box();
653  TBOX box2 = blob2->bounding_box();
654  return box1.contains(box2) || box2.contains(box1);
655 }
char flags[EDGEPTFLAGS]
Definition: blobs.h:168
TESSLINE * next
Definition: blobs.h:258
int wordrec_debug_level
Definition: wordrec.h:162
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
TPOINT pos
Definition: blobs.h:163
bool is_ending() const
Definition: unicharset.h:102
#define TRUE
Definition: capi.h:45
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:169
TPOINT start
Definition: blobs.h:255
void RebuildBestState()
Definition: pageres.cpp:800
int32_t inT32
Definition: host.h:38
bool PrepareToInsertSeam(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int insert_index, bool modify)
Definition: seam.cpp:82
bool is_beginning() const
Definition: unicharset.h:99
bool ContainedByBlob(const TBLOB &blob) const
Definition: seam.h:79
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:122
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
BlamerBundle * blamer_bundle
Definition: pageres.h:230
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
int wordrec_max_join_chunks
Definition: wordrec.h:164
int select_blob_to_split(const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:540
void Finalize()
Definition: seam.h:116
EDGEPT * prev
Definition: blobs.h:170
inT16 total_containment(TBLOB *blob1, TBLOB *blob2)
Definition: chopper.cpp:651
bool allow_blob_division
Definition: classify.h:381
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:329
bool SharesPosition(const SEAM &other) const
Definition: seam.h:95
void remove(int index)
Definition: werd.h:36
TESSLINE * outlines
Definition: blobs.h:377
T get(ICOORD pos) const
Definition: matrix.h:223
int push_back(T object)
float rating() const
Definition: ratngs.h:79
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:82
#define tprintf(...)
Definition: tprintf.h:31
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:628
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:231
void remove_edgept(EDGEPT *point)
Definition: split.cpp:208
int repair_unchopped_blobs
Definition: wordrec.h:137
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:505
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:393
int size() const
Definition: genericvector.h:72
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
float certainty() const
Definition: ratngs.h:82
int16_t inT16
Definition: host.h:36
PointerVector< LanguageModelState > beam
Definition: lm_state.h:231
EDGEPT * restore_outline(EDGEPT *start)
Definition: chopper.cpp:96
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:325
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT16 left() const
Definition: rect.h:68
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:369
MATRIX * ratings
Definition: pageres.h:215
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
Definition: blobs.h:395
uinT8 permuter() const
Definition: ratngs.h:344
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:932
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:410
void preserve_outline(EDGEPT *start)
Definition: chopper.cpp:66
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
void UndoSeam(TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:140
void insert(T t, int index)
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:389
Definition: callcpp.h:35
void Print(const char *label) const
Definition: seam.cpp:160
EDGEPT * loop
Definition: blobs.h:257
Definition: seam.h:44
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
#define FALSE
Definition: capi.h:46
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:893
inT16 x
Definition: blobs.h:71
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:215
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
EDGEPT * next
Definition: blobs.h:169
#define MAX_FLOAT32
Definition: host.h:66
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:195
Definition: blobs.h:76
bool contains(const FCOORD pt) const
Definition: rect.h:323
int bandwidth() const
Definition: matrix.h:523
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:227
double tessedit_certainty_threshold
Definition: wordrec.h:138
int length() const
Definition: genericvector.h:85
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
int check_blob(TBLOB *blob)
Definition: chopper.cpp:297
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
Struct to store information maintained by various language model components.
Definition: lm_state.h:193
int dimension() const
Definition: matrix.h:521
inT16 y
Definition: blobs.h:72
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void put(ICOORD pos, const T &thing)
Definition: matrix.h:215
Definition: matrix.h:563
Definition: blobs.h:261
DENORM denorm
Definition: pageres.h:190
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:456
Definition: blobs.h:50
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:373
int any_shared_split_points(const GenericVector< SEAM *> &seams, SEAM *seam)
Definition: chopper.cpp:281
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:352
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:215
inT16 bottom() const
Definition: rect.h:61
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:223
void RemapForSplit(int index)
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167
const UNICHARSET * uch_set
Definition: pageres.h:192
TBOX bounding_box() const
Definition: blobs.cpp:482
TWERD * chopped_word
Definition: pageres.h:201
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
bool prioritize_division
Definition: classify.h:386