tesseract  4.00.00dev
wordrec.h
Go to the documentation of this file.
1 // File: wordrec.h
3 // Description: wordrec class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_WORDREC_WORDREC_H_
20 #define TESSERACT_WORDREC_WORDREC_H_
21 
22 #include "associate.h"
23 #include "classify.h"
24 #include "dict.h"
25 #include "language_model.h"
26 #include "ratngs.h"
27 #include "matrix.h"
28 #include "gradechop.h"
29 #include "seam.h"
30 #include "findseam.h"
31 #include "callcpp.h"
32 
33 class WERD_RES;
34 
35 namespace tesseract {
36 
37 // A class for storing which nodes are to be processed by the segmentation
38 // search. There is a single SegSearchPending for each column in the ratings
39 // matrix, and it indicates whether the segsearch should combine all
40 // BLOB_CHOICES in the column, or just the given row with the parents
41 // corresponding to *this SegSearchPending, and whether only updated parent
42 // ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
44  public:
46  : classified_row_(-1),
47  revisit_whole_column_(false),
48  column_classified_(false) {}
49 
50  // Marks the whole column as just classified. Used to start a search on
51  // a newly initialized ratings matrix.
53  column_classified_ = true;
54  }
55  // Marks the matrix entry at the given row as just classified.
56  // Used after classifying a new matrix cell.
57  // Additional to, not overriding a previous RevisitWholeColumn.
58  void SetBlobClassified(int row) {
59  classified_row_ = row;
60  }
61  // Marks the whole column as needing work, but not just classified.
62  // Used when the parent vse list is updated.
63  // Additional to, not overriding a previous SetBlobClassified.
65  revisit_whole_column_ = true;
66  }
67 
68  // Clears *this to indicate no work to do.
69  void Clear() {
70  classified_row_ = -1;
71  revisit_whole_column_ = false;
72  column_classified_ = false;
73  }
74 
75  // Returns true if there are updates to do in the column that *this
76  // represents.
77  bool WorkToDo() const {
78  return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
79  }
80  // Returns true if the given row was just classified.
81  bool IsRowJustClassified(int row) const {
82  return row == classified_row_ || column_classified_;
83  }
84  // Returns the single row to process if there is only one, otherwise -1.
85  int SingleRow() const {
86  return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;
87  }
88 
89  private:
90  // If non-negative, indicates the single row in the ratings matrix that has
91  // just been classified, and so should be combined with all the parents in the
92  // column that this SegSearchPending represents.
93  // Operates independently of revisit_whole_column.
94  int classified_row_;
95  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
96  // be processed, but classified_row can indicate a row that is newly
97  // classified. Overridden if column_classified is true.
98  bool revisit_whole_column_;
99  // If column_classified is true, parent vses are processed with all rows
100  // regardless of whether they are just updated, overriding
101  // revisit_whole_column and classified_row.
102  bool column_classified_;
103 };
104 
105 
106 /* ccmain/tstruct.cpp *********************************************************/
107 class FRAGMENT:public ELIST_LINK
108 {
109  public:
110  FRAGMENT() { //constructor
111  }
112  FRAGMENT(EDGEPT *head_pt, //start
113  EDGEPT *tail_pt); //end
114 
115  ICOORD head; //coords of start
116  ICOORD tail; //coords of end
117  EDGEPT *headpt; //start point
118  EDGEPT *tailpt; //end point
119 };
121 
122 
123 class Wordrec : public Classify {
124  public:
125  // config parameters *******************************************************
126  BOOL_VAR_H(merge_fragments_in_matrix, TRUE,
127  "Merge the fragments in the ratings matrix and delete them "
128  "after merging");
129  BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
130  BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
131  BOOL_VAR_H(force_word_assoc, FALSE,
132  "force associator to run regardless of what enable_assoc is."
133  "This is used for CJK where component grouping is necessary.");
134  double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
135  BOOL_VAR_H(fragments_guide_chopper, FALSE,
136  "Use information from fragments to guide chopping process");
137  INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
138  double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
139  INT_VAR_H(chop_debug, 0, "Chop debug");
140  BOOL_VAR_H(chop_enable, 1, "Chop enable");
141  BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
142  INT_VAR_H(chop_split_length, 10000, "Split Length");
143  INT_VAR_H(chop_same_distance, 2, "Same distance");
144  INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
145  INT_VAR_H(chop_seam_pile_size, 150, "Max number of seams in seam_pile");
146  BOOL_VAR_H(chop_new_seam_pile, 1, "Use new seam_pile");
147  INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
148  INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
149  double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
150  double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
151  double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
152  INT_VAR_H(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
153  "above which we don't care that a chop is not near the center.");
154  double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
155  double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
156  double_VAR_H(chop_ok_split, 100.0, "OK split limit");
157  double_VAR_H(chop_good_split, 50.0, "Good split limit");
158  INT_VAR_H(chop_x_y_weight, 3, "X / Y length weight");
159  INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
160  BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
161  "include fixed-pitch heuristics in char segmentation");
162  INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
163  INT_VAR_H(wordrec_max_join_chunks, 4,
164  "Max number of broken pieces to associate");
165  BOOL_VAR_H(wordrec_skip_no_truth_words, false,
166  "Only run OCR for words that had truth recorded in BlamerBundle");
167  BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages");
168  BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors");
169  INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
170  INT_VAR_H(segsearch_max_pain_points, 2000,
171  "Maximum number of pain points stored in the queue");
172  INT_VAR_H(segsearch_max_futile_classifications, 10,
173  "Maximum number of pain point classifications per word.");
174  double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
175  "Maximum character width-to-height ratio");
176  BOOL_VAR_H(save_alt_choices, true,
177  "Save alternative paths found during chopping "
178  "and segmentation search");
179 
180  // methods from wordrec/*.cpp ***********************************************
181  Wordrec();
182  virtual ~Wordrec();
183 
184  // Fills word->alt_choices with alternative paths found during
185  // chopping/segmentation search that are kept in best_choices.
186  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
187 
188  // Fills character choice lattice in the given BlamerBundle
189  // using the given ratings matrix and best choice list.
190  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
191  const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
192 
193  // Calls fill_lattice_ member function
194  // (assumes that fill_lattice_ is not NULL).
195  void CallFillLattice(const MATRIX &ratings,
196  const WERD_CHOICE_LIST &best_choices,
197  const UNICHARSET &unicharset,
198  BlamerBundle *blamer_bundle) {
199  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
200  }
201 
202  // tface.cpp
203  void program_editup(const char *textbase, TessdataManager *init_classifier,
204  TessdataManager *init_dict);
205  void cc_recog(WERD_RES *word);
206  void program_editdown(inT32 elasped_time);
207  void set_pass1();
208  void set_pass2();
209  int end_recog();
210  BLOB_CHOICE_LIST *call_matcher(TBLOB* blob);
211  int dict_word(const WERD_CHOICE &word);
212  // wordclass.cpp
213  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
214  const char *string,
215  C_COL color,
216  BlamerBundle *blamer_bundle);
217 
218  // segsearch.cpp
219  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
220  // Each entry in the matrix represents the classification choice
221  // for a chunk, i.e. an entry in row 2, column 1 represents the list
222  // of ratings for the chunks 1 and 2 classified as a single blob.
223  // The entries on the diagonal of the matrix are classifier choice lists
224  // for a single chunk from the maximal segmentation.
225  //
226  // The ratings matrix given to SegSearch represents the segmentation
227  // graph / trellis for the current word. The nodes in the graph are the
228  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
229  // matrix. The children of each node (nodes connected by outgoing links)
230  // are the entries in the column that is equal to node's row+1. The parents
231  // (nodes connected by the incoming links) are the entries in the row that
232  // is equal to the node's column-1. Here is an example ratings matrix:
233  //
234  // 0 1 2 3 4
235  // -------------------------
236  // 0| c,( |
237  // 1| d l,1 |
238  // 2| o |
239  // 3| c,( |
240  // 4| g,y l,1 |
241  // -------------------------
242  //
243  // In the example above node "o" has children (outgoing connection to nodes)
244  // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
245  //
246  // The objective of the search is to find the least cost path, where the cost
247  // is determined by the language model components and the properties of the
248  // cut between the blobs on the path. SegSearch starts by populating the
249  // matrix with the all the entries that were classified by the chopper and
250  // finding the initial best path. Based on the classifier ratings, language
251  // model scores and the properties of each cut, a list of "pain points" is
252  // constructed - those are the points on the path where the choices do not
253  // look consistent with the neighboring choices, the cuts look particularly
254  // problematic, or the certainties of the blobs are low. The most troublesome
255  // "pain point" is picked from the list and the new entry in the ratings
256  // matrix corresponding to this "pain point" is filled in. Then the language
257  // model state is updated to reflect the new classification and the new
258  // "pain points" are added to the list and the next most troublesome
259  // "pain point" is determined. This continues until either the word choice
260  // composed from the best paths in the segmentation graph is "good enough"
261  // (e.g. above a certain certainty threshold, is an unambiguous dictionary
262  // word, etc) or there are no more "pain points" to explore.
263  //
264  // If associate_blobs is set to false no new classifications will be done
265  // to combine blobs. Segmentation search will run only one "iteration"
266  // on the classifications already recorded in chunks_record.ratings.
267  //
268  // Note: this function assumes that word_res, best_choice_bundle arguments
269  // are not NULL.
270  void SegSearch(WERD_RES* word_res,
271  BestChoiceBundle* best_choice_bundle,
272  BlamerBundle* blamer_bundle);
273  // Setup and run just the initial segsearch on an established matrix,
274  // without doing any additional chopping or joining.
275  void WordSearch(WERD_RES* word_res);
276 
277  // Setup and run just the initial segsearch on an established matrix,
278  // without doing any additional chopping or joining.
279  // (Internal factored version that can be used as part of the main SegSearch.)
280  void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
282  BestChoiceBundle* best_choice_bundle,
283  BlamerBundle* blamer_bundle);
284 
285  // Runs SegSearch() function (above) without needing a best_choice_bundle
286  // or blamer_bundle. Used for testing.
287  void DoSegSearch(WERD_RES* word_res);
288 
289  // chop.cpp
290  PRIORITY point_priority(EDGEPT *point);
291  void add_point_to_list(PointHeap* point_heap, EDGEPT *point);
292  // Returns true if the edgept supplied as input is an inside angle. This
293  // is determined by the angular change of the vectors from point to point.
294  bool is_inside_angle(EDGEPT *pt);
295  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
296  EDGEPT *pick_close_point(EDGEPT *critical_point,
297  EDGEPT *vertical_point,
298  int *best_dist);
299  void prioritize_points(TESSLINE *outline, PointHeap* points);
300  void new_min_point(EDGEPT *local_min, PointHeap* points);
301  void new_max_point(EDGEPT *local_max, PointHeap* points);
302  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
303  EDGEPT** best_point,
304  EDGEPT_CLIST *new_points);
305 
306  // chopper.cpp
307  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
308  bool italic_blob, const GenericVector<SEAM*>& seams);
309  SEAM *chop_numbered_blob(TWERD *word, inT32 blob_number,
310  bool italic_blob, const GenericVector<SEAM*>& seams);
311  SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes,
312  bool italic_blob,
313  WERD_RES *word_res, int *blob_number);
314  SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE*> &blob_choices,
315  DANGERR *fixpt,
316  bool split_next_to_fragment,
317  bool italic_blob,
318  WERD_RES *word,
319  int *blob_number);
320  SEAM *chop_one_blob(const GenericVector<TBOX> &boxes,
321  const GenericVector<BLOB_CHOICE*> &blob_choices,
322  WERD_RES *word_res,
323  int *blob_number);
324  void chop_word_main(WERD_RES *word);
325  void improve_by_chopping(float rating_cert_scale,
326  WERD_RES *word,
327  BestChoiceBundle *best_choice_bundle,
328  BlamerBundle *blamer_bundle,
329  LMPainPoints *pain_points,
331  int select_blob_to_split(const GenericVector<BLOB_CHOICE*> &blob_choices,
332  float rating_ceiling,
333  bool split_next_to_fragment);
334  int select_blob_to_split_from_fixpt(DANGERR *fixpt);
335 
336  // findseam.cpp
337  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams);
338  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split,
339  PRIORITY priority, SEAM **seam_result, TBLOB *blob,
340  SeamPile *seam_pile);
341  void combine_seam(const SeamPile& seam_pile,
342  const SEAM* seam, SeamQueue* seam_queue);
343  SEAM *pick_good_seam(TBLOB *blob);
344  void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS],
345  inT16 num_points,
346  SeamQueue* seam_queue,
347  SeamPile* seam_pile,
348  SEAM ** seam, TBLOB * blob);
349  void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS],
350  inT16 num_points,
351  EDGEPT_CLIST *new_points,
352  SeamQueue* seam_queue,
353  SeamPile* seam_pile,
354  SEAM ** seam, TBLOB * blob);
355 
356  // gradechop.cpp
357  PRIORITY grade_split_length(register SPLIT *split);
358  PRIORITY grade_sharpness(register SPLIT *split);
359 
360  // outlines.cpp
361  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1,
362  EDGEPT **near_pt);
363 
364  // pieces.cpp
365  virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM*>& seams,
366  inT16 start,
367  inT16 end,
368  const char* description,
369  TWERD *word,
370  BlamerBundle *blamer_bundle);
371  // Try to merge fragments in the ratings matrix and put the result in
372  // the corresponding row and column
373  void merge_fragments(MATRIX *ratings,
374  inT16 num_blobs);
375  // Recursively go through the ratings matrix to find lists of fragments
376  // to be merged in the function merge_and_put_fragment_lists.
377  // current_frag is the position of the piece we are looking for.
378  // current_row is the row in the rating matrix we are currently at.
379  // start is the row we started initially, so that we can know where
380  // to append the results to the matrix. num_frag_parts is the total
381  // number of pieces we are looking for and num_blobs is the size of the
382  // ratings matrix.
383  void get_fragment_lists(inT16 current_frag,
384  inT16 current_row,
385  inT16 start,
386  inT16 num_frag_parts,
387  inT16 num_blobs,
388  MATRIX *ratings,
389  BLOB_CHOICE_LIST *choice_lists);
390  // Merge the fragment lists in choice_lists and append it to the
391  // ratings matrix
392  void merge_and_put_fragment_lists(inT16 row,
393  inT16 column,
394  inT16 num_frag_parts,
395  BLOB_CHOICE_LIST *choice_lists,
396  MATRIX *ratings);
397  // Filter the fragment list so that the filtered_choices only contain
398  // fragments that are in the correct position. choices is the list
399  // that we are going to filter. fragment_pos is the position in the
400  // fragment that we are looking for and num_frag_parts is the the
401  // total number of pieces. The result will be appended to
402  // filtered_choices.
403  void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
404  int fragment_pos,
405  int num_frag_parts,
406  BLOB_CHOICE_LIST *filtered_choices);
407 
408  // Member variables.
409 
412  // Stores the best choice for the previous word in the paragraph.
413  // This variable is modified by PAGE_RES_IT when iterating over
414  // words to OCR on the page.
416  // Sums of blame reasons computed by the blamer.
418  // Function used to fill char choice lattices.
419  void (Wordrec::*fill_lattice_)(const MATRIX &ratings,
420  const WERD_CHOICE_LIST &best_choices,
421  const UNICHARSET &unicharset,
422  BlamerBundle *blamer_bundle);
423 
424  protected:
425  inline bool SegSearchDone(int num_futile_classifications) {
426  return (language_model_->AcceptableChoiceFound() ||
427  num_futile_classifications >=
428  segsearch_max_futile_classifications);
429  }
430 
431  // Updates the language model state recorded for the child entries specified
432  // in pending[starting_col]. Enqueues the children of the updated entries
433  // into pending and proceeds to update (and remove from pending) all the
434  // remaining entries in pending[col] (col >= starting_col). Upon termination
435  // of this function all the pending[col] lists will be empty.
436  //
437  // The arguments:
438  //
439  // starting_col: index of the column in chunks_record->ratings from
440  // which the update should be started
441  //
442  // pending: list of entries listing chunks_record->ratings entries
443  // that should be updated
444  //
445  // pain_points: priority heap listing the pain points generated by
446  // the language model
447  //
448  // temp_pain_points: temporary storage for tentative pain points generated
449  // by the language model after a single call to LanguageModel::UpdateState()
450  // (the argument is passed in rather than created before each
451  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
452  //
453  // best_choice_bundle: a collection of variables that should be updated
454  // if a new best choice is found
455  //
456  void UpdateSegSearchNodes(
457  float rating_cert_scale,
458  int starting_col,
460  WERD_RES *word_res,
461  LMPainPoints *pain_points,
462  BestChoiceBundle *best_choice_bundle,
463  BlamerBundle *blamer_bundle);
464 
465  // Process the given pain point: classify the corresponding blob, enqueue
466  // new pain points to join the newly classified blob with its neighbors.
467  void ProcessSegSearchPainPoint(float pain_point_priority,
468  const MATRIX_COORD &pain_point,
469  const char* pain_point_type,
471  WERD_RES *word_res,
472  LMPainPoints *pain_points,
473  BlamerBundle *blamer_bundle);
474  // Resets enough of the results so that the Viterbi search is re-run.
475  // Needed when the n-gram model is enabled, as the multi-length comparison
476  // implementation will re-value existing paths to worse values.
477  void ResetNGramSearch(WERD_RES* word_res,
478  BestChoiceBundle* best_choice_bundle,
480 
481  // Add pain points for classifying blobs on the correct segmentation path
482  // (so that we can evaluate correct segmentation path and discover the reason
483  // for incorrect result).
484  void InitBlamerForSegSearch(WERD_RES *word_res,
485  LMPainPoints *pain_points,
486  BlamerBundle *blamer_bundle,
487  STRING *blamer_debug);
488 };
489 
490 
491 } // namespace tesseract
492 
493 #endif // TESSERACT_WORDREC_WORDREC_H_
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:425
#define TRUE
Definition: capi.h:45
int32_t inT32
Definition: host.h:38
C_COL
Definition: callcpp.h:32
PRIORITY pass2_ok_split
Definition: wordrec.h:411
int16_t inT16
Definition: host.h:36
EDGEPT * headpt
Definition: wordrec.h:117
EDGEPT * tailpt
Definition: wordrec.h:118
Definition: blobs.h:395
#define MAX_NUM_POINTS
Definition: chop.h:39
float PRIORITY
Definition: seam.h:42
bool IsRowJustClassified(int row) const
Definition: wordrec.h:81
Definition: seam.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
#define double_VAR_H(name, val, comment)
Definition: params.h:273
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:215
void SetBlobClassified(int row)
Definition: wordrec.h:58
bool WorkToDo() const
Definition: wordrec.h:77
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:195
Definition: blobs.h:76
Definition: matrix.h:563
Definition: blobs.h:261
ELISTIZEH(AmbigSpec)
GenericVector< int > blame_reasons_
Definition: wordrec.h:417
#define INT_VAR_H(name, val, comment)
Definition: params.h:264
LanguageModel * language_model_
Definition: wordrec.h:410
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:66
Definition: split.h:37
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415
integer coordinate
Definition: points.h:30