tesseract  4.00.00dev
paragraphs_internal.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: paragraphs_internal.h
3  * Description: Paragraph Detection internal data structures.
4  * Author: David Eger
5  * Created: 11 March 2011
6  *
7  * (C) Copyright 2011, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
22 
23 #include "paragraphs.h"
24 #ifdef _MSC_VER
25 #include <string>
26 #else
27 #include "strings.h"
28 #endif
29 
30 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
31 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
32 
33 class WERD_CHOICE;
34 
35 namespace tesseract {
36 
37 // Return whether the given word is likely to be a list item start word.
38 bool AsciiLikelyListItem(const STRING &word);
39 
40 // Return the first Unicode Codepoint from werd[pos].
41 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
42 
43 // Set right word attributes given either a unicharset and werd or a utf8
44 // string.
// The three non-const bool pointers (is_list, starts_idea, ends_idea) are
// presumably where the results are written -- confirm against the definition
// in paragraphs.cpp.
45 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
46  const STRING &utf8,
47  bool *is_list, bool *starts_idea, bool *ends_idea);
48 
49 // Set left word attributes given either a unicharset and werd or a utf8 string.
// The three non-const bool pointers (is_list, starts_idea, ends_idea) are
// presumably where the results are written -- confirm against the definition
// in paragraphs.cpp.
50 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
51  const STRING &utf8,
52  bool *is_list, bool *starts_idea, bool *ends_idea);
53 
// Classification of a text line with respect to paragraph structure.
// Each enumerator's value is a printable ASCII letter.
54 enum LineType {
55  LT_START = 'S', // First line of a paragraph.
56  LT_BODY = 'C', // Continuation line of a paragraph.
57  LT_UNKNOWN = 'U', // No clues.
58  LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
59 };
60 
61 // The first paragraph in a page of body text is often un-indented.
62 // This is a typographic convention which is common to indicate either that:
63 // (1) The paragraph is the continuation of a previous paragraph, or
64 // (2) The paragraph is the first paragraph in a chapter.
65 //
66 // I refer to such paragraphs as "crown"s, and the output of the paragraph
67 // detection algorithm attempts to give them the same paragraph model as
68 // the rest of the body text.
69 //
70 // Nonetheless, while building hypotheses, it is useful to mark the lines
71 // of crown paragraphs temporarily as crowns, either aligned left or right.
72 extern const ParagraphModel *kCrownLeft;
73 extern const ParagraphModel *kCrownRight;
74 
// Return whether model is a real paragraph model, i.e. neither NULL nor one
// of the temporary crown markers (kCrownLeft / kCrownRight).
75 inline bool StrongModel(const ParagraphModel *model) {
76  return !(model == NULL || model == kCrownLeft || model == kCrownRight);
77 }
78 
81  LineHypothesis(LineType line_type, const ParagraphModel *m)
82  : ty(line_type), model(m) {}
84  : ty(other.ty), model(other.model) {}
85 
// Two hypotheses compare equal when both the line type and the model pointer
// match.
86  bool operator==(const LineHypothesis &other) const {
87  return !(ty != other.ty || model != other.model);
88  }
89 
92 };
93 
94 class ParagraphTheory; // Forward Declaration
95 
97 
98 // Row Scratch Registers are data generated by the paragraph detection
99 // algorithm based on a RowInfo input.
101  public:
102  // We presume row will outlive us.
103  void Init(const RowInfo &row);
104 
105  LineType GetLineType() const;
106 
107  LineType GetLineType(const ParagraphModel *model) const;
108 
109  // Mark this as a start line type, sans model. This is useful for the
110  // initial marking of probable body lines or paragraph start lines.
111  void SetStartLine();
112 
113  // Mark this as a body line type, sans model. This is useful for the
114  // initial marking of probable body lines or paragraph start lines.
115  void SetBodyLine();
116 
117  // Record that this row fits as a paragraph start line in the given model.
118  void AddStartLine(const ParagraphModel *model);
119  // Record that this row fits as a paragraph body line in the given model.
120  void AddBodyLine(const ParagraphModel *model);
121 
122  // Clear all hypotheses about this line (truncates hypotheses_ to length 0).
123  void SetUnknown() { hypotheses_.truncate(0); }
124 
125  // Append all hypotheses of strong models that match this row as a start.
126  void StartHypotheses(SetOfModels *models) const;
127 
128  // Append all hypotheses of strong models matching this row.
129  void StrongHypotheses(SetOfModels *models) const;
130 
131  // Append all hypotheses for this row.
132  void NonNullHypotheses(SetOfModels *models) const;
133 
134  // Discard any hypotheses whose model is not in the given list.
135  void DiscardNonMatchingHypotheses(const SetOfModels &models);
136 
137  // If we have only one hypothesis and that is that this line is a paragraph
138  // start line of a certain model, return that model. Else return NULL.
139  const ParagraphModel *UniqueStartHypothesis() const;
140 
141  // If we have only one hypothesis and that is that this line is a paragraph
142  // body line of a certain model, return that model. Else return NULL.
143  const ParagraphModel *UniqueBodyHypothesis() const;
144 
145  // Return the indentation for the side opposite of the aligned side.
147  switch (just) {
148  case tesseract::JUSTIFICATION_RIGHT: return lindent_;
149  case tesseract::JUSTIFICATION_LEFT: return rindent_;
150  default: return lindent_ > rindent_ ? lindent_ : rindent_;
151  }
152  }
153 
154  // Return the indentation for the side the text is aligned to.
156  switch (just) {
157  case tesseract::JUSTIFICATION_RIGHT: return rindent_;
158  case tesseract::JUSTIFICATION_LEFT: return lindent_;
159  default: return lindent_ > rindent_ ? lindent_ : rindent_;
160  }
161  }
162 
163  // Append header fields to a vector of row headings.
164  static void AppendDebugHeaderFields(GenericVector<STRING> *header);
165 
166  // Append data for this row to a vector of debug strings.
167  void AppendDebugInfo(const ParagraphTheory &theory,
168  GenericVector<STRING> *dbg) const;
169 
170  const RowInfo *ri_;
171 
172  // These four values form a horizontal box model for the white space
173  // on the edges of each line. At each point in the algorithm, the following
174  // shall hold:
175  // ri_->pix_ldistance = lmargin_ + lindent_
176  // ri_->pix_rdistance = rindent_ + rmargin_
177  int lmargin_;
178  int lindent_;
179  int rindent_;
180  int rmargin_;
181 
182  private:
183  // Hypotheses of either LT_START or LT_BODY
185 };
186 
187 // A collection of convenience functions for wrapping the set of
188 // Paragraph Models we believe correctly model the paragraphs in the image.
190  public:
191  // We presume models will outlive us, and that models will take ownership
192  // of any ParagraphModel *'s we add.
194  : models_(models) {}
// Mutable and const access to the model vector that models_ points at.
195  GenericVector<ParagraphModel *> &models() { return *models_; }
196  const GenericVector<ParagraphModel *> &models() const { return *models_; }
197 
198  // Return an existing model if one that is Comparable() can be found.
199  // Else, allocate a new copy of model to save and return a pointer to it.
200  const ParagraphModel *AddModel(const ParagraphModel &model);
201 
202  // Discard any models we've made that are not in the list of used models.
203  void DiscardUnusedModels(const SetOfModels &used_models);
204 
205  // Return the set of all non-centered models.
206  void NonCenteredModels(SetOfModels *models);
207 
208  // If any of the non-centered paragraph models we know about fit
209  // rows[start, end), return it. Else NULL.
210  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
211  int start, int end) const;
212 
213  int IndexOf(const ParagraphModel *model) const;
214 
215  private:
217  GenericVectorEqEq<ParagraphModel *> models_we_added_;
218 };
219 
221  int row, const ParagraphModel *model);
223  int row, const ParagraphModel *model);
225  int a, int b, const ParagraphModel *model);
226 
227 // A class for smearing Paragraph Model hypotheses to surrounding rows.
228 // The idea here is that StrongEvidenceClassify first marks only exceedingly
229 // obvious start and body rows and constructs models of them. Thereafter,
230 // we may have left over unmarked lines (mostly end-of-paragraph lines) which
231 // were too short to have much confidence about, but which fit the models we've
232 // constructed perfectly and which we ought to mark. This class is used to
233 // "smear" our models over the text.
235  public:
237  int row_start, int row_end,
238  ParagraphTheory *theory);
239 
240  // Smear forward paragraph models from existing row markings to subsequent
241  // text lines if they fit, and mark any thereafter still unmodeled rows
242  // with any model in the theory that fits them.
243  void Smear();
244 
245  private:
246  // Record in open_models_ for rows [row_start, row_end) the list of models
247  // currently open at each row.
248  // A model is still open in a row if some previous row has said model as a
249  // start hypothesis, and all rows since (including this row) would fit as
250  // either a body or start line in that model.
251  void CalculateOpenModels(int row_start, int row_end);
252 
// Return the open-model set stored for the given row. Slot 0 of open_models_
// belongs to row (row_start_ - 1), hence the offset of one.
253  SetOfModels &OpenModels(int row) {
254  return open_models_[1 + (row - row_start_)];
255  }
256 
257  ParagraphTheory *theory_;
259  int row_start_;
260  int row_end_;
261 
262  // open_models_ corresponds to rows[row_start_ - 1, row_end_]
263  //
264  // open_models_: Contains models for which there was an active (open) paragraph
265  // as of the previous line and for which the left and right
266  // indents admit the possibility that this text line continues
267  // to fit the same model.
268  // TODO(eger): Think about whether we can get rid of "Open" models and just
269  // use the current hypotheses on RowScratchRegisters.
270  GenericVector<SetOfModels> open_models_;
271 };
272 
273 // Clear all hypotheses about lines [start, end) and reset the margins to the
274 // percentile (0..100) value of the left and right row edges for this run of
275 // rows.
277  GenericVector<RowScratchRegisters> *rows, int start, int end,
278  int percentile);
279 
280 // Return the median inter-word space in rows[row_start, row_end).
282  int row_start, int row_end);
283 
284 // Return whether the first word on the after line can fit in the space at
285 // the end of the before line (knowing which way the text is aligned and read).
286 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
287  const RowScratchRegisters &after,
288  tesseract::ParagraphJustification justification);
289 
290 // Return whether the first word on the after line can fit in the space at
291 // the end of the before line (not knowing the text alignment).
292 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
293  const RowScratchRegisters &after);
294 
295 // Do rows[start, end) form a single instance of the given paragraph model?
297  int start, int end, const ParagraphModel *model);
298 
299 // Do the text and geometry of two rows support a paragraph break between them?
300 bool LikelyParagraphStart(const RowScratchRegisters &before,
301  const RowScratchRegisters &after,
303 
304 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
305 // normalize each row_owner to point to an actual PARA, and output the
306 // paragraphs in order onto paragraphs.
308  GenericVector<PARA *> *row_owners,
309  PARA_LIST *paragraphs);
310 
311 } // namespace
312 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
double u[max]
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:268
bool StrongModel(const ParagraphModel *model)
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:48
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
GenericVector< ParagraphModel * > & models()
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:275
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:442
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
LineHypothesis(const LineHypothesis &other)
GenericVectorEqEq< const ParagraphModel * > SetOfModels
const ParagraphModel * model
const GenericVector< ParagraphModel * > & models() const
Definition: strngs.h:45
ParagraphTheory(GenericVector< ParagraphModel *> *models)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool operator==(const LineHypothesis &other) const
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
int OffsideIndent(tesseract::ParagraphJustification just) const
int AlignsideIndent(tesseract::ParagraphJustification just) const
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
ParagraphJustification
Definition: publictypes.h:239
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:395
LineHypothesis(LineType line_type, const ParagraphModel *m)
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:46
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
void CanonicalizeDetectionResults(GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs)