paragraphs.cpp
1 /**********************************************************************
2  * File: paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author: David Eger
5  * Created: 25 February 2011
6  *
7  * (C) Copyright 2011, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #ifdef _MSC_VER
20 #define __func__ __FUNCTION__
21 #endif
22 
23 #include <ctype.h>
24 #include <memory> // std::unique_ptr
25 
26 #include "genericvector.h"
27 #include "helpers.h"
28 #include "mutableiterator.h"
29 #include "ocrpara.h"
30 #include "pageres.h"
31 #include "paragraphs.h"
32 #include "paragraphs_internal.h"
33 #include "publictypes.h"
34 #include "ratngs.h"
35 #include "rect.h"
36 #include "statistc.h"
37 #include "strngs.h"
38 #include "tprintf.h"
39 #include "unicharset.h"
40 #include "unicodes.h"
41 
42 namespace tesseract {
43 
44 // Special "weak" ParagraphModels.
45 const ParagraphModel *kCrownLeft
46  = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
47 const ParagraphModel *kCrownRight
48  = reinterpret_cast<ParagraphModel *>(0xDEAD888F);
49 
50 // Given the width of a typical space between words, what is the threshold
51 // by which we think left and right alignments for paragraphs
52 // can vary and still be aligned.
53 static int Epsilon(int space_pix) {
54  return space_pix * 4 / 5;
55 }
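// Worked example (hypothetical numbers, editor's sketch): with a typical
// interword space of 10 pixels, Epsilon(10) == 8, so two paragraph edges may
// differ by up to 8 pixels and still be treated as aligned.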
56 
57 static bool AcceptableRowArgs(
58  int debug_level, int min_num_rows, const char *function_name,
59  const GenericVector<RowScratchRegisters> *rows,
60  int row_start, int row_end) {
61  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
62  tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
63  row_start, row_end, rows->size());
64  return false;
65  }
66  if (row_end - row_start < min_num_rows) {
67  if (debug_level > 1) {
68  tprintf("# Too few rows[%d, %d) for %s.\n",
69  row_start, row_end, function_name);
70  }
71  return false;
72  }
73  return true;
74 }
75 
76 // =============================== Debug Code ================================
77 
78 // Convert an integer to a decimal string.
79 static STRING StrOf(int num) {
80  char buffer[30];
81  snprintf(buffer, sizeof(buffer), "%d", num);
82  return STRING(buffer);
83 }
84 
85 // Given a row-major matrix of unicode text and a column separator, print
86 // a formatted table. For ASCII, we get good column alignment.
87 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
88  const STRING &colsep) {
89  GenericVector<int> max_col_widths;
90  for (int r = 0; r < rows.size(); r++) {
91  int num_columns = rows[r].size();
92  for (int c = 0; c < num_columns; c++) {
93  int num_unicodes = 0;
94  for (int i = 0; i < rows[r][c].size(); i++) {
95  if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
96  }
97  if (c >= max_col_widths.size()) {
98  max_col_widths.push_back(num_unicodes);
99  } else {
100  if (num_unicodes > max_col_widths[c])
101  max_col_widths[c] = num_unicodes;
102  }
103  }
104  }
105 
106  GenericVector<STRING> col_width_patterns;
107  for (int c = 0; c < max_col_widths.size(); c++) {
108  col_width_patterns.push_back(
109  STRING("%-") + StrOf(max_col_widths[c]) + "s");
110  }
111 
112  for (int r = 0; r < rows.size(); r++) {
113  for (int c = 0; c < rows[r].size(); c++) {
114  if (c > 0)
115  tprintf("%s", colsep.string());
116  tprintf(col_width_patterns[c].string(), rows[r][c].string());
117  }
118  tprintf("\n");
119  }
120 }
121 
122 STRING RtlEmbed(const STRING &word, bool rtlify) {
123  if (rtlify)
124  return STRING(kRLE) + word + STRING(kPDF);
125  return word;
126 }
127 
128 // Print the current thoughts of the paragraph detector.
129 static void PrintDetectorState(const ParagraphTheory &theory,
130  const GenericVector<RowScratchRegisters> &rows) {
131  GenericVector<GenericVector<STRING> > output;
132  output.push_back(GenericVector<STRING>());
133  output.back().push_back("#row");
134  output.back().push_back("space");
135  output.back().push_back("..");
136  output.back().push_back("lword[widthSEL]");
137  output.back().push_back("rword[widthSEL]");
138  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
139  output.back().push_back("text");
140 
141  for (int i = 0; i < rows.size(); i++) {
142  output.push_back(GenericVector<STRING>());
143  GenericVector<STRING> &row = output.back();
144  const RowInfo& ri = *rows[i].ri_;
145  row.push_back(StrOf(i));
146  row.push_back(StrOf(ri.average_interword_space));
147  row.push_back(ri.has_leaders ? ".." : " ");
148  row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
149  "[" + StrOf(ri.lword_box.width()) +
150  (ri.lword_likely_starts_idea ? "S" : "s") +
151  (ri.lword_likely_ends_idea ? "E" : "e") +
152  (ri.lword_indicates_list_item ? "L" : "l") +
153  "]");
154  row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
155  "[" + StrOf(ri.rword_box.width()) +
156  (ri.rword_likely_starts_idea ? "S" : "s") +
157  (ri.rword_likely_ends_idea ? "E" : "e") +
158  (ri.rword_indicates_list_item ? "L" : "l") +
159  "]");
160  rows[i].AppendDebugInfo(theory, &row);
161  row.push_back(RtlEmbed(ri.text, !ri.ltr));
162  }
163  PrintTable(output, " ");
164 
165  tprintf("Active Paragraph Models:\n");
166  for (int m = 0; m < theory.models().size(); m++) {
167  tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
168  }
169 }
170 
171 static void DebugDump(
172  bool should_print,
173  const STRING &phase,
174  const ParagraphTheory &theory,
175  const GenericVector<RowScratchRegisters> &rows) {
176  if (!should_print)
177  return;
178  tprintf("# %s\n", phase.string());
179  PrintDetectorState(theory, rows);
180 }
181 
182 // Print out the text for rows[row_start, row_end)
183 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
184  int row_start, int row_end) {
185  tprintf("======================================\n");
186  for (int row = row_start; row < row_end; row++) {
187  tprintf("%s\n", rows[row].ri_->text.string());
188  }
189  tprintf("======================================\n");
190 }
191 
192 // ============= Brain Dead Language Model (ASCII Version) ===================
193 
194 bool IsLatinLetter(int ch) {
195  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
196 }
197 
198 bool IsDigitLike(int ch) {
199  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
200 }
201 
202 bool IsOpeningPunct(int ch) {
203  return strchr("'\"({[", ch) != NULL;
204 }
205 
206 bool IsTerminalPunct(int ch) {
207  return strchr(":'\".?!]})", ch) != NULL;
208 }
209 
210 // Return a pointer after consuming as much text as qualifies as roman numeral.
211 const char *SkipChars(const char *str, const char *toskip) {
212  while (*str != '\0' && strchr(toskip, *str)) { str++; }
213  return str;
214 }
215 
216 const char *SkipChars(const char *str, bool (*skip)(int)) {
217  while (*str != '\0' && skip(*str)) { str++; }
218  return str;
219 }
220 
221 const char *SkipOne(const char *str, const char *toskip) {
222  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
223  return str;
224 }
225 
226 // Return whether it is very likely that this is a numeral marker that could
227 // start a list item. Some examples include:
228 // A I iii. VI (2) 3.5. [C-4]
229 bool LikelyListNumeral(const STRING &word) {
230  const char *kRomans = "ivxlmdIVXLMD";
231  const char *kDigits = "0123456789";
232  const char *kOpen = "[{(";
233  const char *kSep = ":;-.,";
234  const char *kClose = "]})";
235 
236  int num_segments = 0;
237  const char *pos = word.string();
238  while (*pos != '\0' && num_segments < 3) {
239  // skip up to two open parens.
240  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
241  const char *numeral_end = SkipChars(numeral_start, kRomans);
242  if (numeral_end != numeral_start) {
243  // Got Roman Numeral. Great.
244  } else {
245  numeral_end = SkipChars(numeral_start, kDigits);
246  if (numeral_end == numeral_start) {
247  // If there's a single latin letter, we can use that.
248  numeral_end = SkipChars(numeral_start, IsLatinLetter);
249  if (numeral_end - numeral_start != 1)
250  break;
251  }
252  }
253  // We got some sort of numeral.
254  num_segments++;
255  // Skip any trailing parens or punctuation.
256  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
257  if (pos == numeral_end)
258  break;
259  }
260  return *pos == '\0';
261 }
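// Hypothetical usage sketch (editor's illustration, values chosen for clarity):
//
//   LikelyListNumeral(STRING("(2)"));    // true: open paren, digit, close paren
//   LikelyListNumeral(STRING("iv."));    // true: roman numeral plus separator
//   LikelyListNumeral(STRING("Hello"));  // false: more than one latin letter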
262 
263 bool LikelyListMark(const STRING &word) {
264  const char *kListMarks = "0Oo*.,+.";
265  return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
266 }
267 
268 bool AsciiLikelyListItem(const STRING &word) {
269  return LikelyListMark(word) || LikelyListNumeral(word);
270 }
271 
272 // ========== Brain Dead Language Model (Tesseract Version) ================
273 
274 // Return the first Unicode Codepoint from werd[pos].
275 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
276  if (!u || !werd || pos > werd->length())
277  return 0;
278  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
279 }
280 
281 // A useful helper class for finding the first j >= i so that word[j]
282 // does not have the given character type.
283 class UnicodeSpanSkipper {
284  public:
285  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
286  : u_(unicharset), word_(word) { wordlen_ = word->length(); }
287 
288  // Given an input position, return the first position >= pos not punc.
289  int SkipPunc(int pos);
290  // Given an input position, return the first position >= pos not digit.
291  int SkipDigits(int pos);
292  // Given an input position, return the first position >= pos not roman.
293  int SkipRomans(int pos);
294  // Given an input position, return the first position >= pos not alpha.
295  int SkipAlpha(int pos);
296 
297  private:
298  const UNICHARSET *u_;
299  const WERD_CHOICE *word_;
300  int wordlen_;
301 };
302 
303 int UnicodeSpanSkipper::SkipPunc(int pos) {
304  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
305  return pos;
306 }
307 
308 int UnicodeSpanSkipper::SkipDigits(int pos) {
309  while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
310  IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
311  return pos;
312 }
313 
314 int UnicodeSpanSkipper::SkipRomans(int pos) {
315  const char *kRomans = "ivxlmdIVXLMD";
316  while (pos < wordlen_) {
317  int ch = UnicodeFor(u_, word_, pos);
318  if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break;
319  pos++;
320  }
321  return pos;
322 }
323 
324 int UnicodeSpanSkipper::SkipAlpha(int pos) {
325  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
326  return pos;
327 }
328 
329 bool LikelyListMarkUnicode(int ch) {
330  if (ch < 0x80) {
331  STRING single_ch;
332  single_ch += ch;
333  return LikelyListMark(single_ch);
334  }
335  switch (ch) {
336  // TODO(eger) expand this list of unicodes as needed.
337  case 0x00B0: // degree sign
338  case 0x2022: // bullet
339  case 0x25E6: // white bullet
340  case 0x00B7: // middle dot
341  case 0x25A1: // white square
342  case 0x25A0: // black square
343  case 0x25AA: // black small square
344  case 0x2B1D: // black very small square
345  case 0x25BA: // black right-pointing pointer
346  case 0x25CF: // black circle
347  case 0x25CB: // white circle
348  return true;
349  default:
350  break; // fall through
351  }
352  return false;
353 }
354 
355 // Return whether it is very likely that this is a numeral marker that could
356 // start a list item. Some examples include:
357 // A I iii. VI (2) 3.5. [C-4]
358 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
359  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
360  return true;
361 
362  UnicodeSpanSkipper m(u, werd);
363  int num_segments = 0;
364  int pos = 0;
365  while (pos < werd->length() && num_segments < 3) {
366  int numeral_start = m.SkipPunc(pos);
367  if (numeral_start > pos + 1) break;
368  int numeral_end = m.SkipRomans(numeral_start);
369  if (numeral_end == numeral_start) {
370  numeral_end = m.SkipDigits(numeral_start);
371  if (numeral_end == numeral_start) {
372  // If there's a single latin letter, we can use that.
373  numeral_end = m.SkipAlpha(numeral_start);
374  if (numeral_end - numeral_start != 1)
375  break;
376  }
377  }
378  // We got some sort of numeral.
379  num_segments++;
380  // Skip any trailing punctuation.
381  pos = m.SkipPunc(numeral_end);
382  if (pos == numeral_end)
383  break;
384  }
385  return pos == werd->length();
386 }
387 
388 // ========= Brain Dead Language Model (combined entry points) ================
389 
390 // Given the leftmost word of a line either as a Tesseract unicharset + werd
391 // or a utf8 string, set the following attributes for it:
392 // is_list - this word might be a list number or bullet.
393 // starts_idea - this word is likely to start a sentence.
394 // ends_idea - this word is likely to end a sentence.
395 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
396  const STRING &utf8,
397  bool *is_list, bool *starts_idea, bool *ends_idea) {
398  *is_list = false;
399  *starts_idea = false;
400  *ends_idea = false;
401  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
402  *ends_idea = true;
403  return;
404  }
405 
406  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
407  if (UniLikelyListItem(unicharset, werd)) {
408  *is_list = true;
409  *starts_idea = true;
410  *ends_idea = true;
411  }
412  if (unicharset->get_isupper(werd->unichar_id(0))) {
413  *starts_idea = true;
414  }
415  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
416  *starts_idea = true;
417  *ends_idea = true;
418  }
419  } else { // Assume utf8 is mostly ASCII
420  if (AsciiLikelyListItem(utf8)) {
421  *is_list = true;
422  *starts_idea = true;
423  }
424  int start_letter = utf8[0];
425  if (IsOpeningPunct(start_letter)) {
426  *starts_idea = true;
427  }
428  if (IsTerminalPunct(start_letter)) {
429  *ends_idea = true;
430  }
431  if (start_letter >= 'A' && start_letter <= 'Z') {
432  *starts_idea = true;
433  }
434  }
435 }
436 
437 // Given the rightmost word of a line either as a Tesseract unicharset + werd
438 // or a utf8 string, set the following attributes for it:
439 // is_list - this word might be a list number or bullet.
440 // starts_idea - this word is likely to start a sentence.
441 // ends_idea - this word is likely to end a sentence.
442 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
443  const STRING &utf8,
444  bool *is_list, bool *starts_idea, bool *ends_idea) {
445  *is_list = false;
446  *starts_idea = false;
447  *ends_idea = false;
448  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
449  *ends_idea = true;
450  return;
451  }
452 
453  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
454  if (UniLikelyListItem(unicharset, werd)) {
455  *is_list = true;
456  *starts_idea = true;
457  }
458  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
459  if (unicharset->get_ispunctuation(last_letter)) {
460  *ends_idea = true;
461  }
462  } else { // Assume utf8 is mostly ASCII
463  if (AsciiLikelyListItem(utf8)) {
464  *is_list = true;
465  *starts_idea = true;
466  }
467  int last_letter = utf8[utf8.size() - 1];
468  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
469  *ends_idea = true;
470  }
471  }
472 }
473 
474 // =============== Implementation of RowScratchRegisters =====================
475 /* static */
476 void RowScratchRegisters::AppendDebugHeaderFields(
477  GenericVector<STRING> *header) {
478  header->push_back("[lmarg,lind;rind,rmarg]");
479  header->push_back("model");
480 }
481 
482 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
483  GenericVector<STRING> *dbg) const {
484  char s[30];
485  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
486  lmargin_, lindent_, rindent_, rmargin_);
487  dbg->push_back(s);
488  STRING model_string;
489  model_string += static_cast<char>(GetLineType());
490  model_string += ":";
491 
492  int model_numbers = 0;
493  for (int h = 0; h < hypotheses_.size(); h++) {
494  if (hypotheses_[h].model == NULL)
495  continue;
496  if (model_numbers > 0)
497  model_string += ",";
498  if (StrongModel(hypotheses_[h].model)) {
499  model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
500  } else if (hypotheses_[h].model == kCrownLeft) {
501  model_string += "CrL";
502  } else if (hypotheses_[h].model == kCrownRight) {
503  model_string += "CrR";
504  }
505  model_numbers++;
506  }
507  if (model_numbers == 0)
508  model_string += "0";
509 
510  dbg->push_back(model_string);
511 }
512 
513 void RowScratchRegisters::Init(const RowInfo &row) {
514  ri_ = &row;
515  lmargin_ = 0;
516  lindent_ = row.pix_ldistance;
517  rmargin_ = 0;
518  rindent_ = row.pix_rdistance;
519 }
520 
521 LineType RowScratchRegisters::GetLineType() const {
522  if (hypotheses_.empty())
523  return LT_UNKNOWN;
524  bool has_start = false;
525  bool has_body = false;
526  for (int i = 0; i < hypotheses_.size(); i++) {
527  switch (hypotheses_[i].ty) {
528  case LT_START: has_start = true; break;
529  case LT_BODY: has_body = true; break;
530  default:
531  tprintf("Encountered bad value in hypothesis list: %c\n",
532  hypotheses_[i].ty);
533  break;
534  }
535  }
536  if (has_start && has_body)
537  return LT_MULTIPLE;
538  return has_start ? LT_START : LT_BODY;
539 }
540 
541 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
542  if (hypotheses_.empty())
543  return LT_UNKNOWN;
544  bool has_start = false;
545  bool has_body = false;
546  for (int i = 0; i < hypotheses_.size(); i++) {
547  if (hypotheses_[i].model != model)
548  continue;
549  switch (hypotheses_[i].ty) {
550  case LT_START: has_start = true; break;
551  case LT_BODY: has_body = true; break;
552  default:
553  tprintf("Encountered bad value in hypothesis list: %c\n",
554  hypotheses_[i].ty);
555  break;
556  }
557  }
558  if (has_start && has_body)
559  return LT_MULTIPLE;
560  return has_start ? LT_START : LT_BODY;
561 }
562 
563 void RowScratchRegisters::SetStartLine() {
564  LineType current_lt = GetLineType();
565  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
566  tprintf("Trying to set a line to be START when it's already BODY.\n");
567  }
568  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
569  hypotheses_.push_back_new(LineHypothesis(LT_START, NULL));
570  }
571 }
572 
573 void RowScratchRegisters::SetBodyLine() {
574  LineType current_lt = GetLineType();
575  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
576  tprintf("Trying to set a line to be BODY when it's already START.\n");
577  }
578  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
579  hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL));
580  }
581 }
582 
583 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
584  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
585  int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL));
586  if (old_idx >= 0)
587  hypotheses_.remove(old_idx);
588 }
589 
590 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
591  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
592  int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL));
593  if (old_idx >= 0)
594  hypotheses_.remove(old_idx);
595 }
596 
597 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
598  for (int h = 0; h < hypotheses_.size(); h++) {
599  if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
600  models->push_back_new(hypotheses_[h].model);
601  }
602 }
603 
604 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
605  for (int h = 0; h < hypotheses_.size(); h++) {
606  if (StrongModel(hypotheses_[h].model))
607  models->push_back_new(hypotheses_[h].model);
608  }
609 }
610 
611 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
612  for (int h = 0; h < hypotheses_.size(); h++) {
613  if (hypotheses_[h].model != NULL)
614  models->push_back_new(hypotheses_[h].model);
615  }
616 }
617 
618 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
619  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
620  return NULL;
621  return hypotheses_[0].model;
622 }
623 
624 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
625  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
626  return NULL;
627  return hypotheses_[0].model;
628 }
629 
630 // Discard any hypotheses whose model is not in the given list.
631 void RowScratchRegisters::DiscardNonMatchingHypotheses(
632  const SetOfModels &models) {
633  if (models.empty())
634  return;
635  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
636  if (!models.contains(hypotheses_[h].model)) {
637  hypotheses_.remove(h);
638  }
639  }
640 }
641 
642 // ============ Geometry based Paragraph Detection Algorithm =================
643 
644 struct Cluster {
645  Cluster() : center(0), count(0) {}
646  Cluster(int cen, int num) : center(cen), count(num) {}
647 
648  int center; // The center of the cluster.
649  int count; // The number of entries within the cluster.
650 };
651 
652 class SimpleClusterer {
653  public:
654  explicit SimpleClusterer(int max_cluster_width)
655  : max_cluster_width_(max_cluster_width) {}
656  void Add(int value) { values_.push_back(value); }
657  int size() const { return values_.size(); }
658  void GetClusters(GenericVector<Cluster> *clusters);
659 
660  private:
661  int max_cluster_width_;
662  GenericVectorEqEq<int> values_;
663 };
664 
665 // Return the index of the cluster closest to value.
666 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
667  int best_index = 0;
668  for (int i = 0; i < clusters.size(); i++) {
669  if (abs(value - clusters[i].center) <
670  abs(value - clusters[best_index].center))
671  best_index = i;
672  }
673  return best_index;
674 }
675 
676 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
677  clusters->clear();
678  values_.sort();
679  for (int i = 0; i < values_.size();) {
680  int orig_i = i;
681  int lo = values_[i];
682  int hi = lo;
683  while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
684  hi = values_[i];
685  }
686  clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
687  }
688 }
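// Hypothetical usage sketch (editor's illustration): clustering a handful of
// left indents with a maximum cluster width of 4 pixels.
//
//   SimpleClusterer lefts(4);
//   int indents[] = {0, 1, 2, 37, 38, 120};  // 120 might be a stray page number
//   for (int i = 0; i < 6; i++) lefts.Add(indents[i]);
//   GenericVector<Cluster> clusters;
//   lefts.GetClusters(&clusters);
//   // clusters now holds (center=1, count=3), (center=37, count=2),
//   // (center=120, count=1).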
689 
690 // Calculate left- and right-indent tab stop values seen in
691 // rows[row_start, row_end) given a tolerance of tolerance.
692 static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
693  int row_start, int row_end,
694  int tolerance,
695  GenericVector<Cluster> *left_tabs,
696  GenericVector<Cluster> *right_tabs) {
697  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
698  return;
699  // First pass: toss all left and right indents into clusterers.
700  SimpleClusterer initial_lefts(tolerance);
701  SimpleClusterer initial_rights(tolerance);
702  GenericVector<Cluster> initial_left_tabs;
703  GenericVector<Cluster> initial_right_tabs;
704  for (int i = row_start; i < row_end; i++) {
705  initial_lefts.Add((*rows)[i].lindent_);
706  initial_rights.Add((*rows)[i].rindent_);
707  }
708  initial_lefts.GetClusters(&initial_left_tabs);
709  initial_rights.GetClusters(&initial_right_tabs);
710 
711  // Second pass: cluster only lines that are not "stray"
712  // An example of a stray line is a page number -- a line whose start
713  // and end tab-stops are far outside the typical start and end tab-stops
714  // for the block.
715  // Put another way, we only cluster data from lines whose start or end
716  // tab stop is frequent.
717  SimpleClusterer lefts(tolerance);
718  SimpleClusterer rights(tolerance);
719 
720  // Outlier elimination. We might want to switch this to test outlier-ness
721  // based on how strange a position an outlier is in instead of or in addition
722  // to how rare it is. These outliers get re-added if we end up having too
723 // few tab stops to work with, however.
724  int infrequent_enough_to_ignore = 0;
725  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
726  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
727 
728  for (int i = row_start; i < row_end; i++) {
729  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
730  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
731  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
732  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
733  lefts.Add((*rows)[i].lindent_);
734  rights.Add((*rows)[i].rindent_);
735  }
736  }
737  lefts.GetClusters(left_tabs);
738  rights.GetClusters(right_tabs);
739 
740  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
741  (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
742  // One side is really ragged, and the other only has one tab stop,
743  // so those "insignificant outliers" are probably important, actually.
744  // This often happens on a page of an index. Add back in the ones
745  // we omitted in the first pass.
746  for (int i = row_start; i < row_end; i++) {
747  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
748  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
749  if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
750  initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
751  lefts.Add((*rows)[i].lindent_);
752  rights.Add((*rows)[i].rindent_);
753  }
754  }
755  }
756  lefts.GetClusters(left_tabs);
757  rights.GetClusters(right_tabs);
758 
759  // If one side is almost a two-indent aligned side, and the other clearly
760  // isn't, try to prune out the least frequent tab stop from that side.
761  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
762  int to_prune = -1;
763  for (int i = left_tabs->size() - 1; i >= 0; i--) {
764  if (to_prune < 0 ||
765  (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
766  to_prune = i;
767  }
768  }
769  if (to_prune >= 0 &&
770  (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
771  left_tabs->remove(to_prune);
772  }
773  }
774  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
775  int to_prune = -1;
776  for (int i = right_tabs->size() - 1; i >= 0; i--) {
777  if (to_prune < 0 ||
778  (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
779  to_prune = i;
780  }
781  }
782  if (to_prune >= 0 &&
783  (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
784  right_tabs->remove(to_prune);
785  }
786  }
787 }
788 
789 // Given a paragraph model, mark rows[row_start, row_end) as said model's
790 // start or body lines.
791 //
792 // Case 1: model->first_indent_ != model->body_indent_
793 // Differentiating the paragraph start lines from the paragraph body lines in
794 // this case is easy, we just see how far each line is indented.
795 //
796 // Case 2: model->first_indent_ == model->body_indent_
797 // Here, we find end-of-paragraph lines by looking for "short lines."
798 // What constitutes a "short line" changes depending on whether the text is
799 // ragged-right[left] or fully justified (aligned left and right).
800 //
801 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
802 // We have a new paragraph if the first word would have fit at the end
803 // of the previous line.
804 //
805 // Case 2b: Fully Justified. (eop_threshold > 0)
806 // We mark a line as short (end of paragraph) if the offside indent
807 // is greater than eop_threshold.
808 static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
809  int row_start, int row_end,
810  const ParagraphModel *model,
811  bool ltr,
812  int eop_threshold) {
813  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
814  return;
815  for (int row = row_start; row < row_end; row++) {
816  bool valid_first = ValidFirstLine(rows, row, model);
817  bool valid_body = ValidBodyLine(rows, row, model);
818  if (valid_first && !valid_body) {
819  (*rows)[row].AddStartLine(model);
820  } else if (valid_body && !valid_first) {
821  (*rows)[row].AddBodyLine(model);
822  } else if (valid_body && valid_first) {
823  bool after_eop = (row == row_start);
824  if (row > row_start) {
825  if (eop_threshold > 0) {
826  if (model->justification() == JUSTIFICATION_LEFT) {
827  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
828  } else {
829  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
830  }
831  } else {
832  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
833  model->justification());
834  }
835  }
836  if (after_eop) {
837  (*rows)[row].AddStartLine(model);
838  } else {
839  (*rows)[row].AddBodyLine(model);
840  }
841  } else {
842  // Do nothing. Stray row.
843  }
844  }
845 }
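// Worked example for Case 2b above (hypothetical numbers, editor's sketch):
// with a left-justified, fully justified model and eop_threshold == 40, a line
// whose predecessor has rindent_ == 55 is marked as a START line (the previous
// line stopped well short of the right tab stop), while a predecessor with
// rindent_ == 10 leaves it a BODY line.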
846 
847 // GeometricClassifierState holds all of the information we'll use while
848 // trying to determine a paragraph model for the text lines in a block of
849 // text:
850 // + the rows under consideration [row_start, row_end)
851 // + the common left- and right-indent tab stops
852 // + does the block start out left-to-right or right-to-left
853 // Further, this struct holds the data we amass for the (single) ParagraphModel
854 // we'll assign to the text lines (assuming we get that far).
855 struct GeometricClassifierState {
856  GeometricClassifierState(int dbg_level,
857  GenericVector<RowScratchRegisters> *r,
858  int r_start, int r_end)
859  : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
860  margin(0) {
861  tolerance = InterwordSpace(*r, r_start, r_end);
862  CalculateTabStops(r, r_start, r_end, tolerance,
863  &left_tabs, &right_tabs);
864  if (debug_level >= 3) {
865  tprintf("Geometry: TabStop cluster tolerance = %d; "
866  "%d left tabs; %d right tabs\n",
867  tolerance, left_tabs.size(), right_tabs.size());
868  }
869  ltr = (*r)[r_start].ri_->ltr;
870  }
871 
872  void AssumeLeftJustification() {
873  just = tesseract::JUSTIFICATION_LEFT;
874  margin = (*rows)[row_start].lmargin_;
875  }
876 
877  void AssumeRightJustification() {
878  just = tesseract::JUSTIFICATION_RIGHT;
879  margin = (*rows)[row_start].rmargin_;
880  }
881 
882  // Align tabs are the tab stops the text is aligned to.
883  const GenericVector<Cluster> &AlignTabs() const {
884  if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
885  return left_tabs;
886  }
887 
888  // Offside tabs are the tab stops opposite the tabs used to align the text.
889  //
890  // Note that for a left-to-right text which is aligned to the right such as
891  // this function comment, the offside tabs are the horizontal tab stops
892  // marking the beginning of ("Note", "this" and "marking").
893  const GenericVector<Cluster> &OffsideTabs() const {
894  if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
895  return right_tabs;
896  }
897 
898  // Return whether the i'th row extends from the leftmost left tab stop
899  // to the rightmost right tab stop.
900  bool IsFullRow(int i) const {
901  return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
902  ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
903  }
904 
905  int AlignsideTabIndex(int row_idx) const {
906  return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
907  }
908 
909  // Given what we know about the paragraph justification (just), would the
910  // first word of row_b have fit at the end of row_a?
911  bool FirstWordWouldHaveFit(int row_a, int row_b) {
912  return ::tesseract::FirstWordWouldHaveFit(
913  (*rows)[row_a], (*rows)[row_b], just);
914  }
915 
916  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
917 
918  void Fail(int min_debug_level, const char *why) const {
919  if (debug_level < min_debug_level) return;
920  tprintf("# %s\n", why);
921  PrintRows();
922  }
923 
924  ParagraphModel Model() const {
925  return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
926  }
927 
928  // We print out messages with a debug level at least as great as debug_level.
929  int debug_level;
930 
931  // The Geometric Classifier was asked to find a single paragraph model
932  // to fit the text rows (*rows)[row_start, row_end)
933  GenericVector<RowScratchRegisters> *rows;
934  int row_start;
935  int row_end;
936 
937  // The amount by which we expect the text edge can vary and still be aligned.
938  int tolerance;
939 
940  // Is the script in this text block left-to-right?
941  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
942  bool ltr;
943 
944  // These left and right tab stops were determined to be the common tab
945  // stops for the given text.
946  GenericVector<Cluster> left_tabs;
947  GenericVector<Cluster> right_tabs;
948 
949  // These are parameters we must determine to create a ParagraphModel.
950  tesseract::ParagraphJustification just;
951  int margin;
952  int first_indent;
953  int body_indent;
954 
955  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
956  int eop_threshold;
957 };
958 
959 // Given a section of text where strong textual clues did not help identifying
960 // paragraph breaks, and for which the left and right indents have exactly
961 // three tab stops between them, attempt to find the paragraph breaks based
962 // solely on the outline of the text and whether the script is left-to-right.
963 //
964 // Algorithm Detail:
965 // The selected rows are in the form of a rectangle except
966 // for some number of "short lines" of the same length:
967 //
968 //   (A1)    xxxxxxxxxxxxx    (B1)  xxxxxxxxxxxx
969 //             xxxxxxxxxxx          xxxxxxxxxx      # A "short" line.
970 //           xxxxxxxxxxxxx          xxxxxxxxxxxx
971 //           xxxxxxxxxxxxx          xxxxxxxxxxxx
972 //
973 // We have a slightly different situation if the only short
974 // line is at the end of the excerpt.
975 //
976 //   (A2)    xxxxxxxxxxxxx    (B2)  xxxxxxxxxxxx
977 //           xxxxxxxxxxxxx          xxxxxxxxxxxx
978 //           xxxxxxxxxxxxx          xxxxxxxxxxxx
979 //             xxxxxxxxxxx          xxxxxxxxxx      # A "short" line.
980 //
981 // We'll interpret these as follows based on the reasoning in the comment for
982 // GeometricClassify():
983 // [script direction: first indent, body indent]
984 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
985 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
986 static void GeometricClassifyThreeTabStopTextBlock(
987  int debug_level,
988  GeometricClassifierState &s,
989  ParagraphTheory *theory) {
990  int num_rows = s.row_end - s.row_start;
991  int num_full_rows = 0;
992  int last_row_full = 0;
993  for (int i = s.row_start; i < s.row_end; i++) {
994  if (s.IsFullRow(i)) {
995  num_full_rows++;
996  if (i == s.row_end - 1) last_row_full++;
997  }
998  }
999 
1000  if (num_full_rows < 0.7 * num_rows) {
1001  s.Fail(1, "Not enough full lines to know which lines start paras.");
1002  return;
1003  }
1004 
1005  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1006  s.eop_threshold = 0;
1007 
1008  if (s.ltr) {
1009  s.AssumeLeftJustification();
1010  } else {
1011  s.AssumeRightJustification();
1012  }
1013 
1014  if (debug_level > 0) {
1015  tprintf("# Not enough variety for clear outline classification. "
1016  "Guessing these are %s aligned based on script.\n",
1017  s.ltr ? "left" : "right");
1018  s.PrintRows();
1019  }
1020 
1021  if (s.AlignTabs().size() == 2) { // case A1 or A2
1022  s.first_indent = s.AlignTabs()[1].center;
1023  s.body_indent = s.AlignTabs()[0].center;
1024  } else { // case B1 or B2
1025  if (num_rows - 1 == num_full_rows - last_row_full) {
1026  // case B2
1027  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1028  (*s.rows)[s.row_start].AddStartLine(model);
1029  for (int i = s.row_start + 1; i < s.row_end; i++) {
1030  (*s.rows)[i].AddBodyLine(model);
1031  }
1032  return;
1033  } else {
1034  // case B1
1035  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1036  s.eop_threshold = (s.OffsideTabs()[0].center +
1037  s.OffsideTabs()[1].center) / 2;
1038  }
1039  }
1040  const ParagraphModel *model = theory->AddModel(s.Model());
1041  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1042  s.ltr, s.eop_threshold);
1043  return;
1044 }
1045 
1046 // This function is called if strong textual clues were not available, but
1047 // the caller hopes that the paragraph breaks will be super obvious just
1048 // by the outline of the text.
1049 //
1050 // The particularly difficult case is figuring out what's going on if you
1051 // don't have enough short paragraph end lines to tell us what's going on.
1052 //
1053 // For instance, let's say you have the following outline:
1054 //
1055 //   (A1)    xxxxxxxxxxxxxxxxxxxxxx
1056 //             xxxxxxxxxxxxxxxxxxxx
1057 //           xxxxxxxxxxxxxxxxxxxxxx
1058 //           xxxxxxxxxxxxxxxxxxxxxx
1059 //
1060 // Even if we know that the text is left-to-right and so will probably be
1061 // left-aligned, both of the following are possible texts:
1062 //
1063 //  (A1a)  1. Here our list item
1064 //            with two full lines.
1065 //         2. Here a second item.
1066 //         3. Here our third one.
1067 //
1068 //  (A1b)  so ends paragraph one.
1069 //         Here starts another
1070 //         paragraph we want to
1071 //         read. This continues
1072 //
1073 // These examples are obvious from the text and should have been caught
1074 // by the StrongEvidenceClassify pass. However, for languages where we don't
1075 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1076 // it's worth guessing that (A1b) is the correct interpretation if there are
1077 // far more "full" lines than "short" lines.
1078 void GeometricClassify(int debug_level,
1079  GenericVector<RowScratchRegisters> *rows,
1080  int row_start, int row_end,
1081  ParagraphTheory *theory) {
1082  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1083  return;
1084  if (debug_level > 1) {
1085  tprintf("###############################################\n");
1086  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1087  row_start, row_end);
1088  tprintf("###############################################\n");
1089  }
1090  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1091 
1092  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1093  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1094  s.Fail(2, "Too much variety for simple outline classification.");
1095  return;
1096  }
1097  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1098  s.Fail(1, "Not enough variety for simple outline classification.");
1099  return;
1100  }
1101  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1102  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1103  return;
1104  }
1105 
1106  // At this point, we know that one side has at least two tab stops, and the
1107  // other side has one or two tab stops.
1108  // Left to determine:
1109  // (1) Which is the body indent and which is the first line indent?
1110  // (2) Is the text fully justified?
1111 
1112  // If one side happens to have three or more tab stops, assume that side
1113  // is opposite of the aligned side.
1114  if (s.right_tabs.size() > 2) {
1115  s.AssumeLeftJustification();
1116  } else if (s.left_tabs.size() > 2) {
1117  s.AssumeRightJustification();
1118  } else if (s.ltr) { // guess based on script direction
1119  s.AssumeLeftJustification();
1120  } else {
1121  s.AssumeRightJustification();
1122  }
1123 
1124  if (s.AlignTabs().size() == 2) {
1125  // For each tab stop on the aligned side, how many of them appear
1126  // to be paragraph start lines? [first lines]
1127  int firsts[2] = {0, 0};
1128  // Count the first line as a likely paragraph start line.
1129  firsts[s.AlignsideTabIndex(s.row_start)]++;
1130  // For each line, if the first word would have fit on the previous
1131  // line count it as a likely paragraph start line.
1132  bool jam_packed = true;
1133  for (int i = s.row_start + 1; i < s.row_end; i++) {
1134  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1135  firsts[s.AlignsideTabIndex(i)]++;
1136  jam_packed = false;
1137  }
1138  }
1139  // Make an extra accounting for the last line of the paragraph just
1140  // in case it's the only short line in the block. That is, take its
1141  // first word as typical and see if this looks like the *last* line
1142  // of a paragraph. If so, mark the *other* indent as probably a first.
1143  if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1144  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1145  }
1146 
1147  int percent0firsts, percent1firsts;
1148  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1149  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1150 
1151  // TODO(eger): Tune these constants if necessary.
1152  if ((percent0firsts < 20 && 30 < percent1firsts) ||
1153  percent0firsts + 30 < percent1firsts) {
1154  s.first_indent = s.AlignTabs()[1].center;
1155  s.body_indent = s.AlignTabs()[0].center;
1156  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1157  percent1firsts + 30 < percent0firsts) {
1158  s.first_indent = s.AlignTabs()[0].center;
1159  s.body_indent = s.AlignTabs()[1].center;
1160  } else {
1161  // Ambiguous! Probably lineated (poetry)
1162  if (debug_level > 1) {
1163  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1164  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1165  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1166  s.AlignTabs()[0].center, percent0firsts);
1167  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1168  s.AlignTabs()[1].center, percent1firsts);
1169  s.PrintRows();
1170  }
1171  return;
1172  }
1173  } else {
1174  // There's only one tab stop for the "aligned to" side.
1175  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1176  }
1177 
1178  // At this point, we have our model.
1179  const ParagraphModel *model = theory->AddModel(s.Model());
1180 
1181  // Now all we have to do is figure out if the text is fully justified or not.
1182  // eop_threshold: default to fully justified unless we see evidence below.
1183  // See description on MarkRowsWithModel()
1184  s.eop_threshold =
1185  (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1186  // If the text is not fully justified, re-set the eop_threshold to 0.
1187  if (s.AlignTabs().size() == 2) {
1188  // Paragraphs with a paragraph-start indent.
1189  for (int i = s.row_start; i < s.row_end - 1; i++) {
1190  if (ValidFirstLine(s.rows, i + 1, model) &&
1191  !NearlyEqual(s.OffsideTabs()[0].center,
1192  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1193  // We found a non-end-of-paragraph short line: not fully justified.
1194  s.eop_threshold = 0;
1195  break;
1196  }
1197  }
1198  } else {
1199  // Paragraphs with no paragraph-start indent.
1200  for (int i = s.row_start; i < s.row_end - 1; i++) {
1201  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1202  !NearlyEqual(s.OffsideTabs()[0].center,
1203  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1204  // We found a non-end-of-paragraph short line: not fully justified.
1205  s.eop_threshold = 0;
1206  break;
1207  }
1208  }
1209  }
1210  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1211 }
1212 
1213 // =============== Implementation of ParagraphTheory =====================
1214 
1215 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
1216  for (int i = 0; i < models_->size(); i++) {
1217  if ((*models_)[i]->Comparable(model))
1218  return (*models_)[i];
1219  }
1220  ParagraphModel *m = new ParagraphModel(model);
1221  models_->push_back(m);
1222  models_we_added_.push_back_new(m);
1223  return m;
1224 }
1225 
1226 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1227  for (int i = models_->size() - 1; i >= 0; i--) {
1228  ParagraphModel *m = (*models_)[i];
1229  if (!used_models.contains(m) && models_we_added_.contains(m)) {
1230  models_->remove(i);
1231  models_we_added_.remove(models_we_added_.get_index(m));
1232  delete m;
1233  }
1234  }
1235 }
1236 
1237 // Examine rows[start, end) and try to determine if an existing non-centered
1238 // paragraph model would fit them perfectly. If so, return a pointer to it.
1239 // If not, return NULL.
1240 const ParagraphModel *ParagraphTheory::Fits(
1241  const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
1242  for (int m = 0; m < models_->size(); m++) {
1243  const ParagraphModel *model = (*models_)[m];
1244  if (model->justification() != JUSTIFICATION_CENTER &&
1245  RowsFitModel(rows, start, end, model))
1246  return model;
1247  }
1248  return NULL;
1249 }
1250 
1251 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1252  for (int m = 0; m < models_->size(); m++) {
1253  const ParagraphModel *model = (*models_)[m];
1254  if (model->justification() != JUSTIFICATION_CENTER)
1255  models->push_back_new(model);
1256  }
1257 }
1258 
1259 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1260  for (int i = 0; i < models_->size(); i++) {
1261  if ((*models_)[i] == model)
1262  return i;
1263  }
1264  return -1;
1265 }
1266 
1267 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
1268  int row, const ParagraphModel *model) {
1269  if (!StrongModel(model)) {
1270  tprintf("ValidFirstLine() should only be called with strong models!\n");
1271  }
1272  return StrongModel(model) &&
1273  model->ValidFirstLine(
1274  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1275  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1276 }
1277 
1278 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
1279  int row, const ParagraphModel *model) {
1280  if (!StrongModel(model)) {
1281  tprintf("ValidBodyLine() should only be called with strong models!\n");
1282  }
1283  return StrongModel(model) &&
1284  model->ValidBodyLine(
1285  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1286  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1287 }
1288 
1289 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
1290  int a, int b, const ParagraphModel *model) {
1291  if (model != kCrownRight && model != kCrownLeft) {
1292  tprintf("CrownCompatible() should only be called with crown models!\n");
1293  return false;
1294  }
1295  RowScratchRegisters &row_a = (*rows)[a];
1296  RowScratchRegisters &row_b = (*rows)[b];
1297  if (model == kCrownRight) {
1298  return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1299  row_b.rindent_ + row_b.rmargin_,
1300  Epsilon(row_a.ri_->average_interword_space));
1301  }
1302  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1303  row_b.lindent_ + row_b.lmargin_,
1304  Epsilon(row_a.ri_->average_interword_space));
1305 }
1306 
1307 
1308 // =============== Implementation of ParagraphModelSmearer ====================
1309 
1310 ParagraphModelSmearer::ParagraphModelSmearer(
1311  GenericVector<RowScratchRegisters> *rows,
1312  int row_start, int row_end, ParagraphTheory *theory)
1313  : theory_(theory), rows_(rows), row_start_(row_start),
1314  row_end_(row_end) {
1315  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1316  row_start_ = 0;
1317  row_end_ = 0;
1318  return;
1319  }
1320  SetOfModels no_models;
1321  for (int row = row_start - 1; row <= row_end; row++) {
1322  open_models_.push_back(no_models);
1323  }
1324 }
1325 
1326 // see paragraphs_internal.h
1327 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1328  SetOfModels no_models;
1329  if (row_start < row_start_) row_start = row_start_;
1330  if (row_end > row_end_) row_end = row_end_;
1331 
1332  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1333  row++) {
1334  if ((*rows_)[row].ri_->num_words == 0) {
1335  OpenModels(row + 1) = no_models;
1336  } else {
1337  SetOfModels &opened = OpenModels(row);
1338  (*rows_)[row].StartHypotheses(&opened);
1339 
1340  // Which models survive the transition from row to row + 1?
1341  SetOfModels still_open;
1342  for (int m = 0; m < opened.size(); m++) {
1343  if (ValidFirstLine(rows_, row, opened[m]) ||
1344  ValidBodyLine(rows_, row, opened[m])) {
1345  // This is basic filtering; we check likely paragraph starty-ness down
1346  // below in Smear() -- you know, whether the first word would have fit
1347  // and such.
1348  still_open.push_back_new(opened[m]);
1349  }
1350  }
1351  OpenModels(row + 1) = still_open;
1352  }
1353  }
1354 }
1355 
1356 // see paragraphs_internal.h
1357 void ParagraphModelSmearer::Smear() {
1358  CalculateOpenModels(row_start_, row_end_);
1359 
1360  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1361  // we have multiple LT_START hypotheses), see if there's a model that
1362  // was recently used (an "open" model) which might model it well.
1363  for (int i = row_start_; i < row_end_; i++) {
1364  RowScratchRegisters &row = (*rows_)[i];
1365  if (row.ri_->num_words == 0)
1366  continue;
1367 
1368  // Step One:
1369  // Figure out if there are "open" models which are left-aligned or
1370  // right-aligned. This is important for determining whether the
1371  // "first" word in a row would fit at the "end" of the previous row.
1372  bool left_align_open = false;
1373  bool right_align_open = false;
1374  for (int m = 0; m < OpenModels(i).size(); m++) {
1375  switch (OpenModels(i)[m]->justification()) {
1376  case JUSTIFICATION_LEFT: left_align_open = true; break;
1377  case JUSTIFICATION_RIGHT: right_align_open = true; break;
1378  default: left_align_open = right_align_open = true;
1379  }
1380  }
1381  // Step Two:
1382  // Use that knowledge to figure out if this row is likely to
1383  // start a paragraph.
1384  bool likely_start;
1385  if (i == 0) {
1386  likely_start = true;
1387  } else {
1388  if ((left_align_open && right_align_open) ||
1389  (!left_align_open && !right_align_open)) {
1390  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1391  JUSTIFICATION_LEFT) ||
1392  LikelyParagraphStart((*rows_)[i - 1], row,
1393  JUSTIFICATION_RIGHT);
1394  } else if (left_align_open) {
1395  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1396  JUSTIFICATION_LEFT);
1397  } else {
1398  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1399  JUSTIFICATION_RIGHT);
1400  }
1401  }
1402 
1403  // Step Three:
1404  // If this text line seems like an obvious first line of an
1405  // open model, or an obvious continuation of an existing
1406  // modelled paragraph, mark it up.
1407  if (likely_start) {
1408  // Add Start Hypotheses for all Open models that fit.
1409  for (int m = 0; m < OpenModels(i).size(); m++) {
1410  if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1411  row.AddStartLine(OpenModels(i)[m]);
1412  }
1413  }
1414  } else {
1415  // Add relevant body line hypotheses.
1416  SetOfModels last_line_models;
1417  if (i > 0) {
1418  (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1419  } else {
1420  theory_->NonCenteredModels(&last_line_models);
1421  }
1422  for (int m = 0; m < last_line_models.size(); m++) {
1423  const ParagraphModel *model = last_line_models[m];
1424  if (ValidBodyLine(rows_, i, model))
1425  row.AddBodyLine(model);
1426  }
1427  }
1428 
1429  // Step Four:
1430  // If we're still quite unsure about this line, go through all
1431  // models in our theory and see if this row could be the start
1432  // of any of our models.
1433  if (row.GetLineType() == LT_UNKNOWN ||
1434  (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1435  SetOfModels all_models;
1436  theory_->NonCenteredModels(&all_models);
1437  for (int m = 0; m < all_models.size(); m++) {
1438  if (ValidFirstLine(rows_, i, all_models[m])) {
1439  row.AddStartLine(all_models[m]);
1440  }
1441  }
1442  }
1443  // Step Five:
1444  // Since we may have updated the hypotheses about this row, we need
1445  // to recalculate the Open models for the rest of rows[i + 1, row_end)
1446  if (row.GetLineType() != LT_UNKNOWN) {
1447  CalculateOpenModels(i + 1, row_end_);
1448  }
1449  }
1450 }
1451 
1452 // ================ Main Paragraph Detection Algorithm =======================
1453 
1454 // Find out what ParagraphModels are actually used, and discard any
1455 // that are not.
1456 static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
1457  ParagraphTheory *theory) {
1458  SetOfModels used_models;
1459  for (int i = 0; i < rows.size(); i++) {
1460  rows[i].StrongHypotheses(&used_models);
1461  }
1462  theory->DiscardUnusedModels(used_models);
1463 }
1464 
1465 // DowngradeWeakestToCrowns:
1466 // Forget any flush-{left, right} models unless we see two or more
1467 // of them in sequence.
1468 //
1469 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1470 // where the first line and body indent are the same) as having proper Models.
1471 // This is generally dangerous, since if you start imagining that flush-left
1472 // is a typical paragraph model when it is not, it will lead you to chop normal
1473 // indented paragraphs in the middle whenever a sentence happens to start on a
1474 // new line (see "This" above). What to do?
1475 // What we do is to take any paragraph which is flush left and is not
1476 // preceded by another paragraph of the same model and convert it to a "Crown"
1477 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1478 // for later. It means that the paragraph is flush, but it would be desirable
1479 // to mark it as the same model as following text if it fits. This downgrade,
1480 // FlushLeft -> CrownLeft -> Model of following paragraph, means that we
1481 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1482 // half-of-a-paragraph, and instead we mark it the same as normal body text.
1483 //
1484 // Implementation:
1485 //
1486 // Comb backwards through the row scratch registers, and turn any sequences
1487 // of body lines of equivalent type that abut either the block start or a
1488 // body or start line of a different type into a crown paragraph.
1489 void DowngradeWeakestToCrowns(int debug_level,
1490  ParagraphTheory *theory,
1491  GenericVector<RowScratchRegisters> *rows) {
1492  int start;
1493  for (int end = rows->size(); end > 0; end = start) {
1494  // Search back for a body line of a unique type.
1495  const ParagraphModel *model = NULL;
1496  while (end > 0 &&
1497  (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
1498  end--;
1499  }
1500  if (end == 0) break;
1501  start = end - 1;
1502  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1503  start--; // walk back to the first line that is not the same body type.
1504  }
1505  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1506  StrongModel(model) &&
1507  NearlyEqual(model->first_indent(), model->body_indent(),
1508  model->tolerance())) {
1509  start--;
1510  }
1511  start++;
1512  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1513  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1514  continue;
1515  if (!StrongModel(model)) {
1516  while (start > 0 &&
1517  CrownCompatible(rows, start - 1, start, model))
1518  start--;
1519  }
1520  if (start == 0 ||
1521  (!StrongModel(model)) ||
1522  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1523  // crownify rows[start, end)
1524  const ParagraphModel *crown_model = model;
1525  if (StrongModel(model)) {
1526  if (model->justification() == JUSTIFICATION_LEFT)
1527  crown_model = kCrownLeft;
1528  else
1529  crown_model = kCrownRight;
1530  }
1531  (*rows)[start].SetUnknown();
1532  (*rows)[start].AddStartLine(crown_model);
1533  for (int row = start + 1; row < end; row++) {
1534  (*rows)[row].SetUnknown();
1535  (*rows)[row].AddBodyLine(crown_model);
1536  }
1537  }
1538  }
1539  DiscardUnusedModels(*rows, theory);
1540 }
1541 
1542 
1543 // Clear all hypotheses about lines [start, end) and reset margins.
1544 //
1545 // The empty space between the left of a row and the block boundary (and
1546 // similarly for the right) is split into two pieces: margin and indent.
1547 // In initial processing, we assume the block is tight and the margin for
1548 // all lines is set to zero. However, if our first pass does not yield
1549 // models for everything, it may be due to an inset paragraph like a
1550 // block-quote. In that case, we make a second pass over that unmarked
1551 // section of the page and reset the "margin" portion of the empty space
1552 // to the common amount of space at the ends of the lines under consid-
1553 // eration. This would be equivalent to percentile set to 0. However,
1554 // sometimes we have a single character sticking out in the right margin
1555 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1556 // really just ignore it as an outlier. To express this, we allow the
1557 // user to specify the percentile (0..100) of indent values to use as
1558 // the common margin for each row in the run of rows[start, end).
1559 void RecomputeMarginsAndClearHypotheses(
1560  GenericVector<RowScratchRegisters> *rows, int start, int end,
1561  int percentile) {
1562  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1563  return;
1564 
1565  int lmin, lmax, rmin, rmax;
1566  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1567  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1568  for (int i = start; i < end; i++) {
1569  RowScratchRegisters &sr = (*rows)[i];
1570  sr.SetUnknown();
1571  if (sr.ri_->num_words == 0)
1572  continue;
1573  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1574  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1575  }
1576  STATS lefts(lmin, lmax + 1);
1577  STATS rights(rmin, rmax + 1);
1578  for (int i = start; i < end; i++) {
1579  RowScratchRegisters &sr = (*rows)[i];
1580  if (sr.ri_->num_words == 0)
1581  continue;
1582  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1583  rights.add(sr.rmargin_ + sr.rindent_, 1);
1584  }
1585  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1586  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1587  for (int i = start; i < end; i++) {
1588  RowScratchRegisters &sr = (*rows)[i];
1589  int ldelta = ignorable_left - sr.lmargin_;
1590  sr.lmargin_ += ldelta;
1591  sr.lindent_ -= ldelta;
1592  int rdelta = ignorable_right - sr.rmargin_;
1593  sr.rmargin_ += rdelta;
1594  sr.rindent_ -= rdelta;
1595  }
1596 }
1597 
1598 // Return the median inter-word space in rows[row_start, row_end).
1599 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
1600  int row_start, int row_end) {
1601  if (row_end < row_start + 1) return 1;
1602  int word_height = (rows[row_start].ri_->lword_box.height() +
1603  rows[row_end - 1].ri_->lword_box.height()) / 2;
1604  int word_width = (rows[row_start].ri_->lword_box.width() +
1605  rows[row_end - 1].ri_->lword_box.width()) / 2;
1606  STATS spacing_widths(0, 5 + word_width);
1607  for (int i = row_start; i < row_end; i++) {
1608  if (rows[i].ri_->num_words > 1) {
1609  spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1610  }
1611  }
1612  int minimum_reasonable_space = word_height / 3;
1613  if (minimum_reasonable_space < 2)
1614  minimum_reasonable_space = 2;
1615  int median = spacing_widths.median();
1616  return (median > minimum_reasonable_space)
1617  ? median : minimum_reasonable_space;
1618 }
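// Worked example (hypothetical numbers, editor's sketch): for rows whose
// average_interword_space values are {8, 9, 30, 9, 10} and whose left words
// are roughly 24 pixels tall, the median spacing is 9 and the floor is
// 24 / 3 = 8, so InterwordSpace() returns 9.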
1619 
1620 // Return whether the first word on the after line can fit in the space at
1621 // the end of the before line (knowing which way the text is aligned and read).
1622 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1623  const RowScratchRegisters &after,
1624  tesseract::ParagraphJustification justification) {
1625  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1626  return true;
1627 
1628  if (justification == JUSTIFICATION_UNKNOWN) {
1629  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1630  }
1631  int available_space;
1632  if (justification == JUSTIFICATION_CENTER) {
1633  available_space = before.lindent_ + before.rindent_;
1634  } else {
1635  available_space = before.OffsideIndent(justification);
1636  }
1637  available_space -= before.ri_->average_interword_space;
1638 
1639  if (before.ri_->ltr)
1640  return after.ri_->lword_box.width() < available_space;
1641  return after.ri_->rword_box.width() < available_space;
1642 }
1643 
1644 // Return whether the first word on the after line can fit in the space at
1645 // the end of the before line (not knowing which way the text goes) in a left
1646 // or right alignment.
1647 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1648  const RowScratchRegisters &after) {
1649  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1650  return true;
1651 
1652  int available_space = before.lindent_;
1653  if (before.rindent_ > available_space)
1654  available_space = before.rindent_;
1655  available_space -= before.ri_->average_interword_space;
1656 
1657  if (before.ri_->ltr)
1658  return after.ri_->lword_box.width() < available_space;
1659  return after.ri_->rword_box.width() < available_space;
1660 }
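Both overloads reduce to the same test: the next line's first word must be narrower than the slack at the end of this line, i.e. the off-side indent minus one average inter-word space. With illustrative numbers (not taken from any real page): a left-justified line with a 120 px right indent and a 15 px inter-word space leaves 105 px of slack, so a 90 px first word on the following line "would have fit" there, while a 110 px word would not.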
1661 
1662 bool TextSupportsBreak(const RowScratchRegisters &before,
1663  const RowScratchRegisters &after) {
1664  if (before.ri_->ltr) {
1665  return before.ri_->rword_likely_ends_idea &&
1666  after.ri_->lword_likely_starts_idea;
1667  } else {
1668  return before.ri_->lword_likely_ends_idea &&
1669  after.ri_->rword_likely_starts_idea;
1670  }
1671 }
1672 
1673 bool LikelyParagraphStart(const RowScratchRegisters &before,
1674  const RowScratchRegisters &after) {
1675  return before.ri_->num_words == 0 ||
1676  (FirstWordWouldHaveFit(before, after) &&
1677  TextSupportsBreak(before, after));
1678 }
1679 
1680 bool LikelyParagraphStart(const RowScratchRegisters &before,
1681  const RowScratchRegisters &after,
1682  tesseract::ParagraphJustification j) {
1683  return before.ri_->num_words == 0 ||
1684  (FirstWordWouldHaveFit(before, after, j) &&
1685  TextSupportsBreak(before, after));
1686 }
1687 
1688 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1689 // would fit them as a single paragraph.
1690 // If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
1691 // If the rows given could be a consistent start to a paragraph, set *consistent
1692 // true.
1693 ParagraphModel InternalParagraphModelByOutline(
1694  const GenericVector<RowScratchRegisters> *rows,
1695  int start, int end, int tolerance, bool *consistent) {
1696  int ltr_line_count = 0;
1697  for (int i = start; i < end; i++) {
1698  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1699  }
1700  bool ltr = (ltr_line_count >= (end - start) / 2);
1701 
1702  *consistent = true;
1703  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1704  return ParagraphModel();
1705 
1706  // Ensure the caller only passed us a region with a common rmargin and
1707  // lmargin.
1708  int lmargin = (*rows)[start].lmargin_;
1709  int rmargin = (*rows)[start].rmargin_;
1710  int lmin, lmax, rmin, rmax, cmin, cmax;
1711  lmin = lmax = (*rows)[start + 1].lindent_;
1712  rmin = rmax = (*rows)[start + 1].rindent_;
1713  cmin = cmax = 0;
1714  for (int i = start + 1; i < end; i++) {
1715  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1716  tprintf("Margins don't match! Software error.\n");
1717  *consistent = false;
1718  return ParagraphModel();
1719  }
1720  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1721  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1722  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1723  }
1724  int ldiff = lmax - lmin;
1725  int rdiff = rmax - rmin;
1726  int cdiff = cmax - cmin;
1727  if (rdiff > tolerance && ldiff > tolerance) {
1728  if (cdiff < tolerance * 2) {
1729  if (end - start < 3)
1730  return ParagraphModel();
1731  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1732  }
1733  *consistent = false;
1734  return ParagraphModel();
1735  }
1736  if (end - start < 3) // Don't return a model for two line paras.
1737  return ParagraphModel();
1738 
1739  // These booleans keep us from saying something is aligned left when the body
1740  // left variance is too large.
1741  bool body_admits_left_alignment = ldiff < tolerance;
1742  bool body_admits_right_alignment = rdiff < tolerance;
1743 
1744  ParagraphModel left_model =
1745  ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1746  (lmin + lmax) / 2, tolerance);
1747  ParagraphModel right_model =
1748  ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1749  (rmin + rmax) / 2, tolerance);
1750 
1751  // These booleans keep us from having an indent on the "wrong side" for the
1752  // first line.
1753  bool text_admits_left_alignment = ltr || left_model.is_flush();
1754  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1755 
1756  // At least one of the edges is less than tolerance in variance.
1757  // If the other is obviously ragged, it can't be the one aligned to.
1758  // [Note the last line is included in this raggedness.]
1759  if (tolerance < rdiff) {
1760  if (body_admits_left_alignment && text_admits_left_alignment)
1761  return left_model;
1762  *consistent = false;
1763  return ParagraphModel();
1764  }
1765  if (tolerance < ldiff) {
1766  if (body_admits_right_alignment && text_admits_right_alignment)
1767  return right_model;
1768  *consistent = false;
1769  return ParagraphModel();
1770  }
1771 
1772  // At this point, we know the body text doesn't vary much on either side.
1773 
1774  // If the first line juts out oddly in one direction or the other,
1775  // that likely indicates the side aligned to.
1776  int first_left = (*rows)[start].lindent_;
1777  int first_right = (*rows)[start].rindent_;
1778 
1779  if (ltr && body_admits_left_alignment &&
1780  (first_left < lmin || first_left > lmax))
1781  return left_model;
1782  if (!ltr && body_admits_right_alignment &&
1783  (first_right < rmin || first_right > rmax))
1784  return right_model;
1785 
1786  *consistent = false;
1787  return ParagraphModel();
1788 }
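The body of the test above reduces to comparing ldiff (spread of left indents), rdiff (spread of right indents), and cdiff (spread of their difference) against the tolerance. A condensed decision sketch with a hypothetical Guess enum, leaving out the first-line and LTR refinements handled in the full function:

    enum class Guess { kUnknown, kLeft, kRight, kCenter };

    // Body-line outline test: which edge is tight enough to be the aligned one?
    static Guess GuessJustification(int ldiff, int rdiff, int cdiff, int tolerance) {
      if (ldiff > tolerance && rdiff > tolerance)        // both edges ragged
        return cdiff < tolerance * 2 ? Guess::kCenter    //   but symmetric: centered
                                     : Guess::kUnknown;  //   otherwise inconsistent
      if (rdiff > tolerance) return Guess::kLeft;        // ragged right: left-aligned
      if (ldiff > tolerance) return Guess::kRight;       // ragged left: right-aligned
      return Guess::kUnknown;  // both edges tight: defer to the first-line cues
    }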
1789 
1790 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1791 // would fit them as a single paragraph. If nothing fits,
1792 // justification_ = JUSTIFICATION_UNKNOWN, and we print the paragraph to debug
1793 // output if we're debugging.
1794 ParagraphModel ParagraphModelByOutline(
1795  int debug_level,
1796  const GenericVector<RowScratchRegisters> *rows,
1797  int start, int end, int tolerance) {
1798  bool unused_consistent;
1799  ParagraphModel retval = InternalParagraphModelByOutline(
1800  rows, start, end, tolerance, &unused_consistent);
1801  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1802  tprintf("Could not determine a model for this paragraph:\n");
1803  PrintRowRange(*rows, start, end);
1804  }
1805  return retval;
1806 }
1807 
1808 // Do rows[start, end) form a single instance of the given paragraph model?
1809 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
1810  int start, int end, const ParagraphModel *model) {
1811  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1812  return false;
1813  if (!ValidFirstLine(rows, start, model)) return false;
1814  for (int i = start + 1 ; i < end; i++) {
1815  if (!ValidBodyLine(rows, i, model)) return false;
1816  }
1817  return true;
1818 }
1819 
1820 // Examine rows[row_start, row_end) as an independent section of text,
1821 // and mark rows that are exceptionally clear as start-of-paragraph
1822 // and paragraph-body lines.
1823 //
1824 // We presume that any lines surrounding rows[row_start, row_end) may
1825 // have wildly different paragraph models, so we don't key any data off
1826 // of those lines.
1827 //
1828 // We only take the very strongest signals, as we don't want to get
1829 // confused and mark up centered text, poetry, or source code as
1830 // clearly part of a typical paragraph.
1831 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
1832  int row_start, int row_end) {
1833  // Record patently obvious body text.
1834  for (int i = row_start + 1; i < row_end; i++) {
1835  const RowScratchRegisters &prev = (*rows)[i - 1];
1836  RowScratchRegisters &curr = (*rows)[i];
1837  tesseract::ParagraphJustification typical_justification =
1838  prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1839  if (!curr.ri_->rword_likely_starts_idea &&
1840  !curr.ri_->lword_likely_starts_idea &&
1841  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1842  curr.SetBodyLine();
1843  }
1844  }
1845 
1846  // Record patently obvious start paragraph lines.
1847  //
1848  // It's an extremely good signal of the start of a paragraph that
1849  // the first word would have fit on the end of the previous line.
1850  // However, applying just that signal would have us mark random
1851  // start lines of lineated text (poetry and source code) and some
1852  // centered headings as paragraph start lines. Therefore, we use
1853  // a second qualification for a paragraph start: Not only should
1854  // the first word of this line have fit on the previous line,
1855  // but also, this line should go full to the right of the block,
1856  // disallowing a subsequent word from having fit on this line.
1857 
1858  // First row:
1859  {
1860  RowScratchRegisters &curr = (*rows)[row_start];
1861  RowScratchRegisters &next = (*rows)[row_start + 1];
1862  tesseract::ParagraphJustification j =
1863  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1864  if (curr.GetLineType() == LT_UNKNOWN &&
1865  !FirstWordWouldHaveFit(curr, next, j) &&
1866  (curr.ri_->lword_likely_starts_idea ||
1867  curr.ri_->rword_likely_starts_idea)) {
1868  curr.SetStartLine();
1869  }
1870  }
1871  // Middle rows
1872  for (int i = row_start + 1; i < row_end - 1; i++) {
1873  RowScratchRegisters &prev = (*rows)[i - 1];
1874  RowScratchRegisters &curr = (*rows)[i];
1875  RowScratchRegisters &next = (*rows)[i + 1];
1876  tesseract::ParagraphJustification j =
1877  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1878  if (curr.GetLineType() == LT_UNKNOWN &&
1879  !FirstWordWouldHaveFit(curr, next, j) &&
1880  LikelyParagraphStart(prev, curr, j)) {
1881  curr.SetStartLine();
1882  }
1883  }
1884  // Last row
1885  { // the short circuit at the top means we have at least two lines.
1886  RowScratchRegisters &prev = (*rows)[row_end - 2];
1887  RowScratchRegisters &curr = (*rows)[row_end - 1];
1888  tesseract::ParagraphJustification j =
1889  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1890  if (curr.GetLineType() == LT_UNKNOWN &&
1891  !FirstWordWouldHaveFit(curr, curr, j) &&
1892  LikelyParagraphStart(prev, curr, j)) {
1893  curr.SetStartLine();
1894  }
1895  }
1896 }
1897 
1898 // Look for sequences of a start line followed by some body lines in
1899 // rows[row_start, row_end) and create ParagraphModels for them if
1900 // they seem coherent.
1901 void ModelStrongEvidence(int debug_level,
1902  GenericVector<RowScratchRegisters> *rows,
1903  int row_start, int row_end,
1904  bool allow_flush_models,
1905  ParagraphTheory *theory) {
1906  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1907  return;
1908 
1909  int start = row_start;
1910  while (start < row_end) {
1911  while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1912  start++;
1913  if (start >= row_end - 1)
1914  break;
1915 
1916  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1917  int end = start;
1918  ParagraphModel last_model;
1919  bool next_consistent;
1920  do {
1921  ++end;
1922  // rows[row, end) was consistent.
1923  // If rows[row, end + 1) is not consistent,
1924  // just model rows[row, end)
1925  if (end < row_end - 1) {
1926  RowScratchRegisters &next = (*rows)[end];
1927  LineType lt = next.GetLineType();
1928  next_consistent = lt == LT_BODY ||
1929  (lt == LT_UNKNOWN &&
1930  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1931  } else {
1932  next_consistent = false;
1933  }
1934  if (next_consistent) {
1935  ParagraphModel next_model = InternalParagraphModelByOutline(
1936  rows, start, end + 1, tolerance, &next_consistent);
1937  if (((*rows)[start].ri_->ltr &&
1938  last_model.justification() == JUSTIFICATION_LEFT &&
1939  next_model.justification() != JUSTIFICATION_LEFT) ||
1940  (!(*rows)[start].ri_->ltr &&
1941  last_model.justification() == JUSTIFICATION_RIGHT &&
1942  next_model.justification() != JUSTIFICATION_RIGHT)) {
1943  next_consistent = false;
1944  }
1945  last_model = next_model;
1946  } else {
1947  next_consistent = false;
1948  }
1949  } while (next_consistent && end < row_end);
1950  // At this point, rows[start, end) looked like it could have been a
1951  // single paragraph. If we can make a good ParagraphModel for it,
1952  // do so and mark this sequence with that model.
1953  if (end > start + 1) {
1954  // emit a new paragraph if we have more than one line.
1955  const ParagraphModel *model = NULL;
1956  ParagraphModel new_model = ParagraphModelByOutline(
1957  debug_level, rows, start, end,
1958  Epsilon(InterwordSpace(*rows, start, end)));
1959  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1960  // couldn't create a good model, oh well.
1961  } else if (new_model.is_flush()) {
1962  if (end == start + 2) {
1963  // It's very likely we just got two paragraph starts in a row.
1964  end = start + 1;
1965  } else if (start == row_start) {
1966  // Mark this as a Crown.
1967  if (new_model.justification() == JUSTIFICATION_LEFT) {
1968  model = kCrownLeft;
1969  } else {
1970  model = kCrownRight;
1971  }
1972  } else if (allow_flush_models) {
1973  model = theory->AddModel(new_model);
1974  }
1975  } else {
1976  model = theory->AddModel(new_model);
1977  }
1978  if (model) {
1979  (*rows)[start].AddStartLine(model);
1980  for (int i = start + 1; i < end; i++) {
1981  (*rows)[i].AddBodyLine(model);
1982  }
1983  }
1984  }
1985  start = end;
1986  }
1987 }
1988 
1989 // We examine rows[row_start, row_end) and do the following:
1990 // (1) Clear all existing hypotheses for the rows being considered.
1991 // (2) Mark up any rows as exceptionally likely to be paragraph starts
1992 // or paragraph body lines as such using both geometric and textual
1993 // clues.
1994 // (3) Form models for any sequence of start + continuation lines.
1995 // (4) Smear the paragraph models to cover surrounding text.
1996 void StrongEvidenceClassify(int debug_level,
1997  GenericVector<RowScratchRegisters> *rows,
1998  int row_start, int row_end,
1999  ParagraphTheory *theory) {
2000  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2001  return;
2002 
2003  if (debug_level > 1) {
2004  tprintf("#############################################\n");
2005  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2006  tprintf("#############################################\n");
2007  }
2008 
2009  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2010  MarkStrongEvidence(rows, row_start, row_end);
2011 
2012  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2013 
2014  // Create paragraph models.
2015  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2016 
2017  DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
2018 
2019  // At this point, some rows are marked up as paragraphs with model numbers,
2020  // and some rows are marked up as either LT_START or LT_BODY. Now let's
2021  // smear any good paragraph hypotheses forward and backward.
2022  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2023  smearer.Smear();
2024 }
2025 
2026 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
2027  int row_start, int row_end,
2028  ParagraphTheory *theory) {
2029  for (int i = row_start + 1; i < row_end - 1; i++) {
2030  if ((*rows)[i - 1].ri_->has_leaders &&
2031  (*rows)[i].ri_->has_leaders &&
2032  (*rows)[i + 1].ri_->has_leaders) {
2033  const ParagraphModel *model = theory->AddModel(
2034  ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2035  (*rows)[i].AddStartLine(model);
2036  }
2037  }
2038 }
2039 
2040 // Collect sequences of unique hypotheses in row registers and create proper
2041 // paragraphs for them, referencing the paragraphs in row_owners.
2042 void ConvertHypothesizedModelRunsToParagraphs(
2043  int debug_level,
2044  const GenericVector<RowScratchRegisters> &rows,
2045  GenericVector<PARA *> *row_owners,
2046  ParagraphTheory *theory) {
2047  int end = rows.size();
2048  int start;
2049  for (; end > 0; end = start) {
2050  start = end - 1;
2051  const ParagraphModel *model = NULL;
2052  // TODO(eger): Be smarter about dealing with multiple hypotheses.
2053  bool single_line_paragraph = false;
2054  SetOfModels models;
2055  rows[start].NonNullHypotheses(&models);
2056  if (!models.empty()) {
2057  model = models[0];
2058  if (rows[start].GetLineType(model) != LT_BODY)
2059  single_line_paragraph = true;
2060  }
2061  if (model && !single_line_paragraph) {
2062  // walk back looking for more body lines and then a start line.
2063  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2064  // do nothing
2065  }
2066  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2067  model = NULL;
2068  }
2069  }
2070  if (model == NULL) {
2071  continue;
2072  }
2073  // rows[start, end) should be a paragraph.
2074  PARA *p = new PARA();
2075  if (model == kCrownLeft || model == kCrownRight) {
2076  p->is_very_first_or_continuation = true;
2077  // Crown paragraph.
2078  // If we can find an existing ParagraphModel that fits, use it,
2079  // else create a new one.
2080  for (int row = end; row < rows.size(); row++) {
2081  if ((*row_owners)[row] &&
2082  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2083  (start == 0 ||
2084  ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2085  model = (*row_owners)[row]->model;
2086  break;
2087  }
2088  }
2089  if (model == kCrownLeft) {
2090  // No subsequent model fits, so cons one up.
2091  model = theory->AddModel(ParagraphModel(
2092  JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2093  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2094  } else if (model == kCrownRight) {
2095  // No subsequent model fits, so cons one up.
2096  model = theory->AddModel(ParagraphModel(
2097  JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
2098  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2099  }
2100  }
2101  rows[start].SetUnknown();
2102  rows[start].AddStartLine(model);
2103  for (int i = start + 1; i < end; i++) {
2104  rows[i].SetUnknown();
2105  rows[i].AddBodyLine(model);
2106  }
2107  p->model = model;
2108  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2109  p->is_list_item =
2110  model->justification() == JUSTIFICATION_RIGHT
2111  ? rows[start].ri_->rword_indicates_list_item
2112  : rows[start].ri_->lword_indicates_list_item;
2113  for (int row = start; row < end; row++) {
2114  if ((*row_owners)[row] != NULL) {
2115  tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() called "
2116  "more than once!\n");
2117  delete (*row_owners)[row];
2118  }
2119  (*row_owners)[row] = p;
2120  }
2121  }
2122 }
2123 
2124 struct Interval {
2125  Interval() : begin(0), end(0) {}
2126  Interval(int b, int e) : begin(b), end(e) {}
2127 
2128  int begin;
2129  int end;
2130 };
2131 
2132 // Return whether rows[row] appears to be stranded, meaning that the evidence
2133 // for this row is very weak due to context. For instance, two lines of source
2134 // code may happen to be indented at the same tab vector as body text starts,
2135 // leading us to think they are two start-of-paragraph lines. This is not
2136 // optimal. However, we also don't want to mark a sequence of short dialog
2137 // as "weak," so our heuristic is:
2138 // (1) If a line is surrounded by lines of unknown type, it's weak.
2139 // (2) If two lines in a row are start lines for a given paragraph type, but
2140 // after that the same paragraph type does not continue, they're weak.
2141 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
2142  SetOfModels row_models;
2143  rows[row].StrongHypotheses(&row_models);
2144 
2145  for (int m = 0; m < row_models.size(); m++) {
2146  bool all_starts = rows[row].GetLineType();
2147  int run_length = 1;
2148  bool continues = true;
2149  for (int i = row - 1; i >= 0 && continues; i--) {
2150  SetOfModels models;
2151  rows[i].NonNullHypotheses(&models);
2152  switch (rows[i].GetLineType(row_models[m])) {
2153  case LT_START: run_length++; break;
2154  case LT_MULTIPLE: // explicit fall-through
2155  case LT_BODY: run_length++; all_starts = false; break;
2156  case LT_UNKNOWN: // explicit fall-through
2157  default: continues = false;
2158  }
2159  }
2160  continues = true;
2161  for (int i = row + 1; i < rows.size() && continues; i++) {
2162  SetOfModels models;
2163  rows[i].NonNullHypotheses(&models);
2164  switch (rows[i].GetLineType(row_models[m])) {
2165  case LT_START: run_length++; break;
2166  case LT_MULTIPLE: // explicit fall-through
2167  case LT_BODY: run_length++; all_starts = false; break;
2168  case LT_UNKNOWN: // explicit fall-through
2169  default: continues = false;
2170  }
2171  }
2172  if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2173  }
2174  return true;
2175 }
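For example, two adjacent lines both hypothesized as LT_START for the same model, with no line of that model continuing after them, give run_length == 2 while all_starts stays true, so neither clause of the test fires and RowIsStranded() returns true; a start line followed by two body lines of the same model gives run_length == 3 and is kept as strong evidence.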
2176 
2177 // Go through rows[row_start, row_end) and gather up sequences that need better
2178 // classification.
2179 // + Sequences of non-empty rows without hypotheses.
2180 // + Crown paragraphs not immediately followed by a strongly modeled line.
2181 // + Single line paragraphs surrounded by text that doesn't match the
2182 // model.
2183 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
2184  GenericVector<Interval> *to_fix,
2185  int row_start, int row_end) {
2186  to_fix->clear();
2187  for (int i = row_start; i < row_end; i++) {
2188  bool needs_fixing = false;
2189 
2190  SetOfModels models;
2191  SetOfModels models_w_crowns;
2192  rows[i].StrongHypotheses(&models);
2193  rows[i].NonNullHypotheses(&models_w_crowns);
2194  if (models.empty() && !models_w_crowns.empty()) {
2195  // Crown paragraph. Is it followed by a modeled line?
2196  for (int end = i + 1; end < rows.size(); end++) {
2197  SetOfModels end_models;
2198  SetOfModels strong_end_models;
2199  rows[end].NonNullHypotheses(&end_models);
2200  rows[end].StrongHypotheses(&strong_end_models);
2201  if (end_models.empty()) {
2202  needs_fixing = true;
2203  break;
2204  } else if (!strong_end_models.empty()) {
2205  needs_fixing = false;
2206  break;
2207  }
2208  }
2209  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2210  // No models at all.
2211  needs_fixing = true;
2212  }
2213 
2214  if (!needs_fixing && !models.empty()) {
2215  needs_fixing = RowIsStranded(rows, i);
2216  }
2217 
2218  if (needs_fixing) {
2219  if (!to_fix->empty() && to_fix->back().end == i - 1)
2220  to_fix->back().end = i;
2221  else
2222  to_fix->push_back(Interval(i, i));
2223  }
2224  }
2225  // Convert inclusive intervals to half-open intervals.
2226  for (int i = 0; i < to_fix->size(); i++) {
2227  (*to_fix)[i].end = (*to_fix)[i].end + 1;
2228  }
2229 }
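LeftoverSegments() grows to_fix by extending the last interval whenever the next flagged row is adjacent, then widens each inclusive interval into the half-open [begin, end) form its callers expect. A minimal sketch of that accumulation (not from the file), assuming plain ints and std::vector in place of the Interval/GenericVector types:

    #include <vector>

    struct Span { int begin, end; };  // half-open [begin, end)

    // Sketch: merge consecutive flagged row indices into half-open spans, the
    // same way LeftoverSegments() extends to_fix->back().end for adjacent rows.
    static std::vector<Span> MergeFlagged(const std::vector<int> &flagged_rows) {
      std::vector<Span> spans;
      for (int i : flagged_rows) {
        if (!spans.empty() && spans.back().end == i)
          spans.back().end = i + 1;      // extend the current run
        else
          spans.push_back({i, i + 1});   // start a new run
      }
      return spans;
    }
    // E.g. flagged rows {3, 4, 5, 9} become [3, 6) and [9, 10).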
2230 
2231 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
2232 // normalize each row_owner to point to an actual PARA, and output the
2233 // paragraphs in order onto paragraphs.
2234 void CanonicalizeDetectionResults(
2235  GenericVector<PARA *> *row_owners,
2236  PARA_LIST *paragraphs) {
2237  GenericVector<PARA *> &rows = *row_owners;
2238  paragraphs->clear();
2239  PARA_IT out(paragraphs);
2240  PARA *formerly_null = NULL;
2241  for (int i = 0; i < rows.size(); i++) {
2242  if (rows[i] == NULL) {
2243  if (i == 0 || rows[i - 1] != formerly_null) {
2244  rows[i] = formerly_null = new PARA();
2245  } else {
2246  rows[i] = formerly_null;
2247  continue;
2248  }
2249  } else if (i > 0 && rows[i - 1] == rows[i]) {
2250  continue;
2251  }
2252  out.add_after_then_move(rows[i]);
2253  }
2254 }
2255 
2256 // Main entry point for Paragraph Detection Algorithm.
2257 //
2258 // Given a set of equally spaced textlines (described by row_infos),
2259 // Split them into paragraphs.
2260 //
2261 // Output:
2262 // row_owners - one pointer for each row, to the paragraph it belongs to.
2263 // paragraphs - this is the actual list of PARA objects.
2264 // models - the list of paragraph models referenced by the PARA objects.
2265 // caller is responsible for deleting the models.
2266 void DetectParagraphs(int debug_level,
2267  GenericVector<RowInfo> *row_infos,
2268  GenericVector<PARA *> *row_owners,
2269  PARA_LIST *paragraphs,
2270  GenericVector<ParagraphModel *> *models) {
2271  GenericVector<RowScratchRegisters> rows;
2272  ParagraphTheory theory(models);
2273 
2274  // Initialize row_owners to be a bunch of NULL pointers.
2275  row_owners->init_to_size(row_infos->size(), NULL);
2276 
2277  // Set up row scratch registers for the main algorithm.
2278  rows.init_to_size(row_infos->size(), RowScratchRegisters());
2279  for (int i = 0; i < row_infos->size(); i++) {
2280  rows[i].Init((*row_infos)[i]);
2281  }
2282 
2283  // Pass 1:
2284  // Detect sequences of lines that all contain leader dots (.....)
2285  // These are likely Tables of Contents. If there are three text lines in
2286  // a row with leader dots, it's pretty safe to say the middle one should
2287  // be a paragraph of its own.
2288  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2289 
2290  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2291 
2292  GenericVector<Interval> leftovers;
2293  LeftoverSegments(rows, &leftovers, 0, rows.size());
2294  for (int i = 0; i < leftovers.size(); i++) {
2295  // Pass 2a:
2296  // Find any strongly evidenced start-of-paragraph lines. If they're
2297  // followed by two lines that look like body lines, make a paragraph
2298  // model for that and see if that model applies throughout the text
2299  // (that is, "smear" it).
2300  StrongEvidenceClassify(debug_level, &rows,
2301  leftovers[i].begin, leftovers[i].end, &theory);
2302 
2303  // Pass 2b:
2304  // If we had any luck in pass 2a, we got part of the page and didn't
2305  // know how to classify a few runs of rows. Take the segments that
2306  // didn't find a model and reprocess them individually.
2307  GenericVector<Interval> leftovers2;
2308  LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2309  bool pass2a_was_useful = leftovers2.size() > 1 ||
2310  (leftovers2.size() == 1 &&
2311  (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2312  if (pass2a_was_useful) {
2313  for (int j = 0; j < leftovers2.size(); j++) {
2314  StrongEvidenceClassify(debug_level, &rows,
2315  leftovers2[j].begin, leftovers2[j].end,
2316  &theory);
2317  }
2318  }
2319  }
2320 
2321  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2322 
2323  // Pass 3:
2324  // These are the dregs for which we didn't have enough strong textual
2325  // and geometric clues to form matching models for. Let's see if
2326  // the geometric clues are simple enough that we could just use those.
2327  LeftoverSegments(rows, &leftovers, 0, rows.size());
2328  for (int i = 0; i < leftovers.size(); i++) {
2329  GeometricClassify(debug_level, &rows,
2330  leftovers[i].begin, leftovers[i].end, &theory);
2331  }
2332 
2333  // Undo any flush models for which there's little evidence.
2334  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2335 
2336  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2337 
2338  // Pass 4:
2339  // Take everything that's still not marked up well and clear all markings.
2340  LeftoverSegments(rows, &leftovers, 0, rows.size());
2341  for (int i = 0; i < leftovers.size(); i++) {
2342  for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2343  rows[j].SetUnknown();
2344  }
2345  }
2346 
2347  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2348 
2349  // Convert all of the unique hypothesis runs to PARAs.
2350  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2351  &theory);
2352 
2353  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2354 
2355  // Finally, clean up any dangling NULL row paragraph parents.
2356  CanonicalizeDetectionResults(row_owners, paragraphs);
2357 }
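A call-pattern sketch for the entry point above, reflecting the ownership stated in its comment (row_owners and paragraphs are filled in for the caller, and the caller must delete the models; variable names, the setup step, and the use of GenericVector's delete_data_pointers helper are illustrative assumptions):

    //   GenericVector<RowInfo> row_infos = ...;   // one RowInfo per text line
    //   GenericVector<PARA *> row_owners;
    //   PARA_LIST paragraphs;
    //   GenericVector<ParagraphModel *> models;
    //   DetectParagraphs(0 /* debug_level */, &row_infos, &row_owners,
    //                    &paragraphs, &models);
    //   ... read row_owners / paragraphs ...
    //   models.delete_data_pointers();  // caller is responsible for the models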
2358 
2359 // ============ Code interfacing with the rest of Tesseract ==================
2360 
2361 void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
2362  RowInfo *info) {
2363  // Set up text, lword_text, and rword_text (mostly for debug printing).
2364  STRING fake_text;
2365  PageIterator pit(static_cast<const PageIterator&>(it));
2366  bool first_word = true;
2367  if (!pit.Empty(RIL_WORD)) {
2368  do {
2369  fake_text += "x";
2370  if (first_word) info->lword_text += "x";
2371  info->rword_text += "x";
2372  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2373  !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2374  fake_text += " ";
2375  info->rword_text = "";
2376  first_word = false;
2377  }
2378  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2379  pit.Next(RIL_SYMBOL));
2380  }
2381  if (fake_text.size() == 0) return;
2382 
2383  int lspaces = info->pix_ldistance / info->average_interword_space;
2384  for (int i = 0; i < lspaces; i++) {
2385  info->text += ' ';
2386  }
2387  info->text += fake_text;
2388 
2389  // Set up lword_box, rword_box, and num_words.
2390  PAGE_RES_IT page_res_it = *it.PageResIt();
2391  WERD_RES *word_res = page_res_it.restart_row();
2392  ROW_RES *this_row = page_res_it.row();
2393 
2394  WERD_RES *lword = NULL;
2395  WERD_RES *rword = NULL;
2396  info->num_words = 0;
2397  do {
2398  if (word_res) {
2399  if (!lword) lword = word_res;
2400  if (rword != word_res) info->num_words++;
2401  rword = word_res;
2402  }
2403  word_res = page_res_it.forward();
2404  } while (page_res_it.row() == this_row);
2405 
2406  if (lword) info->lword_box = lword->word->bounding_box();
2407  if (rword) info->rword_box = rword->word->bounding_box();
2408 }
2409 
2410 
2411 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2412 // detector RowInfo with all relevant information from the row.
2413 void InitializeRowInfo(bool after_recognition,
2414  const MutableIterator &it,
2415  RowInfo *info) {
2416  if (it.PageResIt()->row() != NULL) {
2417  ROW *row = it.PageResIt()->row()->row;
2418  info->pix_ldistance = row->lmargin();
2419  info->pix_rdistance = row->rmargin();
2420  info->average_interword_space =
2421  row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
2422  info->pix_xheight = row->x_height();
2423  info->has_leaders = false;
2424  info->has_drop_cap = row->has_drop_cap();
2425  info->ltr = true; // set below depending on word scripts
2426  } else {
2427  info->pix_ldistance = info->pix_rdistance = 0;
2428  info->average_interword_space = 1;
2429  info->pix_xheight = 1.0;
2430  info->has_leaders = false;
2431  info->has_drop_cap = false;
2432  info->ltr = true;
2433  }
2434 
2435  info->num_words = 0;
2436  info->lword_indicates_list_item = false;
2437  info->lword_likely_starts_idea = false;
2438  info->lword_likely_ends_idea = false;
2439  info->rword_indicates_list_item = false;
2440  info->rword_likely_starts_idea = false;
2441  info->rword_likely_ends_idea = false;
2442  info->has_leaders = false;
2443  info->ltr = 1;
2444 
2445  if (!after_recognition) {
2446  InitializeTextAndBoxesPreRecognition(it, info);
2447  return;
2448  }
2449  info->text = "";
2450  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2451  int trailing_ws_idx = strlen(text.get()); // strip trailing space
2452  while (trailing_ws_idx > 0 &&
2453  // isspace() only takes ASCII
2454  ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2455  isspace(text[trailing_ws_idx - 1]))
2456  trailing_ws_idx--;
2457  if (trailing_ws_idx > 0) {
2458  int lspaces = info->pix_ldistance / info->average_interword_space;
2459  for (int i = 0; i < lspaces; i++)
2460  info->text += ' ';
2461  for (int i = 0; i < trailing_ws_idx; i++)
2462  info->text += text[i];
2463  }
2464 
2465  if (info->text.size() == 0) {
2466  return;
2467  }
2468 
2469  PAGE_RES_IT page_res_it = *it.PageResIt();
2470  GenericVector<WERD_RES *> werds;
2471  WERD_RES *word_res = page_res_it.restart_row();
2472  ROW_RES *this_row = page_res_it.row();
2473  int num_leaders = 0;
2474  int ltr = 0;
2475  int rtl = 0;
2476  do {
2477  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2478  werds.push_back(word_res);
2479  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2480  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2481  if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2482  }
2483  word_res = page_res_it.forward();
2484  } while (page_res_it.row() == this_row);
2485  info->ltr = ltr >= rtl;
2486  info->has_leaders = num_leaders > 3;
2487  info->num_words = werds.size();
2488  if (!werds.empty()) {
2489  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2490  info->lword_text = lword->best_choice->unichar_string().string();
2491  info->rword_text = rword->best_choice->unichar_string().string();
2492  info->lword_box = lword->word->bounding_box();
2493  info->rword_box = rword->word->bounding_box();
2494  LeftWordAttributes(lword->uch_set, lword->best_choice,
2495  info->lword_text,
2496  &info->lword_indicates_list_item,
2497  &info->lword_likely_starts_idea,
2498  &info->lword_likely_ends_idea);
2499  RightWordAttributes(rword->uch_set, rword->best_choice,
2500  info->rword_text,
2501  &info->rword_indicates_list_item,
2502  &info->rword_likely_starts_idea,
2503  &info->rword_likely_ends_idea);
2504  }
2505 }
2506 
2507 // This is called after rows have been identified and words are recognized.
2508 // Much of this could be implemented before word recognition, but text helps
2509 // to identify bulleted lists and gives good signals for sentence boundaries.
2510 void DetectParagraphs(int debug_level,
2511  bool after_text_recognition,
2512  const MutableIterator *block_start,
2513  GenericVector<ParagraphModel *> *models) {
2514  // Clear out any preconceived notions.
2515  if (block_start->Empty(RIL_TEXTLINE)) {
2516  return;
2517  }
2518  BLOCK *block = block_start->PageResIt()->block()->block;
2519  block->para_list()->clear();
2520  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
2521 
2522  // Convert the Tesseract structures to RowInfos
2523  // for the paragraph detection algorithm.
2524  MutableIterator row(*block_start);
2525  if (row.Empty(RIL_TEXTLINE))
2526  return; // end of input already.
2527 
2528  GenericVector<RowInfo> row_infos;
2529  do {
2530  if (!row.PageResIt()->row())
2531  continue; // empty row.
2532  row.PageResIt()->row()->row->set_para(NULL);
2533  row_infos.push_back(RowInfo());
2534  RowInfo &ri = row_infos.back();
2535  InitializeRowInfo(after_text_recognition, row, &ri);
2536  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2537  row.Next(RIL_TEXTLINE));
2538 
2539  // If we're called before text recognition, we might not have
2540  // tight block bounding boxes, so trim by the minimum on each side.
2541  if (!row_infos.empty()) {
2542  int min_lmargin = row_infos[0].pix_ldistance;
2543  int min_rmargin = row_infos[0].pix_rdistance;
2544  for (int i = 1; i < row_infos.size(); i++) {
2545  if (row_infos[i].pix_ldistance < min_lmargin)
2546  min_lmargin = row_infos[i].pix_ldistance;
2547  if (row_infos[i].pix_rdistance < min_rmargin)
2548  min_rmargin = row_infos[i].pix_rdistance;
2549  }
2550  if (min_lmargin > 0 || min_rmargin > 0) {
2551  for (int i = 0; i < row_infos.size(); i++) {
2552  row_infos[i].pix_ldistance -= min_lmargin;
2553  row_infos[i].pix_rdistance -= min_rmargin;
2554  }
2555  }
2556  }
2557 
2558  // Run the paragraph detection algorithm.
2559  GenericVector<PARA *> row_owners;
2560  GenericVector<PARA *> the_paragraphs;
2561  if (!is_image_block) {
2562  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2563  models);
2564  } else {
2565  row_owners.init_to_size(row_infos.size(), NULL);
2566  CanonicalizeDetectionResults(&row_owners, block->para_list());
2567  }
2568 
2569  // Now stitch in the row_owners into the rows.
2570  row = *block_start;
2571  for (int i = 0; i < row_owners.size(); i++) {
2572  while (!row.PageResIt()->row())
2573  row.Next(RIL_TEXTLINE);
2574  row.PageResIt()->row()->row->set_para(row_owners[i]);
2575  row.Next(RIL_TEXTLINE);
2576  }
2577 }
2578 
2579 } // namespace tesseract