20 #define __func__ __FUNCTION__ 53 static int Epsilon(
int space_pix) {
54 return space_pix * 4 / 5;
57 static bool AcceptableRowArgs(
58 int debug_level,
int min_num_rows,
const char *function_name,
60 int row_start,
int row_end) {
61 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
62 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
63 row_start, row_end, rows->
size());
66 if (row_end - row_start < min_num_rows) {
67 if (debug_level > 1) {
68 tprintf(
"# Too few rows[%d, %d) for %s.\n",
69 row_start, row_end, function_name);
79 static STRING StrOf(
int num) {
81 snprintf(buffer,
sizeof(buffer),
"%d", num);
90 for (
int r = 0; r < rows.
size(); r++) {
91 int num_columns = rows[r].
size();
92 for (
int c = 0; c < num_columns; c++) {
94 for (
int i = 0; i < rows[r][c].
size(); i++) {
95 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
97 if (c >= max_col_widths.
size()) {
100 if (num_unicodes > max_col_widths[c])
101 max_col_widths[c] = num_unicodes;
107 for (
int c = 0; c < max_col_widths.
size(); c++) {
109 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
112 for (
int r = 0; r < rows.
size(); r++) {
113 for (
int c = 0; c < rows[r].
size(); c++) {
116 tprintf(col_width_patterns[c].
string(), rows[r][c].
string());
133 output.
back().push_back(
"#row");
134 output.
back().push_back(
"space");
135 output.
back().push_back(
"..");
136 output.
back().push_back(
"lword[widthSEL]");
137 output.
back().push_back(
"rword[widthSEL]");
139 output.
back().push_back(
"text");
141 for (
int i = 0; i < rows.
size(); i++) {
144 const RowInfo& ri = *rows[i].ri_;
160 rows[i].AppendDebugInfo(theory, &row);
163 PrintTable(output,
" ");
165 tprintf(
"Active Paragraph Models:\n");
166 for (
int m = 0; m < theory.
models().
size(); m++) {
167 tprintf(
" %d: %s\n", m + 1, theory.
models()[m]->ToString().string());
171 static void DebugDump(
179 PrintDetectorState(theory, rows);
184 int row_start,
int row_end) {
185 tprintf(
"======================================\n");
186 for (
int row = row_start; row < row_end; row++) {
187 tprintf(
"%s\n", rows[row].ri_->text.string());
189 tprintf(
"======================================\n");
195 return (ch >=
'a' && ch <=
'z') || (ch >=
'A' && ch <=
'Z');
199 return ch ==
'o' || ch ==
'O' || ch ==
'l' || ch ==
'I';
203 return strchr(
"'\"({[", ch) != NULL;
207 return strchr(
":'\".?!]})", ch) != NULL;
211 const char *
SkipChars(
const char *str,
const char *toskip) {
212 while (*str !=
'\0' && strchr(toskip, *str)) { str++; }
216 const char *
SkipChars(
const char *str,
bool (*skip)(
int)) {
217 while (*str !=
'\0' && skip(*str)) { str++; }
221 const char *
SkipOne(
const char *str,
const char *toskip) {
222 if (*str !=
'\0' && strchr(toskip, *str))
return str + 1;
230 const char *kRomans =
"ivxlmdIVXLMD";
231 const char *kDigits =
"012345789";
232 const char *kOpen =
"[{(";
233 const char *kSep =
":;-.,";
234 const char *kClose =
"]})";
236 int num_segments = 0;
237 const char *pos = word.
string();
238 while (*pos !=
'\0' && num_segments < 3) {
241 const char *numeral_end =
SkipChars(numeral_start, kRomans);
242 if (numeral_end != numeral_start) {
245 numeral_end =
SkipChars(numeral_start, kDigits);
246 if (numeral_end == numeral_start) {
249 if (numeral_end - numeral_start != 1)
257 if (pos == numeral_end)
264 const char *kListMarks =
"0Oo*.,+.";
265 return word.
size() == 1 && strchr(kListMarks, word[0]) != NULL;
276 if (!u || !werd || pos > werd->
length())
286 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
304 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
315 const char *kRomans =
"ivxlmdIVXLMD";
316 while (pos < wordlen_) {
318 if (ch >= 0xF0 || strchr(kRomans, ch) == 0)
break;
325 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
363 int num_segments = 0;
365 while (pos < werd->length() && num_segments < 3) {
366 int numeral_start = m.
SkipPunc(pos);
367 if (numeral_start > pos + 1)
break;
368 int numeral_end = m.
SkipRomans(numeral_start);
369 if (numeral_end == numeral_start) {
371 if (numeral_end == numeral_start) {
373 numeral_end = m.
SkipAlpha(numeral_start);
374 if (numeral_end - numeral_start != 1)
382 if (pos == numeral_end)
385 return pos == werd->
length();
397 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
399 *starts_idea =
false;
401 if (utf8.
size() == 0 || (werd != NULL && werd->
length() == 0)) {
406 if (unicharset && werd) {
424 int start_letter = utf8[0];
431 if (start_letter >=
'A' && start_letter <=
'Z') {
444 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
446 *starts_idea =
false;
448 if (utf8.
size() == 0 || (werd != NULL && werd->
length() == 0)) {
453 if (unicharset && werd) {
467 int last_letter = utf8[utf8.
size() - 1];
478 header->
push_back(
"[lmarg,lind;rind,rmarg]");
485 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
486 lmargin_, lindent_, rindent_, rmargin_);
489 model_string +=
static_cast<char>(GetLineType());
492 int model_numbers = 0;
493 for (
int h = 0; h < hypotheses_.size(); h++) {
494 if (hypotheses_[h].model == NULL)
496 if (model_numbers > 0)
499 model_string += StrOf(1 + theory.
IndexOf(hypotheses_[h].model));
500 }
else if (hypotheses_[h].model ==
kCrownLeft) {
501 model_string +=
"CrL";
503 model_string +=
"CrR";
507 if (model_numbers == 0)
522 if (hypotheses_.empty())
524 bool has_start =
false;
525 bool has_body =
false;
526 for (
int i = 0; i < hypotheses_.size(); i++) {
527 switch (hypotheses_[i].ty) {
528 case LT_START: has_start =
true;
break;
529 case LT_BODY: has_body =
true;
break;
531 tprintf(
"Encountered bad value in hypothesis list: %c\n",
536 if (has_start && has_body)
542 if (hypotheses_.empty())
544 bool has_start =
false;
545 bool has_body =
false;
546 for (
int i = 0; i < hypotheses_.size(); i++) {
547 if (hypotheses_[i].model != model)
549 switch (hypotheses_[i].ty) {
550 case LT_START: has_start =
true;
break;
551 case LT_BODY: has_body =
true;
break;
553 tprintf(
"Encountered bad value in hypothesis list: %c\n",
558 if (has_start && has_body)
564 LineType current_lt = GetLineType();
566 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
574 LineType current_lt = GetLineType();
576 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
587 hypotheses_.remove(old_idx);
594 hypotheses_.remove(old_idx);
598 for (
int h = 0; h < hypotheses_.size(); h++) {
605 for (
int h = 0; h < hypotheses_.size(); h++) {
612 for (
int h = 0; h < hypotheses_.size(); h++) {
613 if (hypotheses_[h].model != NULL)
619 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START)
621 return hypotheses_[0].model;
625 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY)
627 return hypotheses_[0].model;
635 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
636 if (!models.
contains(hypotheses_[h].model)) {
637 hypotheses_.remove(h);
655 : max_cluster_width_(max_cluster_width) {}
656 void Add(
int value) { values_.push_back(value); }
657 int size()
const {
return values_.size(); }
661 int max_cluster_width_;
668 for (
int i = 0; i < clusters.
size(); i++) {
669 if (abs(value - clusters[i].center) <
670 abs(value - clusters[best_index].center))
679 for (
int i = 0; i < values_.size();) {
683 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
693 int row_start,
int row_end,
697 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
704 for (
int i = row_start; i < row_end; i++) {
705 initial_lefts.
Add((*rows)[i].lindent_);
706 initial_rights.
Add((*rows)[i].rindent_);
724 int infrequent_enough_to_ignore = 0;
725 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
726 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
728 for (
int i = row_start; i < row_end; i++) {
729 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
730 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
731 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
732 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
733 lefts.
Add((*rows)[i].lindent_);
734 rights.
Add((*rows)[i].rindent_);
740 if ((left_tabs->
size() == 1 && right_tabs->
size() >= 4) ||
741 (right_tabs->
size() == 1 && left_tabs->
size() >= 4)) {
746 for (
int i = row_start; i < row_end; i++) {
747 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
748 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
749 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
750 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
751 lefts.
Add((*rows)[i].lindent_);
752 rights.
Add((*rows)[i].rindent_);
761 if (left_tabs->
size() == 3 && right_tabs->
size() >= 4) {
763 for (
int i = left_tabs->
size() - 1; i >= 0; i--) {
765 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
770 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
771 left_tabs->
remove(to_prune);
774 if (right_tabs->
size() == 3 && left_tabs->
size() >= 4) {
776 for (
int i = right_tabs->
size() - 1; i >= 0; i--) {
778 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
783 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
784 right_tabs->
remove(to_prune);
809 int row_start,
int row_end,
813 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
815 for (
int row = row_start; row < row_end; row++) {
818 if (valid_first && !valid_body) {
819 (*rows)[row].AddStartLine(model);
820 }
else if (valid_body && !valid_first) {
821 (*rows)[row].AddBodyLine(model);
822 }
else if (valid_body && valid_first) {
823 bool after_eop = (row == row_start);
824 if (row > row_start) {
825 if (eop_threshold > 0) {
827 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
829 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
837 (*rows)[row].AddStartLine(model);
839 (*rows)[row].AddBodyLine(model);
858 int r_start,
int r_end)
859 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
863 &left_tabs, &right_tabs);
864 if (debug_level >= 3) {
865 tprintf(
"Geometry: TabStop cluster tolerance = %d; " 866 "%d left tabs; %d right tabs\n",
867 tolerance, left_tabs.size(), right_tabs.size());
869 ltr = (*r)[r_start].ri_->ltr;
874 margin = (*rows)[row_start].lmargin_;
879 margin = (*rows)[row_start].rmargin_;
906 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
913 (*rows)[row_a], (*rows)[row_b], just);
916 void PrintRows()
const { PrintRowRange(*rows, row_start, row_end); }
918 void Fail(
int min_debug_level,
const char *why)
const {
919 if (debug_level < min_debug_level)
return;
925 return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
991 int num_full_rows = 0;
992 int last_row_full = 0;
996 if (i == s.
row_end - 1) last_row_full++;
1000 if (num_full_rows < 0.7 * num_rows) {
1001 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1014 if (debug_level > 0) {
1015 tprintf(
"# Not enough variety for clear outline classification. " 1016 "Guessing these are %s aligned based on script.\n",
1017 s.
ltr ?
"left" :
"right");
1025 if (num_rows - 1 == num_full_rows - last_row_full) {
1030 (*s.
rows)[i].AddBodyLine(model);
1080 int row_start,
int row_end,
1082 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1084 if (debug_level > 1) {
1085 tprintf(
"###############################################\n");
1086 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1087 row_start, row_end);
1088 tprintf(
"###############################################\n");
1094 s.
Fail(2,
"Too much variety for simple outline classification.");
1098 s.
Fail(1,
"Not enough variety for simple outline classification.");
1127 int firsts[2] = {0, 0};
1132 bool jam_packed =
true;
1147 int percent0firsts, percent1firsts;
1148 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1149 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1152 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1153 percent0firsts + 30 < percent1firsts) {
1156 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1157 percent1firsts + 30 < percent0firsts) {
1162 if (debug_level > 1) {
1163 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1165 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1166 s.
AlignTabs()[0].center, percent0firsts);
1167 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1168 s.
AlignTabs()[1].center, percent1firsts);
1216 for (
int i = 0; i < models_->size(); i++) {
1217 if ((*models_)[i]->Comparable(model))
1218 return (*models_)[i];
1221 models_->push_back(m);
1222 models_we_added_.push_back_new(m);
1227 for (
int i = models_->size() - 1; i >= 0; i--) {
1229 if (!used_models.
contains(m) && models_we_added_.contains(m)) {
1231 models_we_added_.remove(models_we_added_.get_index(m));
1242 for (
int m = 0; m < models_->size(); m++) {
1252 for (
int m = 0; m < models_->size(); m++) {
1260 for (
int i = 0; i < models_->size(); i++) {
1261 if ((*models_)[i] == model)
1270 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1274 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1275 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1281 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1285 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1286 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1292 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1313 : theory_(theory), rows_(rows), row_start_(row_start),
1315 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1321 for (
int row = row_start - 1; row <= row_end; row++) {
1322 open_models_.push_back(no_models);
1327 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1329 if (row_start < row_start_) row_start = row_start_;
1330 if (row_end > row_end_) row_end = row_end_;
1332 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1334 if ((*rows_)[row].ri_->num_words == 0) {
1335 OpenModels(row + 1) = no_models;
1338 (*rows_)[row].StartHypotheses(&opened);
1342 for (
int m = 0; m < opened.
size(); m++) {
1351 OpenModels(row + 1) = still_open;
1358 CalculateOpenModels(row_start_, row_end_);
1363 for (
int i = row_start_; i < row_end_; i++) {
1372 bool left_align_open =
false;
1373 bool right_align_open =
false;
1374 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1375 switch (OpenModels(i)[m]->justification()) {
1378 default: left_align_open = right_align_open =
true;
1386 likely_start =
true;
1388 if ((left_align_open && right_align_open) ||
1389 (!left_align_open && !right_align_open)) {
1394 }
else if (left_align_open) {
1409 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1418 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1422 for (
int m = 0; m < last_line_models.
size(); m++) {
1437 for (
int m = 0; m < all_models.
size(); m++) {
1447 CalculateOpenModels(i + 1, row_end_);
1459 for (
int i = 0; i < rows.
size(); i++) {
1460 rows[i].StrongHypotheses(&used_models);
1493 for (
int end = rows->
size(); end > 0; end = start) {
1497 (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
1500 if (end == 0)
break;
1502 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1505 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1531 (*rows)[start].SetUnknown();
1532 (*rows)[start].AddStartLine(crown_model);
1533 for (
int row = start + 1; row < end; row++) {
1534 (*rows)[row].SetUnknown();
1535 (*rows)[row].AddBodyLine(crown_model);
1562 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1565 int lmin, lmax, rmin, rmax;
1566 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1567 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1568 for (
int i = start; i < end; i++) {
1576 STATS lefts(lmin, lmax + 1);
1577 STATS rights(rmin, rmax + 1);
1578 for (
int i = start; i < end; i++) {
1585 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1586 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1587 for (
int i = start; i < end; i++) {
1589 int ldelta = ignorable_left - sr.
lmargin_;
1592 int rdelta = ignorable_right - sr.
rmargin_;
1600 int row_start,
int row_end) {
1601 if (row_end < row_start + 1)
return 1;
1602 int word_height = (rows[row_start].ri_->lword_box.height() +
1603 rows[row_end - 1].ri_->lword_box.height()) / 2;
1604 int word_width = (rows[row_start].ri_->lword_box.width() +
1605 rows[row_end - 1].ri_->lword_box.width()) / 2;
1606 STATS spacing_widths(0, 5 + word_width);
1607 for (
int i = row_start; i < row_end; i++) {
1608 if (rows[i].ri_->num_words > 1) {
1609 spacing_widths.
add(rows[i].ri_->average_interword_space, 1);
1612 int minimum_reasonable_space = word_height / 3;
1613 if (minimum_reasonable_space < 2)
1614 minimum_reasonable_space = 2;
1615 int median = spacing_widths.
median();
1616 return (median > minimum_reasonable_space)
1617 ? median : minimum_reasonable_space;
1629 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1631 int available_space;
1652 int available_space = before.
lindent_;
1653 if (before.
rindent_ > available_space)
1695 int start,
int end,
int tolerance,
bool *consistent) {
1696 int ltr_line_count = 0;
1697 for (
int i = start; i < end; i++) {
1698 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1700 bool ltr = (ltr_line_count >= (end - start) / 2);
1703 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1708 int lmargin = (*rows)[start].lmargin_;
1709 int rmargin = (*rows)[start].rmargin_;
1710 int lmin, lmax, rmin, rmax, cmin, cmax;
1711 lmin = lmax = (*rows)[start + 1].lindent_;
1712 rmin = rmax = (*rows)[start + 1].rindent_;
1714 for (
int i = start + 1; i < end; i++) {
1715 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1716 tprintf(
"Margins don't match! Software error.\n");
1717 *consistent =
false;
1722 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1724 int ldiff = lmax - lmin;
1725 int rdiff = rmax - rmin;
1726 int cdiff = cmax - cmin;
1727 if (rdiff > tolerance && ldiff > tolerance) {
1728 if (cdiff < tolerance * 2) {
1729 if (end - start < 3)
1733 *consistent =
false;
1736 if (end - start < 3)
1741 bool body_admits_left_alignment = ldiff < tolerance;
1742 bool body_admits_right_alignment = rdiff < tolerance;
1746 (lmin + lmax) / 2, tolerance);
1749 (rmin + rmax) / 2, tolerance);
1753 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1754 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1759 if (tolerance < rdiff) {
1760 if (body_admits_left_alignment && text_admits_left_alignment)
1762 *consistent =
false;
1765 if (tolerance < ldiff) {
1766 if (body_admits_right_alignment && text_admits_right_alignment)
1768 *consistent =
false;
1776 int first_left = (*rows)[start].lindent_;
1777 int first_right = (*rows)[start].rindent_;
1779 if (ltr && body_admits_left_alignment &&
1780 (first_left < lmin || first_left > lmax))
1782 if (!ltr && body_admits_right_alignment &&
1783 (first_right < rmin || first_right > rmax))
1786 *consistent =
false;
1797 int start,
int end,
int tolerance) {
1798 bool unused_consistent;
1800 rows, start, end, tolerance, &unused_consistent);
1802 tprintf(
"Could not determine a model for this paragraph:\n");
1803 PrintRowRange(*rows, start, end);
1811 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1814 for (
int i = start + 1 ; i < end; i++) {
1832 int row_start,
int row_end) {
1834 for (
int i = row_start + 1; i < row_end; i++) {
1872 for (
int i = row_start + 1; i < row_end - 1; i++) {
1903 int row_start,
int row_end,
1904 bool allow_flush_models,
1906 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1909 int start = row_start;
1910 while (start < row_end) {
1911 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1913 if (start >= row_end - 1)
1916 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1919 bool next_consistent;
1925 if (end < row_end - 1) {
1928 next_consistent = lt ==
LT_BODY ||
1932 next_consistent =
false;
1934 if (next_consistent) {
1936 rows, start, end + 1, tolerance, &next_consistent);
1937 if (((*rows)[start].ri_->ltr &&
1940 (!(*rows)[start].ri_->ltr &&
1943 next_consistent =
false;
1945 last_model = next_model;
1947 next_consistent =
false;
1949 }
while (next_consistent && end < row_end);
1953 if (end > start + 1) {
1957 debug_level, rows, start, end,
1962 if (end == start + 2) {
1965 }
else if (start == row_start) {
1972 }
else if (allow_flush_models) {
1973 model = theory->
AddModel(new_model);
1976 model = theory->
AddModel(new_model);
1979 (*rows)[start].AddStartLine(model);
1980 for (
int i = start + 1; i < end; i++) {
1981 (*rows)[i].AddBodyLine(model);
1998 int row_start,
int row_end,
2000 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2003 if (debug_level > 1) {
2004 tprintf(
"#############################################\n");
2005 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2006 tprintf(
"#############################################\n");
2012 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2017 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2027 int row_start,
int row_end,
2029 for (
int i = row_start + 1; i < row_end - 1; i++) {
2030 if ((*rows)[i - 1].ri_->has_leaders &&
2031 (*rows)[i].ri_->has_leaders &&
2032 (*rows)[i + 1].ri_->has_leaders) {
2035 (*rows)[i].AddStartLine(model);
2047 int end = rows.
size();
2049 for (; end > 0; end = start) {
2053 bool single_line_paragraph =
false;
2055 rows[start].NonNullHypotheses(&models);
2056 if (!models.
empty()) {
2058 if (rows[start].GetLineType(model) !=
LT_BODY)
2059 single_line_paragraph =
true;
2061 if (model && !single_line_paragraph) {
2063 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2066 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2070 if (model == NULL) {
2080 for (
int row = end; row < rows.
size(); row++) {
2081 if ((*row_owners)[row] &&
2085 model = (*row_owners)[row]->model;
2093 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2098 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2101 rows[start].SetUnknown();
2102 rows[start].AddStartLine(model);
2103 for (
int i = start + 1; i < end; i++) {
2104 rows[i].SetUnknown();
2105 rows[i].AddBodyLine(model);
2111 ? rows[start].ri_->rword_indicates_list_item
2112 : rows[start].ri_->lword_indicates_list_item;
2113 for (
int row = start; row < end; row++) {
2114 if ((*row_owners)[row] != NULL) {
2115 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " 2116 "more than once!\n");
2117 delete (*row_owners)[row];
2119 (*row_owners)[row] = p;
2143 rows[row].StrongHypotheses(&row_models);
2145 for (
int m = 0; m < row_models.
size(); m++) {
2146 bool all_starts = rows[row].GetLineType();
2148 bool continues =
true;
2149 for (
int i = row - 1; i >= 0 && continues; i--) {
2151 rows[i].NonNullHypotheses(&models);
2152 switch (rows[i].GetLineType(row_models[m])) {
2153 case LT_START: run_length++;
break;
2155 case LT_BODY: run_length++; all_starts =
false;
break;
2157 default: continues =
false;
2161 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2163 rows[i].NonNullHypotheses(&models);
2164 switch (rows[i].GetLineType(row_models[m])) {
2165 case LT_START: run_length++;
break;
2167 case LT_BODY: run_length++; all_starts =
false;
break;
2169 default: continues =
false;
2172 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2185 int row_start,
int row_end) {
2187 for (
int i = row_start; i < row_end; i++) {
2188 bool needs_fixing =
false;
2192 rows[i].StrongHypotheses(&models);
2193 rows[i].NonNullHypotheses(&models_w_crowns);
2194 if (models.
empty() && !models_w_crowns.
empty()) {
2196 for (
int end = i + 1; end < rows.
size(); end++) {
2199 rows[end].NonNullHypotheses(&end_models);
2200 rows[end].StrongHypotheses(&strong_end_models);
2201 if (end_models.
empty()) {
2202 needs_fixing =
true;
2204 }
else if (!strong_end_models.
empty()) {
2205 needs_fixing =
false;
2209 }
else if (models.
empty() && rows[i].ri_->num_words > 0) {
2211 needs_fixing =
true;
2214 if (!needs_fixing && !models.
empty()) {
2219 if (!to_fix->
empty() && to_fix->
back().end == i - 1)
2220 to_fix->
back().end = i;
2226 for (
int i = 0; i < to_fix->
size(); i++) {
2227 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2236 PARA_LIST *paragraphs) {
2238 paragraphs->
clear();
2239 PARA_IT out(paragraphs);
2240 PARA *formerly_null = NULL;
2241 for (
int i = 0; i < rows.
size(); i++) {
2242 if (rows[i] == NULL) {
2243 if (i == 0 || rows[i - 1] != formerly_null) {
2244 rows[i] = formerly_null =
new PARA();
2246 rows[i] = formerly_null;
2249 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2252 out.add_after_then_move(rows[i]);
2269 PARA_LIST *paragraphs,
2279 for (
int i = 0; i < row_infos->
size(); i++) {
2280 rows[i].Init((*row_infos)[i]);
2290 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2294 for (
int i = 0; i < leftovers.
size(); i++) {
2301 leftovers[i].begin, leftovers[i].end, &theory);
2309 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2310 (leftovers2.
size() == 1 &&
2311 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2312 if (pass2a_was_useful) {
2313 for (
int j = 0; j < leftovers2.
size(); j++) {
2315 leftovers2[j].begin, leftovers2[j].end,
2321 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2328 for (
int i = 0; i < leftovers.
size(); i++) {
2330 leftovers[i].begin, leftovers[i].end, &theory);
2336 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2341 for (
int i = 0; i < leftovers.
size(); i++) {
2342 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2343 rows[j].SetUnknown();
2347 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2353 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2365 PageIterator pit(static_cast<const PageIterator&>(it));
2366 bool first_word =
true;
2381 if (fake_text.
size() == 0)
return;
2384 for (
int i = 0; i < lspaces; i++) {
2387 info->
text += fake_text;
2399 if (!lword) lword = word_res;
2400 if (rword != word_res) info->
num_words++;
2403 word_res = page_res_it.
forward();
2404 }
while (page_res_it.
row() == this_row);
2445 if (!after_recognition) {
2451 int trailing_ws_idx = strlen(text.get());
2452 while (trailing_ws_idx > 0 &&
2454 ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2455 isspace(text[trailing_ws_idx - 1]))
2457 if (trailing_ws_idx > 0) {
2459 for (
int i = 0; i < lspaces; i++)
2461 for (
int i = 0; i < trailing_ws_idx; i++)
2462 info->
text += text[i];
2473 int num_leaders = 0;
2483 word_res = page_res_it.
forward();
2484 }
while (page_res_it.
row() == this_row);
2485 info->
ltr = ltr >= rtl;
2488 if (!werds.
empty()) {
2489 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2493 info->
rword_box = rword->word->bounding_box();
2511 bool after_text_recognition,
2541 if (!row_infos.
empty()) {
2542 int min_lmargin = row_infos[0].pix_ldistance;
2543 int min_rmargin = row_infos[0].pix_rdistance;
2544 for (
int i = 1; i < row_infos.
size(); i++) {
2545 if (row_infos[i].pix_ldistance < min_lmargin)
2546 min_lmargin = row_infos[i].pix_ldistance;
2547 if (row_infos[i].pix_rdistance < min_rmargin)
2548 min_rmargin = row_infos[i].pix_rdistance;
2550 if (min_lmargin > 0 || min_rmargin > 0) {
2551 for (
int i = 0; i < row_infos.
size(); i++) {
2552 row_infos[i].pix_ldistance -= min_lmargin;
2553 row_infos[i].pix_rdistance -= min_rmargin;
2561 if (!is_image_block) {
2571 for (
int i = 0; i < row_owners.
size(); i++) {
2572 while (!row.PageResIt()->row())
2574 row.PageResIt()->row()->row->set_para(row_owners[i]);
void GetClusters(GenericVector< Cluster > *clusters)
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
const PAGE_RES_IT * PageResIt() const
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
bool RowIsStranded(const GenericVector< RowScratchRegisters > &rows, int row)
int ClosestCluster(const GenericVector< Cluster > &clusters, int value)
bool AsciiLikelyListItem(const STRING &word)
tesseract::ParagraphJustification justification() const
void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info)
void NonNullHypotheses(SetOfModels *models) const
bool Empty(PageIteratorLevel level) const
Cluster(int cen, int num)
bool StrongModel(const ParagraphModel *model)
bool rword_likely_ends_idea
void AssumeRightJustification()
virtual bool Next(PageIteratorLevel level)
UNICHAR_ID unichar_id(int index) const
int average_interword_space
const ParagraphModel * kCrownRight
void init_to_size(int size, T t)
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool IsOpeningPunct(int ch)
void StrongEvidenceClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
void Fail(int min_debug_level, const char *why) const
WERD_CHOICE * best_choice
virtual char * GetUTF8Text(PageIteratorLevel level) const
void LeftoverSegments(const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
bool IsFullRow(int i) const
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
const GenericVector< Cluster > & OffsideTabs() const
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
bool AnyRtlCharsInWord() const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
bool lword_likely_ends_idea
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info)
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
SimpleClusterer(int max_cluster_width)
int AlignsideTabIndex(int row_idx) const
void CalculateTabStops(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
GenericVector< ParagraphModel * > & models()
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
const GenericVector< Cluster > & AlignTabs() const
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
GenericVector< RowScratchRegisters > * rows
const char * string() const
bool IsTerminalPunct(int ch)
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
LineType GetLineType() const
bool NearlyEqual(T x, T y, T tolerance)
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
void MarkStrongEvidence(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
void AddStartLine(const ParagraphModel *model)
ParagraphModel Model() const
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
TBOX bounding_box() const
bool lword_likely_starts_idea
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
BOOL8 flag(WERD_FLAGS mask) const
const char * id_to_unichar(UNICHAR_ID id) const
int IndexOf(const ParagraphModel *model) const
void GeometricClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool IsLatinLetter(int ch)
bool get_isdigit(UNICHAR_ID unichar_id) const
void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
const ParagraphModel * model
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
bool FirstWordWouldHaveFit(int row_a, int row_b)
tesseract::ParagraphJustification just
void ModelStrongEvidence(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
bool contains(T object) const
bool rword_likely_starts_idea
bool LikelyListNumeral(const STRING &word)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
const ParagraphModel * UniqueStartHypothesis() const
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
void StrongHypotheses(SetOfModels *models) const
bool is_very_first_or_continuation
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
void StartHypotheses(SetOfModels *models) const
void add(inT32 value, inT32 count)
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
void DiscardUnusedModels(const SetOfModels &used_models)
int OffsideIndent(tesseract::ParagraphJustification just) const
virtual bool Next(PageIteratorLevel level)
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
const STRING & unichar_string() const
void AddBodyLine(const ParagraphModel *model)
POLY_BLOCK * poly_block() const
const ParagraphModel * AddModel(const ParagraphModel &model)
const ParagraphModel * UniqueBodyHypothesis() const
void AssumeLeftJustification()
void Init(const RowInfo &row)
bool lword_indicates_list_item
double ile(double frac) const
bool LikelyListMarkUnicode(int ch)
bool LikelyListMark(const STRING &word)
const char * SkipChars(const char *str, const char *toskip)
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
GenericVector< Cluster > left_tabs
bool get_isupper(UNICHAR_ID unichar_id) const
const UNICHARSET * uch_set
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void SeparateSimpleLeaderLines(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool AnyLtrCharsInWord() const
void ConvertHypothesizedModelRunsToParagraphs(int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA *> *row_owners, ParagraphTheory *theory)
const char * SkipOne(const char *str, const char *toskip)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
bool rword_indicates_list_item
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
ParagraphModel ParagraphModelByOutline(int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
int push_back_new(T object)
void NonCenteredModels(SetOfModels *models)
GenericVector< Cluster > right_tabs
BLOCK_RES * block() const
const ParagraphModel * kCrownLeft
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
void CanonicalizeDetectionResults(GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs)
bool has_drop_cap() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
STRING RtlEmbed(const STRING &word, bool rtlify)