tesseract  4.00.00dev
tesseract::EquationDetect Class Reference

#include <equationdetect.h>

Inheritance diagram for tesseract::EquationDetect:
tesseract::EquationDetectBase

Public Types

enum  IndentType {
  NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT,
  INDENT_TYPE_COUNT
}
 

Public Member Functions

 EquationDetect (const char *equ_datapath, const char *equ_language)
 
 ~EquationDetect ()
 
void SetLangTesseract (Tesseract *lang_tesseract)
 
int LabelSpecialText (TO_BLOCK *to_block)
 
int FindEquationParts (ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
 
void SetResolution (const int resolution)
 
- Public Member Functions inherited from tesseract::EquationDetectBase
 EquationDetectBase ()
 
virtual ~EquationDetectBase ()
 

Protected Member Functions

void IdentifySpecialText (BLOBNBOX *blob, const int height_th)
 
BlobSpecialTextType EstimateTypeForUnichar (const UNICHARSET &unicharset, const UNICHAR_ID id) const
 
void IdentifySpecialText ()
 
void IdentifyBlobsToSkip (ColPartition *part)
 
void MergePartsByLocation ()
 
void SearchByOverlap (ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
 
void InsertPartAfterAbsorb (ColPartition *part)
 
void IdentifySeedParts ()
 
bool CheckSeedBlobsCount (ColPartition *part)
 
float ComputeForegroundDensity (const TBOX &tbox)
 
bool CheckForSeed2 (const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
 
int CountAlignment (const GenericVector< int > &sorted_vec, const int val) const
 
bool CheckSeedFgDensity (const float density_th, ColPartition *part)
 
void SplitCPHorLite (ColPartition *part, GenericVector< TBOX > *splitted_boxes)
 
void SplitCPHor (ColPartition *part, GenericVector< ColPartition *> *parts_splitted)
 
bool CheckSeedDensity (const float math_density_high, const float math_density_low, const ColPartition *part) const
 
IndentType IsIndented (ColPartition *part)
 
void IdentifyInlineParts ()
 
void ComputeCPsSuperBBox ()
 
void IdentifyInlinePartsHorizontal ()
 
int EstimateTextPartLineSpacing ()
 
void IdentifyInlinePartsVertical (const bool top_to_bottom, const int textPartsLineSpacing)
 
bool IsInline (const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
 
bool ExpandSeed (ColPartition *seed)
 
void ExpandSeedHorizontal (const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
void ExpandSeedVertical (const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
bool IsNearSmallNeighbor (const TBOX &seed_box, const TBOX &part_box) const
 
bool CheckSeedNeighborDensity (const ColPartition *part) const
 
void ProcessMathBlockSatelliteParts ()
 
bool IsMathBlockSatellite (ColPartition *part, GenericVector< ColPartition *> *math_blocks)
 
ColPartition * SearchNNVertical (const bool search_bottom, const ColPartition *part)
 
bool IsNearMathNeighbor (const int y_gap, const ColPartition *neighbor) const
 
void GetOutputTiffName (const char *name, STRING *image_name) const
 
void PaintColParts (const STRING &outfile) const
 
void PaintSpecialTexts (const STRING &outfile) const
 
void PrintSpecialBlobsDensity (const ColPartition *part) const
 

Protected Attributes

Tesseract equ_tesseract_
 
Tesseract * lang_tesseract_
 
ColPartitionGrid * part_grid_
 
ColPartitionSet ** best_columns_
 
TBOX * cps_super_bbox_
 
GenericVector< ColPartition * > cp_seeds_
 
int resolution_
 
int page_count_
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::EquationDetectBase
static void RenderSpecialText (Pix *pix, BLOBNBOX *blob)
 

Detailed Description

Definition at line 43 of file equationdetect.h.

Member Enumeration Documentation

◆ IndentType

Constructor & Destructor Documentation

◆ EquationDetect()

tesseract::EquationDetect::EquationDetect ( const char *  equ_datapath,
const char *  equ_language 
)

Definition at line 105 of file equationdetect.cpp.

106  {
107  const char* default_name = "equ";
108  if (equ_name == NULL) {
109  equ_name = default_name;
110  }
111  lang_tesseract_ = NULL;
112  resolution_ = 0;
113  page_count_ = 0;
114 
115  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
117  tprintf("Warning: equation region detection requested,"
118  " but %s failed to load from %s\n", equ_name, equ_datapath);
119  }
120 
121  cps_super_bbox_ = NULL;
122 }
#define tprintf(...)
Definition: tprintf.h:31
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:295

◆ ~EquationDetect()

tesseract::EquationDetect::~EquationDetect ( )

Definition at line 124 of file equationdetect.cpp.

124  {
125  delete(cps_super_bbox_);
126 }

Member Function Documentation

◆ CheckForSeed2()

bool tesseract::EquationDetect::CheckForSeed2 ( const GenericVector< int > &  indented_texts_left,
const float  foreground_density_th,
ColPartition *  part 
)
protected

Definition at line 743 of file equationdetect.cpp.

746  {
747  ASSERT_HOST(part);
748  const TBOX& box = part->bounding_box();
749 
750  // Check if it is aligned with any indented_texts_left.
751  if (!indented_texts_left.empty() &&
752  CountAlignment(indented_texts_left, box.left()) >=
753  kLeftIndentAlignmentCountTh) {
754  return false;
755  }
756 
757  // Check the foreground density.
758  if (ComputeForegroundDensity(box) > foreground_density_th) {
759  return false;
760  }
761 
762  return true;
763 }
float ComputeForegroundDensity(const TBOX &tbox)
const int kLeftIndentAlignmentCountTh
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
bool empty() const
Definition: genericvector.h:90
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
Definition: rect.h:30

◆ CheckSeedBlobsCount()

bool tesseract::EquationDetect::CheckSeedBlobsCount ( ColPartition *  part)
protected

Definition at line 990 of file equationdetect.cpp.

990  {
991  if (!part) {
992  return false;
993  }
994  const int kSeedMathBlobsCount = 2;
995  const int kSeedMathDigitBlobsCount = 5;
996 
997  int blobs = part->boxes_count(),
998  math_blobs = part->SpecialBlobsCount(BSTT_MATH),
999  digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
1000  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
1001  math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
1002  return false;
1003  }
1004 
1005  return true;
1006 }
const int kSeedBlobsCountTh

◆ CheckSeedDensity()

bool tesseract::EquationDetect::CheckSeedDensity ( const float  math_density_high,
const float  math_density_low,
const ColPartition *  part 
) const
protected

Definition at line 1008 of file equationdetect.cpp.

1011  {
1012  ASSERT_HOST(part);
1013  float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
1014  + part->SpecialBlobsDensity(BSTT_DIGIT);
1015  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
1016  if (math_digit_density > math_density_high) {
1017  return true;
1018  }
1019  if (math_digit_density + italic_density > kMathItalicDensityTh &&
1020  math_digit_density > math_density_low) {
1021  return true;
1022  }
1023 
1024  return false;
1025 }
const float kMathItalicDensityTh
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedFgDensity()

bool tesseract::EquationDetect::CheckSeedFgDensity ( const float  density_th,
ColPartition *  part 
)
protected

Definition at line 631 of file equationdetect.cpp.

632  {
633  ASSERT_HOST(part);
634 
635  // Split part horizontally, and check for each sub part.
636  GenericVector<TBOX> sub_boxes;
637  SplitCPHorLite(part, &sub_boxes);
638  float parts_passed = 0.0;
639  for (int i = 0; i < sub_boxes.size(); ++i) {
640  float density = ComputeForegroundDensity(sub_boxes[i]);
641  if (density < density_th) {
642  parts_passed++;
643  }
644  }
645 
646  // If most sub parts passed, then we return true.
647  const float kSeedPartRatioTh = 0.3;
648  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
649 
650  return retval;
651 }
float ComputeForegroundDensity(const TBOX &tbox)
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedNeighborDensity()

bool tesseract::EquationDetect::CheckSeedNeighborDensity ( const ColPartition *  part) const
protected

Definition at line 1299 of file equationdetect.cpp.

1299  {
1300  ASSERT_HOST(part);
1301  if (part->boxes_count() < kSeedBlobsCountTh) {
1302  // Too few blobs, skip the check.
1303  return true;
1304  }
1305 
1306  // We check the math blobs density and the unclear blobs density.
1307  if (part->SpecialBlobsDensity(BSTT_MATH) +
1308  part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
1309  part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {
1310  return true;
1311  }
1312 
1313  return false;
1314 }
const int kSeedBlobsCountTh
#define ASSERT_HOST(x)
Definition: errcode.h:84
const float kUnclearDensityTh
const float kMathDigitDensityTh1

◆ ComputeCPsSuperBBox()

void tesseract::EquationDetect::ComputeCPsSuperBBox ( )
protected

Definition at line 796 of file equationdetect.cpp.

796  {
797  ColPartitionGridSearch gsearch(part_grid_);
798  ColPartition *part = NULL;
799  gsearch.StartFullSearch();
800  if (cps_super_bbox_) {
801  delete cps_super_bbox_;
802  }
803  cps_super_bbox_ = new TBOX();
804  while ((part = gsearch.NextFullSearch()) != NULL) {
805  (*cps_super_bbox_) += part->bounding_box();
806  }
807 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
Definition: rect.h:30

◆ ComputeForegroundDensity()

float tesseract::EquationDetect::ComputeForegroundDensity ( const TBOX &  tbox)
protected

Definition at line 617 of file equationdetect.cpp.

617  {
618  Pix *pix_bi = lang_tesseract_->pix_binary();
619  int pix_height = pixGetHeight(pix_bi);
620  Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
621  tbox.width(), tbox.height());
622  Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL);
623  l_float32 fract;
624  pixForegroundFraction(pix_sub, &fract);
625  pixDestroy(&pix_sub);
626  boxDestroy(&box);
627 
628  return fract;
629 }
Pix * pix_binary() const
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111

◆ CountAlignment()

int tesseract::EquationDetect::CountAlignment ( const GenericVector< int > &  sorted_vec,
const int  val 
) const
protected

Definition at line 765 of file equationdetect.cpp.

766  {
767  if (sorted_vec.empty()) {
768  return 0;
769  }
770  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
771  int pos = sorted_vec.binary_search(val), count = 0;
772 
773  // Search left side.
774  int index = pos;
775  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
776  count++;
777  }
778 
779  // Search right side.
780  index = pos + 1;
781  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
782  count++;
783  }
784 
785  return count;
786 }
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
int binary_search(const T &target) const
float roundf(float num)
Definition: mathfix.h:35
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ EstimateTextPartLineSpacing()

int tesseract::EquationDetect::EstimateTextPartLineSpacing ( )
protected

Definition at line 874 of file equationdetect.cpp.

874  {
875  ColPartitionGridSearch gsearch(part_grid_);
876 
877  // Get the y gap between text partitions;
878  ColPartition *current = NULL, *prev = NULL;
879  gsearch.StartFullSearch();
880  GenericVector<int> ygaps;
881  while ((current = gsearch.NextFullSearch()) != NULL) {
882  if (!PTIsTextType(current->type())) {
883  continue;
884  }
885  if (prev != NULL) {
886  const TBOX &current_box = current->bounding_box();
887  const TBOX &prev_box = prev->bounding_box();
888  // prev and current should be x major overlap and non y overlap.
889  if (current_box.major_x_overlap(prev_box) &&
890  !current_box.y_overlap(prev_box)) {
891  int gap = current_box.y_gap(prev_box);
892  if (gap < MIN(current_box.height(), prev_box.height())) {
893  // The gap should be smaller than the height of the bounding boxes.
894  ygaps.push_back(gap);
895  }
896  }
897  }
898  prev = current;
899  }
900 
901  if (ygaps.size() < 8) { // We do not have enough data.
902  return -1;
903  }
904 
905  // Compute the line spacing from ygaps: use the mean of the first half.
906  ygaps.sort();
907  int spacing = 0, count;
908  for (count = 0; count < ygaps.size() / 2; count++) {
909  spacing += ygaps[count];
910  }
911  return spacing / count;
912 }
ColPartitionGrid * part_grid_
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
int size() const
Definition: genericvector.h:72
int y_gap(const TBOX &box) const
Definition: rect.h:225
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
inT16 height() const
Definition: rect.h:104
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ EstimateTypeForUnichar()

BlobSpecialTextType tesseract::EquationDetect::EstimateTypeForUnichar ( const UNICHARSET &  unicharset,
const UNICHAR_ID  id 
) const
protected

Definition at line 230 of file equationdetect.cpp.

231  {
232  STRING s = unicharset.id_to_unichar(id);
233  if (unicharset.get_isalpha(id)) {
234  return BSTT_NONE;
235  }
236 
237  if (unicharset.get_ispunctuation(id)) {
238  // Exclude some special texts that are likely to be confused as math symbol.
239  static GenericVector<UNICHAR_ID> ids_to_exclude;
240  if (ids_to_exclude.empty()) {
241  static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
242  "〈", "〉", "《", "》", "」", "「", ""};
243  int i = 0;
244  while (kCharsToEx[i] != "") {
245  ids_to_exclude.push_back(
246  unicharset.unichar_to_id(kCharsToEx[i++].string()));
247  }
248  ids_to_exclude.sort();
249  }
250  return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
251  }
252 
253  // Check if it is digit. In addition to the isdigit attribute, we also check
254  // if this character belongs to those likely to be confused with a digit.
255  static const STRING kDigitsChars = "|";
256  if (unicharset.get_isdigit(id) ||
257  (s.length() == 1 && kDigitsChars.contains(s[0]))) {
258  return BSTT_DIGIT;
259  } else {
260  return BSTT_MATH;
261  }
262 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
int push_back(T object)
bool bool_binary_search(const T &target) const
const char * string() const
Definition: strngs.cpp:198
bool empty() const
Definition: genericvector.h:90
inT32 length() const
Definition: strngs.cpp:193
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
Definition: strngs.h:45
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ ExpandSeed()

bool tesseract::EquationDetect::ExpandSeed ( ColPartition *  seed)
protected

Definition at line 1091 of file equationdetect.cpp.

1091  {
1092  if (seed == NULL || // This seed has been absorbed by other seeds.
1093  seed->IsVerticalType()) { // We skip vertical type right now.
1094  return false;
1095  }
1096 
1097  // Expand in four directions.
1098  GenericVector<ColPartition*> parts_to_merge;
1099  ExpandSeedHorizontal(true, seed, &parts_to_merge);
1100  ExpandSeedHorizontal(false, seed, &parts_to_merge);
1101  ExpandSeedVertical(true, seed, &parts_to_merge);
1102  ExpandSeedVertical(false, seed, &parts_to_merge);
1103  SearchByOverlap(seed, &parts_to_merge);
1104 
1105  if (parts_to_merge.empty()) { // We don't find any partition to merge.
1106  return false;
1107  }
1108 
1109  // Merge all partitions in parts_to_merge with seed. We first remove seed
1110  // from part_grid_ as its bounding box is going to expand. Then we add it
1111  // back after it absorbs all parts_to_merge partitions.
1112  part_grid_->RemoveBBox(seed);
1113  for (int i = 0; i < parts_to_merge.size(); ++i) {
1114  ColPartition* part = parts_to_merge[i];
1115  if (part->type() == PT_EQUATION) {
1116  // If part is in cp_seeds_, then we mark it as NULL so that we won't
1117  // process it again.
1118  for (int j = 0; j < cp_seeds_.size(); ++j) {
1119  if (part == cp_seeds_[j]) {
1120  cp_seeds_[j] = NULL;
1121  break;
1122  }
1123  }
1124  }
1125 
1126  // part has already been removed from part_grid_ in function
1127  // ExpandSeedHorizontal/ExpandSeedVertical.
1128  seed->Absorb(part, NULL);
1129  }
1130 
1131  return true;
1132 }
ColPartitionGrid * part_grid_
GenericVector< ColPartition * > cp_seeds_
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)

◆ ExpandSeedHorizontal()

void tesseract::EquationDetect::ExpandSeedHorizontal ( const bool  search_left,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1134 of file equationdetect.cpp.

1137  {
1138  ASSERT_HOST(seed != NULL && parts_to_merge != NULL);
1139  const float kYOverlapTh = 0.6;
1140  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
1141 
1142  ColPartitionGridSearch search(part_grid_);
1143  const TBOX& seed_box(seed->bounding_box());
1144  int x = search_left ? seed_box.left() : seed_box.right();
1145  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
1146  search.SetUniqueMode(true);
1147 
1148  // Search iteratively.
1149  ColPartition *part = NULL;
1150  while ((part = search.NextSideSearch(search_left)) != NULL) {
1151  if (part == seed) {
1152  continue;
1153  }
1154  const TBOX& part_box(part->bounding_box());
1155  if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.
1156  break;
1157  }
1158 
1159  // Check part location.
1160  if ((part_box.left() >= seed_box.left() && search_left) ||
1161  (part_box.right() <= seed_box.right() && !search_left)) {
1162  continue;
1163  }
1164 
1165  if (part->type() != PT_EQUATION) { // Non-equation type.
1166  // Skip PT_INLINE_EQUATION and non text type.
1167  if (part->type() == PT_INLINE_EQUATION ||
1168  (!IsTextOrEquationType(part->type()) &&
1169  part->blob_type() != BRT_HLINE)) {
1170  continue;
1171  }
1172  // For other types, it should be the near small neighbor of seed.
1173  if (!IsNearSmallNeighbor(seed_box, part_box) ||
1174  !CheckSeedNeighborDensity(part)) {
1175  continue;
1176  }
1177  } else { // Equation type, check the y overlap.
1178  if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1179  seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1180  continue;
1181  }
1182  }
1183 
1184  // Passed the check, delete it from search and add into parts_to_merge.
1185  search.RemoveBBox();
1186  parts_to_merge->push_back(part);
1187  }
1188 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
bool CheckSeedNeighborDensity(const ColPartition *part) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool IsTextOrEquationType(PolyBlockType type)
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30

◆ ExpandSeedVertical()

void tesseract::EquationDetect::ExpandSeedVertical ( const bool  search_bottom,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1190 of file equationdetect.cpp.

1193  {
1194  ASSERT_HOST(seed != NULL && parts_to_merge != NULL &&
1195  cps_super_bbox_ != NULL);
1196  const float kXOverlapTh = 0.4;
1197  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
1198 
1200  const TBOX& seed_box(seed->bounding_box());
1201  int y = search_bottom ? seed_box.bottom() : seed_box.top();
1202  search.StartVerticalSearch(
1204  search.SetUniqueMode(true);
1205 
1206  // Search iteratively.
1207  ColPartition *part = NULL;
1209  int skipped_min_top = INT_MAX, skipped_max_bottom = -1;
1210  while ((part = search.NextVerticalSearch(search_bottom)) != NULL) {
1211  if (part == seed) {
1212  continue;
1213  }
1214  const TBOX& part_box(part->bounding_box());
1215 
1216  if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.
1217  break;
1218  }
1219 
1220  // Check part location.
1221  if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1222  (part_box.top() <= seed_box.top() && !search_bottom)) {
1223  continue;
1224  }
1225 
1226  bool skip_part = false;
1227  if (part->type() != PT_EQUATION) { // Non-equation type.
1228  // Skip PT_INLINE_EQUATION and non text type.
1229  if (part->type() == PT_INLINE_EQUATION ||
1230  (!IsTextOrEquationType(part->type()) &&
1231  part->blob_type() != BRT_HLINE)) {
1232  skip_part = true;
1233  } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
1234  !CheckSeedNeighborDensity(part)) {
1235  // For other types, it should be the near small neighbor of seed.
1236  skip_part = true;
1237  }
1238  } else { // Equation type, check the x overlap.
1239  if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1240  seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1241  skip_part = true;
1242  }
1243  }
1244  if (skip_part) {
1245  if (part->type() != PT_EQUATION) {
1246  if (skipped_min_top > part_box.top()) {
1247  skipped_min_top = part_box.top();
1248  }
1249  if (skipped_max_bottom < part_box.bottom()) {
1250  skipped_max_bottom = part_box.bottom();
1251  }
1252  }
1253  } else {
1254  parts.push_back(part);
1255  }
1256  }
1257 
1258  // For every part in parts, we need verify it is not above skipped_min_top
1259  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
1260  // we will skip a part if it looks like:
1261  // search bottom | search top
1262  // seed: ****************** | part: **********
1263  // skipped: xxx | skipped: xxx
1264  // part: ********** | seed: ***********
1265  for (int i = 0; i < parts.size(); i++) {
1266  const TBOX& part_box(parts[i]->bounding_box());
1267  if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
1268  (!search_bottom && part_box.bottom() >= skipped_min_top)) {
1269  continue;
1270  }
1271  // Add parts[i] into parts_to_merge, and delete it from part_grid_.
1272  parts_to_merge->push_back(parts[i]);
1273  part_grid_->RemoveBBox(parts[i]);
1274  }
1275 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
int size() const
Definition: genericvector.h:72
bool CheckSeedNeighborDensity(const ColPartition *part) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool IsTextOrEquationType(PolyBlockType type)
inT16 top() const
Definition: rect.h:54
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
inT16 bottom() const
Definition: rect.h:61

◆ FindEquationParts()

int tesseract::EquationDetect::FindEquationParts ( ColPartitionGrid *  part_grid,
ColPartitionSet **  best_columns 
)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 364 of file equationdetect.cpp.

365  {
366  if (!lang_tesseract_) {
367  tprintf("Warning: lang_tesseract_ is NULL!\n");
368  return -1;
369  }
370  if (!part_grid || !best_columns) {
371  tprintf("part_grid/best_columns is NULL!!\n");
372  return -1;
373  }
374  cp_seeds_.clear();
375  part_grid_ = part_grid;
376  best_columns_ = best_columns;
378  STRING outfile;
379  page_count_++;
380 
382  GetOutputTiffName("_bi", &outfile);
383  pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
384  }
385 
386  // Pass 0: Compute special text type for blobs.
388 
389  // Pass 1: Merge parts by overlap.
391 
392  // Pass 2: compute the math blob density and find the seed partition.
394  // We still need separate seed into block seed and inline seed partition.
396 
398  GetOutputTiffName("_seed", &outfile);
399  PaintColParts(outfile);
400  }
401 
402  // Pass 3: expand block equation seeds.
403  while (!cp_seeds_.empty()) {
404  GenericVector<ColPartition*> seeds_expanded;
405  for (int i = 0; i < cp_seeds_.size(); ++i) {
406  if (ExpandSeed(cp_seeds_[i])) {
407  // If this seed is expanded, then we add it into seeds_expanded. Note
408  // this seed has been removed from part_grid_ if it is expanded.
409  seeds_expanded.push_back(cp_seeds_[i]);
410  }
411  }
412  // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
413  for (int i = 0; i < seeds_expanded.size(); ++i) {
414  InsertPartAfterAbsorb(seeds_expanded[i]);
415  }
416  cp_seeds_ = seeds_expanded;
417  }
418 
419  // Pass 4: find math block satellite text partitions and merge them.
421 
422  if (equationdetect_save_merged_image) { // For debug.
423  GetOutputTiffName("_merged", &outfile);
424  PaintColParts(outfile);
425  }
426 
427  return 0;
428 }
ColPartitionGrid * part_grid_
void GetOutputTiffName(const char *name, STRING *image_name) const
GenericVector< ColPartition * > cp_seeds_
bool equationdetect_save_bi_image
bool equationdetect_save_merged_image
Pix * pix_binary() const
ColPartitionSet ** best_columns_
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
void PaintColParts(const STRING &outfile) const
const char * string() const
Definition: strngs.cpp:198
bool ExpandSeed(ColPartition *seed)
int size() const
Definition: genericvector.h:72
bool equationdetect_save_seed_image
Definition: strngs.h:45
int source_resolution() const
void InsertPartAfterAbsorb(ColPartition *part)

◆ GetOutputTiffName()

void tesseract::EquationDetect::GetOutputTiffName ( const char *  name,
STRING *  image_name 
) const
protected

Definition at line 1463 of file equationdetect.cpp.

1464  {
1465  ASSERT_HOST(image_name && name);
1466  char page[50];
1467  snprintf(page, sizeof(page), "%04d", page_count_);
1468  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
1469 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: strngs.h:45
STRING imagebasename
Definition: ccutil.h:65

◆ IdentifyBlobsToSkip()

void tesseract::EquationDetect::IdentifyBlobsToSkip ( ColPartition *  part)
protected

Definition at line 316 of file equationdetect.cpp.

316  {
317  ASSERT_HOST(part);
318  BLOBNBOX_C_IT blob_it(part->boxes());
319 
320  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
321  // At this moment, no blob should have been joined.
322  ASSERT_HOST(!blob_it.data()->joined_to_prev());
323  }
324  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
325  BLOBNBOX* blob = blob_it.data();
326  if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
327  continue;
328  }
329  TBOX blob_box = blob->bounding_box();
330 
331  // Search if any blob can be merged into blob. If found, then we mark all
332  // these blobs as BSTT_SKIP.
333  BLOBNBOX_C_IT blob_it2 = blob_it;
334  bool found = false;
335  while (!blob_it2.at_last()) {
336  BLOBNBOX* nextblob = blob_it2.forward();
337  const TBOX& nextblob_box = nextblob->bounding_box();
338  if (nextblob_box.left() >= blob_box.right()) {
339  break;
340  }
341  const float kWidthR = 0.4, kHeightR = 0.3;
342  bool xoverlap = blob_box.major_x_overlap(nextblob_box),
343  yoverlap = blob_box.y_overlap(nextblob_box);
344  float widthR = static_cast<float>(
345  MIN(nextblob_box.width(), blob_box.width())) /
346  MAX(nextblob_box.width(), blob_box.width());
347  float heightR = static_cast<float>(
348  MIN(nextblob_box.height(), blob_box.height())) /
349  MAX(nextblob_box.height(), blob_box.height());
350 
351  if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
352  // Found one, set nextblob type and recompute blob_box.
353  found = true;
354  nextblob->set_special_text_type(BSTT_SKIP);
355  blob_box += nextblob_box;
356  }
357  }
358  if (found) {
360  }
361  }
362 }
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
bool joined_to_prev() const
Definition: blobbox.h:241
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
#define MAX(x, y)
Definition: ndminx.h:24
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
inT16 height() const
Definition: rect.h:104
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:277
const TBOX & bounding_box() const
Definition: blobbox.h:215

◆ IdentifyInlineParts()

void tesseract::EquationDetect::IdentifyInlineParts ( )
protected

Definition at line 788 of file equationdetect.cpp.

788  {
791  int textparts_linespacing = EstimateTextPartLineSpacing();
792  IdentifyInlinePartsVertical(true, textparts_linespacing);
793  IdentifyInlinePartsVertical(false, textparts_linespacing);
794 }
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)

◆ IdentifyInlinePartsHorizontal()

void tesseract::EquationDetect::IdentifyInlinePartsHorizontal ( )
protected

Definition at line 809 of file equationdetect.cpp.

809  {
812  const int kMarginDiffTh = IntCastRounded(
814  const int kGapTh = static_cast<int>(roundf(
817  search.SetUniqueMode(true);
818  // The center x coordinate of the cp_super_bbox_.
819  int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
820  for (int i = 0; i < cp_seeds_.size(); ++i) {
821  ColPartition* part = cp_seeds_[i];
822  const TBOX& part_box(part->bounding_box());
823  int left_margin = part_box.left() - cps_super_bbox_->left(),
824  right_margin = cps_super_bbox_->right() - part_box.right();
825  bool right_to_left;
826  if (left_margin + kMarginDiffTh < right_margin &&
827  left_margin < kMarginDiffTh) {
828  // part is left aligned, so we search if it has any right neighbor.
829  search.StartSideSearch(
830  part_box.right(), part_box.top(), part_box.bottom());
831  right_to_left = false;
832  } else if (left_margin > cps_cx) {
833  // part locates on the right half on image, so search if it has any left
834  // neighbor.
835  search.StartSideSearch(
836  part_box.left(), part_box.top(), part_box.bottom());
837  right_to_left = true;
838  } else { // part is not an inline equation.
839  new_seeds.push_back(part);
840  continue;
841  }
842  ColPartition* neighbor = NULL;
843  bool side_neighbor_found = false;
844  while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) {
845  const TBOX& neighbor_box(neighbor->bounding_box());
846  if (!IsTextOrEquationType(neighbor->type()) ||
847  part_box.x_gap(neighbor_box) > kGapTh ||
848  !part_box.major_y_overlap(neighbor_box) ||
849  part_box.major_x_overlap(neighbor_box)) {
850  continue;
851  }
852  // We have found one. Set the side_neighbor_found flag.
853  side_neighbor_found = true;
854  break;
855  }
856  if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.
857  part->set_type(PT_INLINE_EQUATION);
858  } else {
859  // Check the geometric feature of neighbor.
860  const TBOX& neighbor_box(neighbor->bounding_box());
861  if (neighbor_box.width() > part_box.width() &&
862  neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.
863  part->set_type(PT_INLINE_EQUATION);
864  } else { // part is not an inline equation type.
865  new_seeds.push_back(part);
866  }
867  }
868  }
869 
870  // Reset the cp_seeds_ using the new_seeds.
871  cp_seeds_ = new_seeds;
872 }
ColPartitionGrid * part_grid_
GenericVector< ColPartition * > cp_seeds_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
int IntCastRounded(double x)
Definition: helpers.h:179
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool IsTextOrEquationType(PolyBlockType type)
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30
int source_resolution() const
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111

◆ IdentifyInlinePartsVertical()

void tesseract::EquationDetect::IdentifyInlinePartsVertical ( const bool  top_to_bottom,
const int  textPartsLineSpacing 
)
protected

Definition at line 914 of file equationdetect.cpp.

// Filters cp_seeds_ in place: every seed for which IsInline() returns true is
// retyped as PT_INLINE_EQUATION and dropped; the remaining seeds replace
// cp_seeds_. top_to_bottom selects the sort comparator and, negated, is passed
// to IsInline() as its search_bottom argument.
// NOTE(review): the body spells the second parameter textparts_linespacing,
// while the documented signature says textPartsLineSpacing -- confirm which
// name the definition really uses.
915  {
916  if (cp_seeds_.empty()) {
917  return;
918  }
919 
920  // Sort cp_seeds_.
921  if (top_to_bottom) { // From top to bottom.
922  cp_seeds_.sort(&SortCPByTopReverse);
923  } else { // From bottom to top.
924  cp_seeds_.sort(&SortCPByBottom);
925  }
926 
// NOTE(review): listing line 927 is absent from this page; new_seeds, used
// below, is presumably declared on it -- confirm against equationdetect.cpp.
928  for (int i = 0; i < cp_seeds_.size(); ++i) {
929  ColPartition* part = cp_seeds_[i];
930  // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
931  // for its top neighbors, so that if two/more inline regions are connected
932  // to each other, then we will identify the top one, and then use it to
933  // identify the bottom one.
934  if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
935  part->set_type(PT_INLINE_EQUATION);
936  } else {
937  new_seeds.push_back(part);
938  }
939  }
// Keep only the seeds that were not reclassified as inline equations.
940  cp_seeds_ = new_seeds;
941 }
GenericVector< ColPartition * > cp_seeds_
int push_back(T object)
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)

◆ IdentifySeedParts()

void tesseract::EquationDetect::IdentifySeedParts ( )
protected

Definition at line 545 of file equationdetect.cpp.

// Collects equation seed partitions into cp_seeds_.
// First pass (full grid search over text/equation-typed partitions): each
// partition is put into seeds1, put into seeds2, or used to gather text
// statistics (left edges of left-indented text parts, foreground densities).
// Second pass: seeds1 entries become PT_EQUATION seeds or are typed
// PT_INLINE_EQUATION; seeds2 entries become PT_EQUATION seeds when
// CheckForSeed2() accepts them.
// NOTE(review): listing lines 546, 563, 570 and 600 are absent from this page
// (presumably the gsearch declaration and parts of three conditions), so the
// exact tests applied at those points cannot be read here.
545  {
// NOTE(review): listing line 546 (presumably the declaration of gsearch) is
// missing here.
547  ColPartition *part = NULL;
548  gsearch.StartFullSearch();
549 
550  GenericVector<ColPartition*> seeds1, seeds2;
551  // The left coordinates of indented text partitions.
552  GenericVector<int> indented_texts_left;
553  // The foreground density of text partitions.
554  GenericVector<float> texts_foreground_density;
555  while ((part = gsearch.NextFullSearch()) != NULL) {
556  if (!IsTextOrEquationType(part->type())) {
557  continue;
558  }
559  part->ComputeSpecialBlobsDensity();
560  bool blobs_check = CheckSeedBlobsCount(part);
561  const int kTextBlobsTh = 20;
562 
// NOTE(review): listing line 563 (the start of this if-condition) is missing.
564  blobs_check) {
565  // Passed high density threshold test, save into seeds1.
566  seeds1.push_back(part);
567  } else {
568  IndentType indent = IsIndented(part);
569  if (IsLeftIndented(indent) && blobs_check &&
// NOTE(review): listing line 570 (the rest of this condition) is missing.
571  // Passed low density threshold test and is indented, save into seeds2.
572  seeds2.push_back(part);
573  } else if (!IsRightIndented(indent) &&
574  part->boxes_count() > kTextBlobsTh) {
575  // This is likely to be a text part, save the features.
576  const TBOX&box = part->bounding_box();
577  if (IsLeftIndented(indent)) {
578  indented_texts_left.push_back(box.left());
579  }
580  texts_foreground_density.push_back(ComputeForegroundDensity(box));
581  }
582  }
583  }
584 
585  // Sort the features collected from text regions.
586  indented_texts_left.sort();
587  texts_foreground_density.sort();
588  float foreground_density_th = 0.15; // Default value.
589  if (!texts_foreground_density.empty()) {
590  // Use 0.8 times the middle element (median) of the sorted
591  // texts_foreground_density as the threshold.
591  foreground_density_th = 0.8 * texts_foreground_density[
592  texts_foreground_density.size() / 2];
593  }
594 
595  for (int i = 0; i < seeds1.size(); ++i) {
596  const TBOX& box = seeds1[i]->bounding_box();
597  if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
598  !(IsLeftIndented(IsIndented(seeds1[i])) &&
599  CountAlignment(indented_texts_left, box.left()) >=
// NOTE(review): listing line 600 (right-hand side of the >= and the end of
// the condition) is missing.
601  // Mark as PT_EQUATION type.
602  seeds1[i]->set_type(PT_EQUATION);
603  cp_seeds_.push_back(seeds1[i]);
604  } else { // Mark as PT_INLINE_EQUATION type.
605  seeds1[i]->set_type(PT_INLINE_EQUATION);
606  }
607  }
608 
// seeds2 entries that fail CheckForSeed2() keep their current type.
609  for (int i = 0; i < seeds2.size(); ++i) {
610  if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
611  seeds2[i]->set_type(PT_EQUATION);
612  cp_seeds_.push_back(seeds2[i]);
613  }
614  }
615 }
ColPartitionGrid * part_grid_
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
float ComputeForegroundDensity(const TBOX &tbox)
const int kLeftIndentAlignmentCountTh
GenericVector< ColPartition * > cp_seeds_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
int push_back(T object)
bool CheckSeedBlobsCount(ColPartition *part)
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
inT16 left() const
Definition: rect.h:68
IndentType IsIndented(ColPartition *part)
bool IsTextOrEquationType(PolyBlockType type)
bool IsRightIndented(const EquationDetect::IndentType type)
bool IsLeftIndented(const EquationDetect::IndentType type)
const float kMathDigitDensityTh1
Definition: rect.h:30
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
const float kMathDigitDensityTh2
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)

◆ IdentifySpecialText() [1/2]

void tesseract::EquationDetect::IdentifySpecialText ( BLOBNBOX * blob,
const int  height_th 
)
protected

Definition at line 156 of file equationdetect.cpp.

// Classifies a single blob and records the result through
// set_special_text_type(). A blob whose bounding-box height is below a
// positive height_th is set to BSTT_NONE without running any classifier.
// Otherwise a normalized polygonal copy of the blob is passed to
// AdaptiveClassifier() of both equ_tesseract_ and lang_tesseract_, and the
// certainty of the first choice of each is compared.
// NOTE(review): the body calls the first parameter blobnbox, while the
// documented signature names it blob -- confirm against equationdetect.cpp.
157  {
158  ASSERT_HOST(blobnbox != NULL);
159  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
160  // For small blob, we simply set to BSTT_NONE.
161  blobnbox->set_special_text_type(BSTT_NONE);
162  return;
163  }
164 
165  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
166  C_BLOB* blob = blobnbox->cblob();
167  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
168  // each classifier here, as they may require different PolygonalCopy.
169  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
170  const TBOX& box = tblob->bounding_box();
171 
172  // Normalize the blob. Set the origin to the place we want to be the
173  // bottom-middle, and scaling is to make the height the x-height.
174  float scaling = static_cast<float>(kBlnXHeight) / box.height();
175  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
176  TBLOB* normed_blob = new TBLOB(*tblob);
177  normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
178  0.0f, static_cast<float>(kBlnBaselineOffset),
179  false, NULL);
180  equ_tesseract_.AdaptiveClassifier(normed_blob, &ratings_equ);
181  lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang);
// Both temporary blobs are released here; only the ratings lists are used
// from this point on.
182  delete normed_blob;
183  delete tblob;
184 
185  // Get the best choice from ratings_lang and ratings_equ. As the choice in the
186  // list has already been sorted by the certainty, we simply use the first
187  // choice.
188  BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL;
189  if (ratings_lang.length() > 0) {
190  BLOB_CHOICE_IT choice_it(&ratings_lang);
191  lang_choice = choice_it.data();
192  }
193  if (ratings_equ.length() > 0) {
194  BLOB_CHOICE_IT choice_it(&ratings_equ);
195  equ_choice = choice_it.data();
196  }
197 
// A missing choice is scored as -FLT_MAX so that it always loses the
// comparisons below.
198  float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
199  float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
200 
201  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
202  // The scores here are negative, so the max/min == fabs(min/max).
203  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
204  float diff = fabs(lang_score - equ_score);
// NOTE(review): listing line 205 is absent from this page; the variable type,
// used below, is presumably declared (and initialized) on it.
206 
207  // Classification.
208  if (fmax(lang_score, equ_score) < kConfScoreTh) {
209  // If both scores are very small, then mark it as unclear.
210  type = BSTT_UNCLEAR;
211  } else if (diff > kConfDiffTh && equ_score > lang_score) {
212  // If equ_score is significantly higher, then we classify this character as
213  // math symbol.
214  type = BSTT_MATH;
215  } else if (lang_choice) {
216  // For other cases: lang_score is similar or significantly higher.
217  type = EstimateTypeForUnichar(
218  lang_tesseract_->unicharset, lang_choice->unichar_id());
219  }
220 
// NOTE(review): lang_choice is dereferenced below without a NULL test --
// confirm that type cannot equal BSTT_NONE when ratings_lang was empty.
221  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
222  lang_choice->fontinfo_id()).is_italic()) {
223  // For text symbol, we still check if it is italic.
224  blobnbox->set_special_text_type(BSTT_ITALIC);
225  } else {
226  blobnbox->set_special_text_type(type);
227  }
228 }
#define fmax
Definition: mathfix.h:33
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
const int kBlnXHeight
Definition: normalis.h:28
const int kBlnBaselineOffset
Definition: normalis.h:29
C_BLOB * cblob() const
Definition: blobbox.h:253
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
float certainty() const
Definition: ratngs.h:82
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
inT16 fontinfo_id() const
Definition: ratngs.h:85
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
BlobSpecialTextType
Definition: blobbox.h:81
UNICHARSET unicharset
Definition: ccutil.h:68
Definition: rect.h:30
Definition: blobs.h:261
inT16 height() const
Definition: rect.h:104
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
inT16 right() const
Definition: rect.h:75
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
inT16 bottom() const
Definition: rect.h:61
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
TBOX bounding_box() const
Definition: blobs.cpp:482

◆ IdentifySpecialText() [2/2]

void tesseract::EquationDetect::IdentifySpecialText ( )
protected

Definition at line 264 of file equationdetect.cpp.

// Runs the per-blob IdentifySpecialText(blob, height_th) overload on every
// non-skipped blob of every text/equation-typed partition returned by the
// full grid search. For each partition the height threshold is derived from
// the median height of its non-skipped blobs. When
// equationdetect_save_spt_image is set, a debug image is written at the end.
// NOTE(review): listing lines 272-274, 276, 304 and 306 are absent from this
// page; judging by the surrounding comments they save, zero and later restore
// two classifier multipliers and declare gsearch -- confirm against
// equationdetect.cpp.
264  {
265  // Set configuration for Tesseract::AdaptiveClassifier.
266  equ_tesseract_.tess_cn_matching.set_value(true); // turn it on
267  equ_tesseract_.tess_bn_matching.set_value(false);
268 
269  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
270  int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
271  int classify_integer_matcher =
// NOTE(review): listing lines 272-274 are missing here.
275 
// NOTE(review): listing line 276 (presumably the declaration of gsearch) is
// missing here.
277  ColPartition *part = NULL;
278  gsearch.StartFullSearch();
279  while ((part = gsearch.NextFullSearch()) != NULL) {
280  if (!IsTextOrEquationType(part->type())) {
281  continue;
282  }
283  IdentifyBlobsToSkip(part);
284  BLOBNBOX_C_IT bbox_it(part->boxes());
285  // Compute the height threshold.
286  GenericVector<int> blob_heights;
287  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
288  bbox_it.forward()) {
289  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
290  blob_heights.push_back(bbox_it.data()->bounding_box().height());
291  }
292  }
293  blob_heights.sort();
// Threshold is (median height / 3) * 2, evaluated in integer arithmetic.
// NOTE(review): blob_heights is indexed without an emptiness check; if every
// blob of a partition is BSTT_SKIP the vector is empty -- confirm this cannot
// happen or how GenericVector handles the access.
294  int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
295  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
296  bbox_it.forward()) {
297  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
298  IdentifySpecialText(bbox_it.data(), height_th);
299  }
300  }
301  }
302 
303  // Set the multiplier values back.
// NOTE(review): listing lines 304 and 306 (the calls these arguments belong
// to) are missing here.
305  classify_class_pruner);
307  classify_integer_matcher);
308 
309  if (equationdetect_save_spt_image) { // For debug.
310  STRING outfile;
311  GetOutputTiffName("_spt", &outfile);
312  PaintSpecialTexts(outfile);
313  }
314 }
ColPartitionGrid * part_grid_
void GetOutputTiffName(const char *name, STRING *image_name) const
void IdentifyBlobsToSkip(ColPartition *part)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
int classify_integer_matcher_multiplier
Definition: classify.h:468
int classify_class_pruner_multiplier
Definition: classify.h:464
bool IsTextOrEquationType(PolyBlockType type)
Definition: strngs.h:45
void PaintSpecialTexts(const STRING &outfile) const
bool equationdetect_save_spt_image

◆ InsertPartAfterAbsorb()

void tesseract::EquationDetect::InsertPartAfterAbsorb ( ColPartition * part)
protected

Definition at line 518 of file equationdetect.cpp.

// Re-inserts part into part_grid_ after it has absorbed other partitions.
// SetPartitionType() is called to recompute the partition's attributes, then
// the flow, type and blob type captured beforehand are written back so that
// the recomputation does not change them.
518  {
519  ASSERT_HOST(part);
520 
521  // Before insert part back into part_grid_, we will need re-compute some
522  // of its attributes such as first_column_, last_column_. However, we still
523  // want to preserve its type.
524  BlobTextFlowType flow_type = part->flow();
525  PolyBlockType part_type = part->type();
526  BlobRegionType blob_type = part->blob_type();
527 
528  // Call SetPartitionType to re-compute the attributes of part.
529  const TBOX& part_box(part->bounding_box());
530  int grid_x, grid_y;
// NOTE(review): listing line 531 is absent from this page; it presumably
// starts the GridCoords() call that fills grid_x/grid_y from the bottom-left
// corner of part_box -- confirm against equationdetect.cpp.
532  part_box.left(), part_box.bottom(), &grid_x, &grid_y);
// Only grid_y is used: it selects the entry of best_columns_ passed on.
533  part->SetPartitionType(resolution_, best_columns_[grid_y]);
534 
535  // Reset the types back.
536  part->set_type(part_type);
537  part->set_blob_type(blob_type);
538  part->set_flow(flow_type);
539  part->SetBlobTypes();
540 
541  // Insert into part_grid_.
542  part_grid_->InsertBBox(true, true, part);
543 }
ColPartitionGrid * part_grid_
ColPartitionSet ** best_columns_
PolyBlockType
Definition: publictypes.h:41
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
Definition: rect.h:30
BlobTextFlowType
Definition: blobbox.h:99
BlobRegionType
Definition: blobbox.h:57
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54

◆ IsIndented()

EquationDetect::IndentType tesseract::EquationDetect::IsIndented ( ColPartition * part)
protected

Definition at line 1027 of file equationdetect.cpp.

// Estimates how part is indented relative to text/equation partitions
// directly above or below it, using a radius search around the center of
// part. Returns BOTH_INDENT, LEFT_INDENT, RIGHT_INDENT or NO_INDENT.
// NO_INDENT is also returned immediately when any neighbor lies on the same
// line (major y overlap) within kXGapTh horizontally.
// All three thresholds are scaled from resolution_: 0.5x for the x and y
// gaps and 3.0x for the search radius.
1027  {
1028  ASSERT_HOST(part);
1029 
// NOTE(review): listing line 1030 is absent from this page; the object
// named search, used below, is presumably declared on it.
1031  ColPartition *neighbor = NULL;
1032  const TBOX& part_box(part->bounding_box());
1033  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
1034  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
1035  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
1036 
1037  // Here we use a simple approximation algorithm: from the center of part, we
1038  // perform the radius search, and check if we can find a neighboring partition
1039  // that is located on the top/bottom left of part.
1040  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
1041  (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1042  search.SetUniqueMode(true);
1043  bool left_indented = false, right_indented = false;
// The loop stops early once both flags have been set.
1044  while ((neighbor = search.NextRadSearch()) != NULL &&
1045  (!left_indented || !right_indented)) {
1046  if (neighbor == part) {
1047  continue;
1048  }
1049  const TBOX& neighbor_box(neighbor->bounding_box());
1050 
1051  if (part_box.major_y_overlap(neighbor_box) &&
1052  part_box.x_gap(neighbor_box) < kXGapTh) {
1053  // When this happens, it is likely part is a fragment of an
1054  // over-segmented colpartition. So we return NO_INDENT.
1055  return NO_INDENT;
1056  }
1057 
1058  if (!IsTextOrEquationType(neighbor->type())) {
1059  continue;
1060  }
1061 
1062  // The neighbor should be above/below part, and overlap in x direction.
1063  if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1064  continue;
1065  }
1066 
1067  if (part_box.y_gap(neighbor_box) < kYGapTh) {
// A side counts as indented when the neighbor extends past part on that
// side by more than kXGapTh.
1068  int left_gap = part_box.left() - neighbor_box.left();
1069  int right_gap = neighbor_box.right() - part_box.right();
1070  if (left_gap > kXGapTh) {
1071  left_indented = true;
1072  }
1073  if (right_gap > kXGapTh) {
1074  right_indented = true;
1075  }
1076  }
1077  }
1078 
1079  if (left_indented && right_indented) {
1080  return BOTH_INDENT;
1081  }
1082  if (left_indented) {
1083  return LEFT_INDENT;
1084  }
1085  if (right_indented) {
1086  return RIGHT_INDENT;
1087  }
1088  return NO_INDENT;
1089 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define ASSERT_HOST(x)
Definition: errcode.h:84
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool IsTextOrEquationType(PolyBlockType type)
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30

◆ IsInline()

bool tesseract::EquationDetect::IsInline ( const bool  search_bottom,
const int  textPartsLineSpacing,
ColPartition part 
)
protected

Definition at line 943 of file equationdetect.cpp.

// Returns true when a text-typed vertical neighbor of part (searched in the
// direction given by search_bottom) overlaps part in x, is within the
// line-spacing threshold in y, and has a comparable height (min/max height
// ratio above kHeightRatioTh). The search stops at the first neighbor whose
// y gap exceeds kYGapRatioTh times the smaller of the two box heights.
// NOTE(review): the body spells the second parameter textparts_linespacing,
// while the documented signature says textPartsLineSpacing -- confirm which
// name the definition really uses.
945  {
946  ASSERT_HOST(part != NULL);
947  // Look for its nearest vertical neighbor that hardly overlaps in y but
948  // largely overlaps in x.
// NOTE(review): listing line 949 is absent from this page; the object named
// search, used below, is presumably declared on it.
950  ColPartition *neighbor = NULL;
951  const TBOX& part_box(part->bounding_box());
952  const float kYGapRatioTh = 1.0;
953 
// The vertical search starts from the bottom edge of part when searching
// downwards and from the top edge otherwise.
954  if (search_bottom) {
955  search.StartVerticalSearch(part_box.left(), part_box.right(),
956  part_box.bottom());
957  } else {
958  search.StartVerticalSearch(part_box.left(), part_box.right(),
959  part_box.top());
960  }
961  search.SetUniqueMode(true);
962  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
963  const TBOX& neighbor_box(neighbor->bounding_box());
964  if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
965  MIN(part_box.height(), neighbor_box.height())) {
966  // Finished searching.
967  break;
968  }
969  if (!PTIsTextType(neighbor->type())) {
970  continue;
971  }
972 
973  // Check if neighbor and part are similar enough to be on adjacent lines.
974  const float kHeightRatioTh = 0.5;
// A positive textparts_linespacing is used with a small resolution-scaled
// margin; otherwise a resolution-scaled default applies.
975  const int kYGapTh = textparts_linespacing > 0 ?
976  textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
977  static_cast<int>(roundf(0.05 * resolution_)); // Default value.
978  if (part_box.x_overlap(neighbor_box) && // Location feature.
979  part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
980  // Geo feature.
981  static_cast<float>(MIN(part_box.height(), neighbor_box.height())) /
982  MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
983  return true;
984  }
985  }
986 
987  return false;
988 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define ASSERT_HOST(x)
Definition: errcode.h:84
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
#define MAX(x, y)
Definition: ndminx.h:24
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30
#define MIN(x, y)
Definition: ndminx.h:28
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70

◆ IsMathBlockSatellite()

bool tesseract::EquationDetect::IsMathBlockSatellite ( ColPartition * part,
GenericVector< ColPartition *> *  math_blocks 
)
protected

Definition at line 1365 of file equationdetect.cpp.

// Decides whether part is a satellite of neighboring math blocks.
// math_blocks is cleared on entry. On a true return it holds the nearer
// vertical neighbor and, if it also passes IsNearMathNeighbor(), the farther
// one. Returns false when part sticks out of the horizontal span of its
// vertical neighbors (which includes the case of having no neighbor at all)
// or when the nearer neighbor fails IsNearMathNeighbor().
1366  {
1367  ASSERT_HOST(part != NULL && math_blocks != NULL);
1368  math_blocks->clear();
1369  const TBOX& part_box(part->bounding_box());
1370  // Find the top/bottom nearest neighbor of part.
1371  ColPartition *neighbors[2];
1372  int y_gaps[2] = {INT_MAX, INT_MAX};
1373  // The horizontal boundary of the neighbors.
1374  int neighbors_left = INT_MAX, neighbors_right = 0;
// neighbors[0] comes from SearchNNVertical(false, part) and neighbors[1]
// from SearchNNVertical(true, part).
1375  for (int i = 0; i < 2; ++i) {
1376  neighbors[i] = SearchNNVertical(i != 0, part);
1377  if (neighbors[i]) {
1378  const TBOX& neighbor_box = neighbors[i]->bounding_box();
1379  y_gaps[i] = neighbor_box.y_gap(part_box);
1380  if (neighbor_box.left() < neighbors_left) {
1381  neighbors_left = neighbor_box.left();
1382  }
1383  if (neighbor_box.right() > neighbors_right) {
1384  neighbors_right = neighbor_box.right();
1385  }
1386  }
1387  }
1388  if (neighbors[0] == neighbors[1]) {
1389  // This happens when part is inside neighbor.
// (It is also true when both searches returned NULL; the reset below is
// harmless in that case.)
1390  neighbors[1] = NULL;
1391  y_gaps[1] = INT_MAX;
1392  }
1393 
1394  // Check if part is within [neighbors_left, neighbors_right].
1395  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1396  return false;
1397  }
1398 
1399  // Get the index of the near one in neighbors.
// On equal gaps index 1 is chosen.
1400  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1401 
1402  // Check the near one.
1403  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1404  math_blocks->push_back(neighbors[index]);
1405  } else {
1406  // If the near one failed the check, then we skip checking the far one.
1407  return false;
1408  }
1409 
1410  // Check the far one.
1411  index = 1 - index;
1412  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1413  math_blocks->push_back(neighbors[index]);
1414  }
1415 
1416  return true;
1417 }
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
int y_gap(const TBOX &box) const
Definition: rect.h:225
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)

◆ IsNearMathNeighbor()

bool tesseract::EquationDetect::IsNearMathNeighbor ( const int  y_gap,
const ColPartition * neighbor 
) const
protected

Definition at line 1454 of file equationdetect.cpp.

// Returns true only for a non-NULL neighbor whose type is PT_EQUATION and
// whose y_gap is at most round(0.1 * resolution_).
1455  {
1456  if (!neighbor) {
1457  return false;
1458  }
// The gap threshold scales with resolution_.
1459  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
1460  return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
1461 }
float roundf(float num)
Definition: mathfix.h:35

◆ IsNearSmallNeighbor()

bool tesseract::EquationDetect::IsNearSmallNeighbor ( const TBOX seed_box,
const TBOX part_box 
) const
protected

Definition at line 1277 of file equationdetect.cpp.

// Returns true when part_box is no taller and no wider than seed_box and is
// close to it in at least one direction: vertically (major x overlap with a
// y gap of at most kYGapTh) or horizontally (major y overlap with an x gap of
// at most kXGapTh). Both thresholds scale with resolution_.
1278  {
1279  const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
1280  const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
1281 
1282  // Check geometric feature.
1283  if (part_box.height() > seed_box.height() ||
1284  part_box.width() > seed_box.width()) {
1285  return false;
1286  }
1287 
1288  // Check overlap and distance.
// Reject only when BOTH the vertical and the horizontal closeness tests
// fail.
1289  if ((!part_box.major_x_overlap(seed_box) ||
1290  part_box.y_gap(seed_box) > kYGapTh) &&
1291  (!part_box.major_y_overlap(seed_box) ||
1292  part_box.x_gap(seed_box) > kXGapTh)) {
1293  return false;
1294  }
1295 
1296  return true;
1297 }
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
int y_gap(const TBOX &box) const
Definition: rect.h:225
float roundf(float num)
Definition: mathfix.h:35
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
int x_gap(const TBOX &box) const
Definition: rect.h:217

◆ LabelSpecialText()

int tesseract::EquationDetect::LabelSpecialText ( TO_BLOCK * to_block)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 136 of file equationdetect.cpp.

// Sets the special text type of every blob in to_block->blobs and
// to_block->large_blobs to BSTT_NONE.
// Returns -1 (after printing a warning) when to_block is NULL, 0 otherwise.
136  {
137  if (to_block == NULL) {
138  tprintf("Warning: input to_block is NULL!\n");
139  return -1;
140  }
141 
// NOTE(review): listing line 142 is absent from this page; blob_lists, used
// below, is presumably declared on it.
143  blob_lists.push_back(&(to_block->blobs));
144  blob_lists.push_back(&(to_block->large_blobs));
145  for (int i = 0; i < blob_lists.size(); ++i) {
146  BLOBNBOX_IT bbox_it(blob_lists[i]);
147  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
148  bbox_it.forward()) {
149  bbox_it.data()->set_special_text_type(BSTT_NONE);
150  }
151  }
152 
153  return 0;
154 }
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
BLOBNBOX_LIST blobs
Definition: blobbox.h:768

◆ MergePartsByLocation()

void tesseract::EquationDetect::MergePartsByLocation ( )
protected

Definition at line 430 of file equationdetect.cpp.

// Repeatedly merges overlapping text/equation partitions. Each pass walks
// the whole grid, absorbs into every partition the parts returned for it by
// SearchByOverlap(), and afterwards re-inserts the updated partitions through
// InsertPartAfterAbsorb(). The outer loop ends after the first pass in which
// nothing was merged.
430  {
431  while (true) {
432  ColPartition* part = NULL;
433  // partitions that have been updated.
434  GenericVector<ColPartition*> parts_updated;
// NOTE(review): listing line 435 is absent from this page; gsearch, used
// below, is presumably declared on it.
436  gsearch.StartFullSearch();
437  while ((part = gsearch.NextFullSearch()) != NULL) {
438  if (!IsTextOrEquationType(part->type())) {
439  continue;
440  }
441  GenericVector<ColPartition*> parts_to_merge;
442  SearchByOverlap(part, &parts_to_merge);
443  if (parts_to_merge.empty()) {
444  continue;
445  }
446 
447  // Merge parts_to_merge with part, and remove them from part_grid_.
448  part_grid_->RemoveBBox(part);
449  for (int i = 0; i < parts_to_merge.size(); ++i) {
450  ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part);
451  part->Absorb(parts_to_merge[i], NULL);
452  }
// The grid was modified above, so the search iterator is repositioned
// before continuing.
453  gsearch.RepositionIterator();
454 
455  parts_updated.push_back(part);
456  }
457 
458  if (parts_updated.empty()) { // Exit the loop
459  break;
460  }
461 
462  // Re-insert parts_updated into part_grid_.
463  for (int i = 0; i < parts_updated.size(); ++i) {
464  InsertPartAfterAbsorb(parts_updated[i]);
465  }
466  }
467 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool IsTextOrEquationType(PolyBlockType type)
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
void InsertPartAfterAbsorb(ColPartition *part)

◆ PaintColParts()

void tesseract::EquationDetect::PaintColParts ( const STRING outfile) const
protected

Definition at line 1488 of file equationdetect.cpp.

// Debug helper: draws the bounding box of every partition in the grid onto a
// 32-bit copy of lang_tesseract_->BestPix() and writes the result to outfile
// as an LZW-compressed TIFF. Boxes are drawn with width 5 and color
// (255,0,0) for PT_EQUATION, (0,255,0) for PT_INLINE_EQUATION and (0,0,255)
// for everything else.
1488  {
1489  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
// NOTE(review): listing line 1490 is absent from this page; gsearch, used
// below, is presumably declared on it.
1491  gsearch.StartFullSearch();
1492  ColPartition* part = NULL;
1493  while ((part = gsearch.NextFullSearch()) != NULL) {
1494  const TBOX& tbox = part->bounding_box();
// The y coordinate is flipped against the image height when building the
// Box.
1495  Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
1496  tbox.width(), tbox.height());
1497  if (part->type() == PT_EQUATION) {
1498  pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1499  } else if (part->type() == PT_INLINE_EQUATION) {
1500  pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1501  } else {
1502  pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1503  }
1504  boxDestroy(&box);
1505  }
1506 
1507  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1508  pixDestroy(&pix);
1509 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
const char * string() const
Definition: strngs.cpp:198
inT16 left() const
Definition: rect.h:68
Pix * BestPix() const
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111

◆ PaintSpecialTexts()

void tesseract::EquationDetect::PaintSpecialTexts ( const STRING & outfile) const
protected

Definition at line 1471 of file equationdetect.cpp.

// Debug helper: calls RenderSpecialText() for every blob of every partition
// in the grid on a 32-bit copy of lang_tesseract_->pix_binary(), then writes
// the image to outfile as an LZW-compressed TIFF.
1471  {
1472  Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary();
1473  pix = pixConvertTo32(pixBi);
// NOTE(review): listing line 1474 is absent from this page; gsearch, used
// below, is presumably declared on it.
1475  ColPartition* part = NULL;
1476  gsearch.StartFullSearch();
1477  while ((part = gsearch.NextFullSearch()) != NULL) {
1478  BLOBNBOX_C_IT blob_it(part->boxes());
1479  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1480  RenderSpecialText(pix, blob_it.data());
1481  }
1482  }
1483 
// Only the converted copy is destroyed; pixBi is not released here.
1484  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1485  pixDestroy(&pix);
1486 }
ColPartitionGrid * part_grid_
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
Pix * pix_binary() const
const char * string() const
Definition: strngs.cpp:198

◆ PrintSpecialBlobsDensity()

void tesseract::EquationDetect::PrintSpecialBlobsDensity ( const ColPartition * part) const
protected

Definition at line 1511 of file equationdetect.cpp.

// Debug helper: prints, via tprintf, the vertical extent of part (flipped
// against the height of lang_tesseract_->BestPix()), its bounding box, its
// blob count and the SpecialBlobsDensity() value for every type index from 0
// up to BSTT_COUNT - 1.
1511  {
1512  ASSERT_HOST(part);
1513  TBOX box(part->bounding_box());
1514  int h = pixGetHeight(lang_tesseract_->BestPix());
1515  tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
1516  h - box.top(), h - box.bottom());
1517  box.print();
1518  tprintf("blobs count = %d, density = ", part->boxes_count());
// One "index:density" pair is printed per special text type.
1519  for (int i = 0; i < BSTT_COUNT; ++i) {
1520  BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i);
1521  tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
1522  }
1523  tprintf("\n");
1524 }
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
Pix * BestPix() const
BlobSpecialTextType
Definition: blobbox.h:81
Definition: rect.h:30

◆ ProcessMathBlockSatelliteParts()

void tesseract::EquationDetect::ProcessMathBlockSatelliteParts ( )
protected

Definition at line 1316 of file equationdetect.cpp.

// Merges small text partitions that are satellites of math blocks into those
// blocks. Candidates are the PT_FLOWING_TEXT / PT_HEADING_TEXT partitions
// whose height does not exceed the median candidate height. A candidate
// accepted by IsMathBlockSatellite() is retyped PT_EQUATION, absorbs the
// returned math blocks and is re-inserted into the grid.
1316  {
1317  // Iterate over part_grid_, and find all parts that are text type but not
1318  // equation type.
1319  ColPartition *part = NULL;
1320  GenericVector<ColPartition*> text_parts;
// NOTE(review): listing line 1321 is absent from this page; gsearch, used
// below, is presumably declared on it.
1322  gsearch.StartFullSearch();
1323  while ((part = gsearch.NextFullSearch()) != NULL) {
1324  if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
1325  text_parts.push_back(part);
1326  }
1327  }
1328  if (text_parts.empty()) {
1329  return;
1330  }
1331 
1332  // Compute the median height of the text_parts.
1333  text_parts.sort(&SortCPByHeight);
1334  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
1335  int med_height = text_box.height();
// For an even count, average the two middle heights. The inner text_box
// deliberately shadows the outer one.
1336  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
1337  const TBOX& text_box =
1338  text_parts[text_parts.size() / 2 - 1]->bounding_box();
1339  med_height = static_cast<int>(roundf(
1340  0.5 * (text_box.height() + med_height)));
1341  }
1342 
1343  // Iterate every text_parts and check if it is a math block satellite.
1344  for (int i = 0; i < text_parts.size(); ++i) {
1345  const TBOX& text_box(text_parts[i]->bounding_box());
1346  if (text_box.height() > med_height) {
1347  continue;
1348  }
1349  GenericVector<ColPartition*> math_blocks;
1350  if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
1351  continue;
1352  }
1353 
1354  // Found. merge text_parts[i] with math_blocks.
1355  part_grid_->RemoveBBox(text_parts[i]);
1356  text_parts[i]->set_type(PT_EQUATION);
1357  for (int j = 0; j < math_blocks.size(); ++j) {
1358  part_grid_->RemoveBBox(math_blocks[j]);
1359  text_parts[i]->Absorb(math_blocks[j], NULL);
1360  }
1361  InsertPartAfterAbsorb(text_parts[i]);
1362  }
1363 }
ColPartitionGrid * part_grid_
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition *> *math_blocks)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
float roundf(float num)
Definition: mathfix.h:35
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
void InsertPartAfterAbsorb(ColPartition *part)

◆ SearchByOverlap()

void tesseract::EquationDetect::SearchByOverlap ( ColPartition * seed,
GenericVector< ColPartition *> *  parts_overlap 
)
protected

Definition at line 469 of file equationdetect.cpp.

// Collects into parts_overlap the text/equation partitions around seed that
// should be merged with it, removing each of them from the search as it is
// collected. Does nothing when seed itself is not text/equation typed.
// A partition qualifies when both overlap fractions with seed are at least
// kLargeOverlapTh, or, for a PT_EQUATION seed, when one overlap fraction
// exceeds its kEqu*Overlap threshold and the other is positive.
471  {
472  ASSERT_HOST(seed != NULL && parts_overlap != NULL);
473  if (!IsTextOrEquationType(seed->type())) {
474  return;
475  }
// NOTE(review): listing line 476 is absent from this page; the object named
// search, used below, is presumably declared on it.
477  const TBOX& seed_box(seed->bounding_box());
478  const int kRadNeighborCells = 30;
479  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
480  (seed_box.top() + seed_box.bottom()) / 2,
481  kRadNeighborCells);
482  search.SetUniqueMode(true);
483 
484  // Search iteratively.
485  ColPartition *part;
// NOTE(review): listing line 486 is absent from this page; its content is
// not visible here.
487  const float kLargeOverlapTh = 0.95;
488  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
489  while ((part = search.NextRadSearch()) != NULL) {
490  if (part == seed || !IsTextOrEquationType(part->type())) {
491  continue;
492  }
493  const TBOX& part_box(part->bounding_box());
494  bool merge = false;
495 
496  float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
497  y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
498 
499  // If part is large overlapped with seed, then set merge to true.
500  if (x_overlap_fraction >= kLargeOverlapTh &&
501  y_overlap_fraction >= kLargeOverlapTh) {
502  merge = true;
// The IsTextOrEquationType(part->type()) test below always holds here,
// because non-matching parts were already skipped at the top of the loop.
503  } else if (seed->type() == PT_EQUATION &&
504  IsTextOrEquationType(part->type())) {
505  if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
506  (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
507  merge = true;
508  }
509  }
510 
511  if (merge) { // Remove the part from search and put it into parts.
512  search.RemoveBBox();
513  parts_overlap->push_back(part);
514  }
515  }
516 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
int push_back(T object)
double x_overlap_fraction(const TBOX &box) const
Definition: rect.h:447
#define ASSERT_HOST(x)
Definition: errcode.h:84
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool IsTextOrEquationType(PolyBlockType type)
TBOX
Definition: rect.h:30

◆ SearchNNVertical()

ColPartition * tesseract::EquationDetect::SearchNNVertical ( const bool  search_bottom,
const ColPartition *  part 
)
protected

Definition at line 1419 of file equationdetect.cpp.

1420  {
1421  ASSERT_HOST(part);
1422  ColPartition *nearest_neighbor = NULL, *neighbor = NULL;
1423  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
1424 
1425  ColPartitionGridSearch search(part_grid_);
1426  search.SetUniqueMode(true);
1427  const TBOX& part_box(part->bounding_box());
1428  int y = search_bottom ? part_box.bottom() : part_box.top();
1429  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
1430  int min_y_gap = INT_MAX;
1431  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
1432  if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
1433  continue;
1434  }
1435  const TBOX& neighbor_box(neighbor->bounding_box());
1436  int y_gap = neighbor_box.y_gap(part_box);
1437  if (y_gap > kYGapTh) { // Out of scope.
1438  break;
1439  }
1440  if (!neighbor_box.major_x_overlap(part_box) ||
1441  (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1442  (!search_bottom && neighbor_box.top() < part_box.top())) {
1443  continue;
1444  }
1445  if (y_gap < min_y_gap) {
1446  min_y_gap = y_gap;
1447  nearest_neighbor = neighbor;
1448  }
1449  }
1450 
1451  return nearest_neighbor;
1452 }
ColPartitionGrid * part_grid_
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:932
#define ASSERT_HOST(x)
Definition: errcode.h:84
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
int y_gap(const TBOX &box) const
Definition: rect.h:225
bool IsTextOrEquationType(PolyBlockType type)
float roundf(float num)
Definition: mathfix.h:35
TBOX
Definition: rect.h:30
inT16 bottom() const
Definition: rect.h:61

◆ SetLangTesseract()

void tesseract::EquationDetect::SetLangTesseract ( Tesseract *  lang_tesseract)

Definition at line 128 of file equationdetect.cpp.

128  {
129  lang_tesseract_ = lang_tesseract;
130 }

◆ SetResolution()

void tesseract::EquationDetect::SetResolution ( const int  resolution)

Definition at line 132 of file equationdetect.cpp.

132  {
133  resolution_ = resolution;
134 }

◆ SplitCPHor()

void tesseract::EquationDetect::SplitCPHor ( ColPartition *  part,
GenericVector< ColPartition *> *  parts_splitted 
)
protected

Definition at line 653 of file equationdetect.cpp.

654  {
655  ASSERT_HOST(part && parts_splitted);
656  if (part->median_width() == 0 || part->boxes_count() == 0) {
657  return;
658  }
659 
660  // Make a copy of part, and reset parts_splitted.
661  ColPartition* right_part = part->CopyButDontOwnBlobs();
662  parts_splitted->delete_data_pointers();
663  parts_splitted->clear();
664 
665  const double kThreshold = part->median_width() * 3.0;
666  bool found_split = true;
667  while (found_split) {
668  found_split = false;
669  BLOBNBOX_C_IT box_it(right_part->boxes());
670  // Blobs are sorted left side first. If blobs overlap,
671  // the previous blob may have a "more right" right side.
672  // Account for this by always keeping the largest "right"
673  // so far.
674  int previous_right = MIN_INT32;
675 
676  // Look for the next split in the partition.
677  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
678  const TBOX& box = box_it.data()->bounding_box();
679  if (previous_right != MIN_INT32 &&
680  box.left() - previous_right > kThreshold) {
681  // We have a split position. Split the partition in two pieces.
682  // Insert the left piece in the grid and keep processing the right.
683  int mid_x = (box.left() + previous_right) / 2;
684  ColPartition* left_part = right_part;
685  right_part = left_part->SplitAt(mid_x);
686 
687  parts_splitted->push_back(left_part);
688  left_part->ComputeSpecialBlobsDensity();
689  found_split = true;
690  break;
691  }
692 
693  // The right side of the previous blobs.
694  previous_right = MAX(previous_right, box.right());
695  }
696  }
697 
698  // Add the last piece.
699  right_part->ComputeSpecialBlobsDensity();
700  parts_splitted->push_back(right_part);
701 }
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
#define MIN_INT32
Definition: host.h:70
#define MAX(x, y)
Definition: ndminx.h:24
TBOX
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void delete_data_pointers()

◆ SplitCPHorLite()

void tesseract::EquationDetect::SplitCPHorLite ( ColPartition *  part,
GenericVector< TBOX > *  splitted_boxes 
)
protected

Definition at line 703 of file equationdetect.cpp.

704  {
705  ASSERT_HOST(part && splitted_boxes);
706  splitted_boxes->clear();
707  if (part->median_width() == 0) {
708  return;
709  }
710 
711  const double kThreshold = part->median_width() * 3.0;
712 
713  // Blobs are sorted left side first. If blobs overlap,
714  // the previous blob may have a "more right" right side.
715  // Account for this by always keeping the largest "right"
716  // so far.
717  TBOX union_box;
718  int previous_right = MIN_INT32;
719  BLOBNBOX_C_IT box_it(part->boxes());
720  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
721  const TBOX& box = box_it.data()->bounding_box();
722  if (previous_right != MIN_INT32 &&
723  box.left() - previous_right > kThreshold) {
724  // We have a split position.
725  splitted_boxes->push_back(union_box);
726  previous_right = MIN_INT32;
727  }
728  if (previous_right == MIN_INT32) {
729  union_box = box;
730  } else {
731  union_box += box;
732  }
733  // The right side of the previous blobs.
734  previous_right = MAX(previous_right, box.right());
735  }
736 
737  // Add the last piece.
738  if (previous_right != MIN_INT32) {
739  splitted_boxes->push_back(union_box);
740  }
741 }
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
#define MIN_INT32
Definition: host.h:70
#define MAX(x, y)
Definition: ndminx.h:24
TBOX
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75

Member Data Documentation

◆ best_columns_

ColPartitionSet** tesseract::EquationDetect::best_columns_
protected

Definition at line 261 of file equationdetect.h.

◆ cp_seeds_

GenericVector<ColPartition*> tesseract::EquationDetect::cp_seeds_
protected

Definition at line 267 of file equationdetect.h.

◆ cps_super_bbox_

TBOX* tesseract::EquationDetect::cps_super_bbox_
protected

Definition at line 264 of file equationdetect.h.

◆ equ_tesseract_

Tesseract tesseract::EquationDetect::equ_tesseract_
protected

Definition at line 248 of file equationdetect.h.

◆ lang_tesseract_

Tesseract* tesseract::EquationDetect::lang_tesseract_
protected

Definition at line 252 of file equationdetect.h.

◆ page_count_

int tesseract::EquationDetect::page_count_
protected

Definition at line 273 of file equationdetect.h.

◆ part_grid_

ColPartitionGrid* tesseract::EquationDetect::part_grid_
protected

Definition at line 256 of file equationdetect.h.

◆ resolution_

int tesseract::EquationDetect::resolution_
protected

Definition at line 270 of file equationdetect.h.


The documentation for this class was generated from the following files: