tesseract  4.00.00dev
boxchar.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: boxchar.cpp
3  * Description: Simple class to associate a Tesseract classification unit with
4  * its bounding box so that the boxes can be rotated as the image
5  * is rotated for degradation. Also includes routines to output
6  * the character-tagged boxes to a boxfile.
7  * Author: Ray Smith
8  * Created: Mon Nov 18 2013
9  *
10  * (C) Copyright 2013, Google Inc.
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  * http://www.apache.org/licenses/LICENSE-2.0
15  * Unless required by applicable law or agreed to in writing, software
16  * distributed under the License is distributed on an "AS IS" BASIS,
17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18  * See the License for the specific language governing permissions and
19  * limitations under the License.
20  *
21  **********************************************************************/
22 
23 #include "boxchar.h"
24 
25 #include <stddef.h>
26 #include <algorithm>
27 
28 #include "fileio.h"
29 #include "genericvector.h"
30 #include "ndminx.h"
31 #include "normstrngs.h"
32 #include "tprintf.h"
33 #include "unicharset.h"
34 #include "unicode/uchar.h" // from libicu
35 
// Minimum factor by which |dx| must exceed |dy| (or |dy| exceed |dx|) for the
// step between two consecutive boxes to count as clearly horizontal or
// clearly vertical.
constexpr int kMinNewlineRatio = 5;
38 
39 namespace tesseract {
40 
41 BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) {
42  box_ = nullptr;
43 }
44 
45 BoxChar::~BoxChar() { boxDestroy(&box_); }
46 
47 void BoxChar::AddBox(int x, int y, int width, int height) {
48  box_ = boxCreate(x, y, width, height);
49 }
50 
51 /* static */
52 void BoxChar::TranslateBoxes(int xshift, int yshift,
53  std::vector<BoxChar*>* boxes) {
54  for (size_t i = 0; i < boxes->size(); ++i) {
55  BOX* box = (*boxes)[i]->box_;
56  if (box != nullptr) {
57  box->x += xshift;
58  box->y += yshift;
59  }
60  }
61 }
62 
63 // Prepares for writing the boxes to a file by inserting newlines, spaces,
64 // and re-ordering so the boxes are strictly left-to-right.
65 /* static */
66 void BoxChar::PrepareToWrite(std::vector<BoxChar*>* boxes) {
67  bool rtl_rules = ContainsMostlyRTL(*boxes);
68  bool vertical_rules = MostlyVertical(*boxes);
69  InsertNewlines(rtl_rules, vertical_rules, boxes);
70  InsertSpaces(rtl_rules, vertical_rules, boxes);
71  for (unsigned int i = 0; i < boxes->size(); ++i) {
72  if ((*boxes)[i]->box_ == nullptr) tprintf("Null box at index %u\n", i);
73  }
74  if (rtl_rules) {
75  ReorderRTLText(boxes);
76  }
77 }
78 
79 // Inserts newline (tab) characters into the vector at newline positions.
80 /* static */
81 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
82  std::vector<BoxChar*>* boxes) {
83  int prev_i = -1;
84  int max_shift = 0;
85  for (int i = 0; static_cast<unsigned int>(i) < boxes->size(); ++i) {
86  Box* box = (*boxes)[i]->box_;
87  if (box == nullptr) {
88  if (prev_i < 0 || prev_i < i - 1 || static_cast<unsigned int>(i) + 1 == boxes->size()) {
89  // Erase null boxes at the start of a line and after another null box.
90  do {
91  delete (*boxes)[i];
92  boxes->erase(boxes->begin() + i);
93  --i;
94  } while (i >= 0 && static_cast<unsigned int>(i) + 1 == boxes->size() &&
95  (*boxes)[i]->box_ == nullptr);
96  }
97  continue;
98  }
99  if (prev_i >= 0) {
100  Box* prev_box = (*boxes)[prev_i]->box_;
101  int shift = box->x - prev_box->x;
102  if (vertical_rules) {
103  shift = box->y - prev_box->y;
104  } else if (rtl_rules) {
105  shift = -shift;
106  }
107  if (-shift > max_shift) {
108  // This is a newline.
109  int width = prev_box->w;
110  int height = prev_box->h;
111  int x = prev_box->x + width;
112  int y = prev_box->y;
113  if (vertical_rules) {
114  x = prev_box->x;
115  y = prev_box->y + height;
116  } else if (rtl_rules) {
117  x = prev_box->x - width;
118  if (x < 0) {
119  tprintf("prev x = %d, width=%d\n", prev_box->x, width);
120  x = 0;
121  }
122  }
123  if (prev_i == i - 1) {
124  // New character needed.
125  BoxChar* new_box = new BoxChar("\t", 1);
126  new_box->AddBox(x, y, width, height);
127  new_box->page_ = (*boxes)[i]->page_;
128  boxes->insert(boxes->begin() + i, new_box);
129  ++i;
130  } else {
131  (*boxes)[i - 1]->AddBox(x, y, width, height);
132  (*boxes)[i - 1]->ch_ = "\t";
133  }
134  max_shift = 0;
135  } else if (shift > max_shift) {
136  max_shift = shift;
137  }
138  }
139  prev_i = i;
140  }
141 }
142 
143 // Converts nullptr boxes to space characters, with appropriate bounding boxes.
144 /* static */
145 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
146  std::vector<BoxChar*>* boxes) {
147  // After InsertNewlines, any remaining null boxes are not newlines, and are
148  // singletons, so add a box to each remaining null box.
149  for (int i = 1; static_cast<unsigned int>(i) + 1 < boxes->size(); ++i) {
150  Box* box = (*boxes)[i]->box_;
151  if (box == nullptr) {
152  Box* prev = (*boxes)[i - 1]->box_;
153  Box* next = (*boxes)[i + 1]->box_;
154  ASSERT_HOST(prev != nullptr && next != nullptr);
155  int top = MIN(prev->y, next->y);
156  int bottom = MAX(prev->y + prev->h, next->y + next->h);
157  int left = prev->x + prev->w;
158  int right = next->x;
159  if (vertical_rules) {
160  top = prev->y + prev->h;
161  bottom = next->y;
162  left = MIN(prev->x, next->x);
163  right = MAX(prev->x + prev->w, next->x + next->w);
164  } else if (rtl_rules) {
165  // With RTL we have to account for BiDi.
166  // Right becomes the min left of all prior boxes back to the first
167  // space or newline.
168  right = prev->x;
169  left = next->x + next->w;
170  for (int j = i - 2;
171  j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
172  --j) {
173  prev = (*boxes)[j]->box_;
174  ASSERT_HOST(prev != nullptr);
175  if (prev->x < right) {
176  right = prev->x;
177  }
178  }
179  // Left becomes the max right of all next boxes forward to the first
180  // space or newline.
181  for (size_t j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != nullptr &&
182  (*boxes)[j]->ch_ != "\t";
183  ++j) {
184  next = (*boxes)[j]->box_;
185  if (next->x + next->w > left) {
186  left = next->x + next->w;
187  }
188  }
189  }
190  // Italic and stylized characters can produce negative spaces, which
191  // Leptonica doesn't like, so clip to a positive size.
192  if (right <= left) right = left + 1;
193  if (bottom <= top) bottom = top + 1;
194  (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
195  (*boxes)[i]->ch_ = " ";
196  }
197  }
198 }
199 
200 // Reorders text in a right-to-left script in left-to-right order.
201 /* static */
202 void BoxChar::ReorderRTLText(std::vector<BoxChar*>* boxes) {
203  // After adding newlines and spaces, this task is simply a matter of sorting
204  // by left each group of boxes between newlines.
205  BoxCharPtrSort sorter;
206  size_t end = 0;
207  for (size_t start = 0; start < boxes->size(); start = end + 1) {
208  end = start + 1;
209  while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
210  std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
211  }
212 }
213 
214 // Returns true if the vector contains mostly RTL characters.
215 /* static */
216 bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar*>& boxes) {
217  int num_rtl = 0, num_ltr = 0;
218  for (unsigned int i = 0; i < boxes.size(); ++i) {
219  // Convert the unichar to UTF32 representation
220  GenericVector<char32> uni_vector;
221  if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) {
222  tprintf("Illegal utf8 in boxchar %u string:%s = ", i,
223  boxes[i]->ch_.c_str());
224  for (size_t c = 0; c < boxes[i]->ch_.size(); ++c) {
225  tprintf(" 0x%x", boxes[i]->ch_[c]);
226  }
227  tprintf("\n");
228  continue;
229  }
230  for (int j = 0; j < uni_vector.size(); ++j) {
231  UCharDirection dir = u_charDirection(uni_vector[j]);
232  if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
233  dir == U_ARABIC_NUMBER) {
234  ++num_rtl;
235  } else {
236  ++num_ltr;
237  }
238  }
239  }
240  return num_rtl > num_ltr;
241 }
242 
243 // Returns true if the text is mostly laid out vertically.
244 /* static */
245 bool BoxChar::MostlyVertical(const std::vector<BoxChar*>& boxes) {
246  inT64 total_dx = 0, total_dy = 0;
247  for (size_t i = 1; i < boxes.size(); ++i) {
248  if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
249  boxes[i - 1]->page_ == boxes[i]->page_) {
250  int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
251  int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
252  if (abs(dx) > abs(dy) * kMinNewlineRatio ||
253  abs(dy) > abs(dx) * kMinNewlineRatio) {
254  total_dx += dx * dx;
255  total_dy += dy * dy;
256  }
257  }
258  }
259  return total_dy > total_dx;
260 }
261 
262 // Returns the total length of all the strings in the boxes.
263 /* static */
264 int BoxChar::TotalByteLength(const std::vector<BoxChar*>& boxes) {
265  int total_length = 0;
266  for (size_t i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size();
267  return total_length;
268 }
269 
270 // Rotate the boxes in [start_box, end_box) by the given rotation.
271 // The rotation is in radians clockwise about the given center.
272 /* static */
273 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter,
274  int start_box, int end_box,
275  std::vector<BoxChar*>* boxes) {
276  Boxa* orig = boxaCreate(0);
277  for (int i = start_box; i < end_box; ++i) {
278  BOX* box = (*boxes)[i]->box_;
279  if (box) boxaAddBox(orig, box, L_CLONE);
280  }
281  Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
282  boxaDestroy(&orig);
283  for (int i = start_box, box_ind = 0; i < end_box; ++i) {
284  if ((*boxes)[i]->box_) {
285  boxDestroy(&((*boxes)[i]->box_));
286  (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
287  }
288  }
289  boxaDestroy(&rotated);
290 }
291 
// Size in bytes of the scratch buffer used to format one box-file line.
constexpr int kMaxLineLength = 1024;
293 /* static */
294 void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
295  const std::vector<BoxChar*>& boxes) {
296  string output = GetTesseractBoxStr(height, boxes);
297  File::WriteStringToFileOrDie(output, filename);
298 }
299 
300 /* static */
301 string BoxChar::GetTesseractBoxStr(int height,
302  const std::vector<BoxChar*>& boxes) {
303  string output;
304  char buffer[kMaxLineLength];
305  for (size_t i = 0; i < boxes.size(); ++i) {
306  const Box* box = boxes[i]->box_;
307  if (box == nullptr) {
308  tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
309  return "";
310  }
311  int nbytes =
312  snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
313  boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
314  box->x + box->w, height - box->y, boxes[i]->page_);
315  output.append(buffer, nbytes);
316  }
317  return output;
318 }
319 
320 } // namespace tesseract
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:52
static void ReorderRTLText(std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:202
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:52
static bool MostlyVertical(const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:245
int64_t inT64
Definition: host.h:40
voidpf void uLong size
Definition: ioapi.h:39
static int TotalByteLength(const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:264
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
static void WriteTesseractBoxFile(const string &name, int height, const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:294
#define ASSERT_HOST(x)
Definition: errcode.h:84
const int kMinNewlineRatio
Definition: boxchar.cpp:37
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:81
const Box * box() const
Definition: boxchar.h:48
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
static bool ContainsMostlyRTL(const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:216
static string GetTesseractBoxStr(int height, const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:301
const int kMaxLineLength
Definition: boxchar.cpp:292
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:273
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:41
const char * filename
Definition: ioapi.h:38
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
static void PrepareToWrite(std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:66
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:47
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:145