tesseract  4.00.00dev
shapetable.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.cpp
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Tue Nov 02 15:31:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #include "shapetable.h"
24 
25 #include "bitvector.h"
26 #include "fontinfo.h"
27 #include "intfeaturespace.h"
28 #include "strngs.h"
29 #include "unicharset.h"
30 #include "unicity_table.h"
31 
32 namespace tesseract {
33 
34 // Helper function to get the index of the first result with the required
35 // unichar_id. If the results are sorted by rating, this will also be the
36 // best result with the required unichar_id.
37 // Returns -1 if the unichar_id is not found
39  const GenericVector<ShapeRating>& results,
40  const ShapeTable& shape_table,
41  UNICHAR_ID unichar_id) {
42  for (int r = 0; r < results.size(); ++r) {
43  int shape_id = results[r].shape_id;
44  const Shape& shape = shape_table.GetShape(shape_id);
45  if (shape.ContainsUnichar(unichar_id)) {
46  return r;
47  }
48  }
49  return -1;
50 }
51 
52 // Helper function to get the index of the first result with the required
53 // unichar_id. If the results are sorted by rating, this will also be the
54 // best result with the required unichar_id.
55 // Returns -1 if the unichar_id is not found
57  const GenericVector<UnicharRating>& results,
58  UNICHAR_ID unichar_id) {
59  for (int r = 0; r < results.size(); ++r) {
60  if (results[r].unichar_id == unichar_id)
61  return r;
62  }
63  return -1;
64 }
65 
66 // Writes to the given file. Returns false in case of error.
67 bool UnicharAndFonts::Serialize(FILE* fp) const {
68  if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
69  if (!font_ids.Serialize(fp)) return false;
70  return true;
71 }
72 // Reads from the given file. Returns false in case of error.
73 
75  if (fp->FReadEndian(&unichar_id, sizeof(unichar_id), 1) != 1) return false;
76  if (!font_ids.DeSerialize(fp)) return false;
77  return true;
78 }
79 
80 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
81 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
82  const UnicharAndFonts* p1 = static_cast<const UnicharAndFonts*>(v1);
83  const UnicharAndFonts* p2 = static_cast<const UnicharAndFonts*>(v2);
84  return p1->unichar_id - p2->unichar_id;
85 }
86 
87 // Writes to the given file. Returns false in case of error.
88 bool Shape::Serialize(FILE* fp) const {
89  uinT8 sorted = unichars_sorted_;
90  if (fwrite(&sorted, sizeof(sorted), 1, fp) != 1)
91  return false;
92  if (!unichars_.SerializeClasses(fp)) return false;
93  return true;
94 }
95 // Reads from the given file. Returns false in case of error.
96 
98  uinT8 sorted;
99  if (fp->FRead(&sorted, sizeof(sorted), 1) != 1) return false;
100  unichars_sorted_ = sorted != 0;
101  return unichars_.DeSerializeClasses(fp);
102 }
103 
104 // Adds a font_id for the given unichar_id. If the unichar_id is not
105 // in the shape, it is added.
106 void Shape::AddToShape(int unichar_id, int font_id) {
107  for (int c = 0; c < unichars_.size(); ++c) {
108  if (unichars_[c].unichar_id == unichar_id) {
109  // Found the unichar in the shape table.
110  GenericVector<int>& font_list = unichars_[c].font_ids;
111  for (int f = 0; f < font_list.size(); ++f) {
112  if (font_list[f] == font_id)
113  return; // Font is already there.
114  }
115  font_list.push_back(font_id);
116  return;
117  }
118  }
119  // Unichar_id is not in shape, so add it to shape.
120  unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
121  unichars_sorted_ = unichars_.size() <= 1;
122 }
123 
124 // Adds everything in other to this.
125 void Shape::AddShape(const Shape& other) {
126  for (int c = 0; c < other.unichars_.size(); ++c) {
127  for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
128  AddToShape(other.unichars_[c].unichar_id,
129  other.unichars_[c].font_ids[f]);
130  }
131  }
132  unichars_sorted_ = unichars_.size() <= 1;
133 }
134 
135 // Returns true if the shape contains the given unichar_id, font_id pair.
136 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
137  for (int c = 0; c < unichars_.size(); ++c) {
138  if (unichars_[c].unichar_id == unichar_id) {
139  // Found the unichar, so look for the font.
140  GenericVector<int>& font_list = unichars_[c].font_ids;
141  for (int f = 0; f < font_list.size(); ++f) {
142  if (font_list[f] == font_id)
143  return true;
144  }
145  return false;
146  }
147  }
148  return false;
149 }
150 
151 // Returns true if the shape contains the given unichar_id, ignoring font.
152 bool Shape::ContainsUnichar(int unichar_id) const {
153  for (int c = 0; c < unichars_.size(); ++c) {
154  if (unichars_[c].unichar_id == unichar_id) {
155  return true;
156  }
157  }
158  return false;
159 }
160 
161 // Returns true if the shape contains the given font, ignoring unichar_id.
162 bool Shape::ContainsFont(int font_id) const {
163  for (int c = 0; c < unichars_.size(); ++c) {
164  GenericVector<int>& font_list = unichars_[c].font_ids;
165  for (int f = 0; f < font_list.size(); ++f) {
166  if (font_list[f] == font_id)
167  return true;
168  }
169  }
170  return false;
171 }
172 // Returns true if the shape contains the given font properties, ignoring
173 // unichar_id.
175  uinT32 properties) const {
176  for (int c = 0; c < unichars_.size(); ++c) {
177  GenericVector<int>& font_list = unichars_[c].font_ids;
178  for (int f = 0; f < font_list.size(); ++f) {
179  if (font_table.get(font_list[f]).properties == properties)
180  return true;
181  }
182  }
183  return false;
184 }
185 // Returns true if the shape contains multiple different font properties,
186 // ignoring unichar_id.
188  const FontInfoTable& font_table) const {
189  uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties;
190  for (int c = 0; c < unichars_.size(); ++c) {
191  GenericVector<int>& font_list = unichars_[c].font_ids;
192  for (int f = 0; f < font_list.size(); ++f) {
193  if (font_table.get(font_list[f]).properties != properties)
194  return true;
195  }
196  }
197  return false;
198 }
199 
200 // Returns true if this shape is equal to other (ignoring order of unichars
201 // and fonts).
202 bool Shape::operator==(const Shape& other) const {
203  return IsSubsetOf(other) && other.IsSubsetOf(*this);
204 }
205 
206 // Returns true if this is a subset (including equal) of other.
207 bool Shape::IsSubsetOf(const Shape& other) const {
208  for (int c = 0; c < unichars_.size(); ++c) {
209  int unichar_id = unichars_[c].unichar_id;
210  const GenericVector<int>& font_list = unichars_[c].font_ids;
211  for (int f = 0; f < font_list.size(); ++f) {
212  if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
213  return false;
214  }
215  }
216  return true;
217 }
218 
219 // Returns true if the lists of unichar ids are the same in this and other,
220 // ignoring fonts.
221 // NOT const, as it will sort the unichars on demand.
223  if (unichars_.size() != other->unichars_.size()) return false;
224  if (!unichars_sorted_) SortUnichars();
225  if (!other->unichars_sorted_) other->SortUnichars();
226  for (int c = 0; c < unichars_.size(); ++c) {
227  if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
228  return false;
229  }
230  return true;
231 }
232 
233 // Sorts the unichars_ vector by unichar.
234 void Shape::SortUnichars() {
235  unichars_.sort(UnicharAndFonts::SortByUnicharId);
236  unichars_sorted_ = true;
237 }
238 
239 ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) {
240 }
242  : unicharset_(&unicharset), num_fonts_(0) {
243 }
244 
245 // Writes to the given file. Returns false in case of error.
246 bool ShapeTable::Serialize(FILE* fp) const {
247  if (!shape_table_.Serialize(fp)) return false;
248  return true;
249 }
250 // Reads from the given file. Returns false in case of error.
251 
253  if (!shape_table_.DeSerialize(fp)) return false;
254  num_fonts_ = 0;
255  return true;
256 }
257 
258 // Returns the number of fonts used in this ShapeTable, computing it if
259 // necessary.
260 int ShapeTable::NumFonts() const {
261  if (num_fonts_ <= 0) {
262  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
263  const Shape& shape = *shape_table_[shape_id];
264  for (int c = 0; c < shape.size(); ++c) {
265  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
266  if (shape[c].font_ids[f] >= num_fonts_)
267  num_fonts_ = shape[c].font_ids[f] + 1;
268  }
269  }
270  }
271  }
272  return num_fonts_;
273 }
274 
275 // Re-indexes the class_ids in the shapetable according to the given map.
276 // Useful in conjunction with set_unicharset.
277 void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
278  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
279  Shape* shape = shape_table_[shape_id];
280  for (int c = 0; c < shape->size(); ++c) {
281  shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
282  }
283  }
284 }
285 
286 // Returns a string listing the classes/fonts in a shape.
287 STRING ShapeTable::DebugStr(int shape_id) const {
288  if (shape_id < 0 || shape_id >= shape_table_.size())
289  return STRING("INVALID_UNICHAR_ID");
290  const Shape& shape = GetShape(shape_id);
291  STRING result;
292  result.add_str_int("Shape", shape_id);
293  if (shape.size() > 100) {
294  result.add_str_int(" Num unichars=", shape.size());
295  return result;
296  }
297  for (int c = 0; c < shape.size(); ++c) {
298  result.add_str_int(" c_id=", shape[c].unichar_id);
299  result += "=";
300  result += unicharset_->id_to_unichar(shape[c].unichar_id);
301  if (shape.size() < 10) {
302  result.add_str_int(", ", shape[c].font_ids.size());
303  result += " fonts =";
304  int num_fonts = shape[c].font_ids.size();
305  if (num_fonts > 10) {
306  result.add_str_int(" ", shape[c].font_ids[0]);
307  result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
308  } else {
309  for (int f = 0; f < num_fonts; ++f) {
310  result.add_str_int(" ", shape[c].font_ids[f]);
311  }
312  }
313  }
314  }
315  return result;
316 }
317 
318 // Returns a debug string summarizing the table.
320  int max_unichars = 0;
321  int num_multi_shapes = 0;
322  int num_master_shapes = 0;
323  for (int s = 0; s < shape_table_.size(); ++s) {
324  if (MasterDestinationIndex(s) != s) continue;
325  ++num_master_shapes;
326  int shape_size = GetShape(s).size();
327  if (shape_size > 1)
328  ++num_multi_shapes;
329  if (shape_size > max_unichars)
330  max_unichars = shape_size;
331  }
332  STRING result;
333  result.add_str_int("Number of shapes = ", num_master_shapes);
334  result.add_str_int(" max unichars = ", max_unichars);
335  result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
336  return result;
337 }
338 
339 
340 // Adds a new shape starting with the given unichar_id and font_id.
341 // Returns the assigned index.
342 int ShapeTable::AddShape(int unichar_id, int font_id) {
343  int index = shape_table_.size();
344  Shape* shape = new Shape;
345  shape->AddToShape(unichar_id, font_id);
346  shape_table_.push_back(shape);
347  num_fonts_ = MAX(num_fonts_, font_id + 1);
348  return index;
349 }
350 
351 // Adds a copy of the given shape unless it is already present.
352 // Returns the assigned index or index of existing shape if already present.
353 int ShapeTable::AddShape(const Shape& other) {
354  int index;
355  for (index = 0; index < shape_table_.size() &&
356  !(other == *shape_table_[index]); ++index)
357  continue;
358  if (index == shape_table_.size()) {
359  Shape* shape = new Shape(other);
360  shape_table_.push_back(shape);
361  }
362  num_fonts_ = 0;
363  return index;
364 }
365 
366 // Removes the shape given by the shape index.
367 void ShapeTable::DeleteShape(int shape_id) {
368  delete shape_table_[shape_id];
369  shape_table_[shape_id] = NULL;
370  shape_table_.remove(shape_id);
371 }
372 
373 // Adds a font_id to the given existing shape index for the given
374 // unichar_id. If the unichar_id is not in the shape, it is added.
375 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
376  Shape& shape = *shape_table_[shape_id];
377  shape.AddToShape(unichar_id, font_id);
378  num_fonts_ = MAX(num_fonts_, font_id + 1);
379 }
380 
381 // Adds the given shape to the existing shape with the given index.
382 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
383  Shape& shape = *shape_table_[shape_id];
384  shape.AddShape(other);
385  num_fonts_ = 0;
386 }
387 
388 // Returns the id of the shape that contains the given unichar and font.
389 // If not found, returns -1.
390 // If font_id < 0, the font_id is ignored and the first shape that matches
391 // the unichar_id is returned.
392 int ShapeTable::FindShape(int unichar_id, int font_id) const {
393  for (int s = 0; s < shape_table_.size(); ++s) {
394  const Shape& shape = GetShape(s);
395  for (int c = 0; c < shape.size(); ++c) {
396  if (shape[c].unichar_id == unichar_id) {
397  if (font_id < 0)
398  return s; // We don't care about the font.
399  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
400  if (shape[c].font_ids[f] == font_id)
401  return s;
402  }
403  }
404  }
405  }
406  return -1;
407 }
408 
409 // Returns the first unichar_id and font_id in the given shape.
411  int* unichar_id, int* font_id) const {
412  const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
413  *unichar_id = unichar_and_fonts.unichar_id;
414  *font_id = unichar_and_fonts.font_ids[0];
415 }
416 
417 // Expands all the classes/fonts in the shape individually to build
418 // a ShapeTable.
420  const ShapeTable& master_shapes) {
421  BitVector shape_map(master_shapes.NumShapes());
422  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
423  for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
424  int c = shape[u_ind].unichar_id;
425  int f = shape[u_ind].font_ids[f_ind];
426  int master_id = master_shapes.FindShape(c, f);
427  if (master_id >= 0) {
428  shape_map.SetBit(master_id);
429  } else if (FindShape(c, f) < 0) {
430  AddShape(c, f);
431  }
432  }
433  }
434  int num_masters = 0;
435  for (int s = 0; s < master_shapes.NumShapes(); ++s) {
436  if (shape_map[s]) {
437  AddShape(master_shapes.GetShape(s));
438  ++num_masters;
439  }
440  }
441  return num_masters;
442 }
443 
444 // Returns true if the shapes are already merged.
445 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
446  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
447 }
448 
449 // Returns true if any shape contains multiple unichars.
451  int num_shapes = NumShapes();
452  for (int s1 = 0; s1 < num_shapes; ++s1) {
453  if (MasterDestinationIndex(s1) != s1) continue;
454  if (GetShape(s1).size() > 1)
455  return true;
456  }
457  return false;
458 }
459 
460 // Returns the maximum number of unichars over all shapes.
462  int max_num_unichars = 0;
463  int num_shapes = NumShapes();
464  for (int s = 0; s < num_shapes; ++s) {
465  if (GetShape(s).size() > max_num_unichars)
466  max_num_unichars = GetShape(s).size();
467  }
468  return max_num_unichars;
469 }
470 
471 
472 // Merges shapes with a common unichar over the [start, end) interval.
473 // Assumes single unichar per shape.
474 void ShapeTable::ForceFontMerges(int start, int end) {
475  for (int s1 = start; s1 < end; ++s1) {
476  if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
477  int unichar_id = GetShape(s1)[0].unichar_id;
478  for (int s2 = s1 + 1; s2 < end; ++s2) {
479  if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
480  unichar_id == GetShape(s2)[0].unichar_id) {
481  MergeShapes(s1, s2);
482  }
483  }
484  }
485  }
486  ShapeTable compacted(*unicharset_);
487  compacted.AppendMasterShapes(*this, NULL);
488  *this = compacted;
489 }
490 
491 // Returns the number of unichars in the master shape.
492 int ShapeTable::MasterUnicharCount(int shape_id) const {
493  int master_id = MasterDestinationIndex(shape_id);
494  return GetShape(master_id).size();
495 }
496 
497 // Returns the sum of the font counts in the master shape.
498 int ShapeTable::MasterFontCount(int shape_id) const {
499  int master_id = MasterDestinationIndex(shape_id);
500  const Shape& shape = GetShape(master_id);
501  int font_count = 0;
502  for (int c = 0; c < shape.size(); ++c) {
503  font_count += shape[c].font_ids.size();
504  }
505  return font_count;
506 }
507 
508 // Returns the number of unichars that would result from merging the shapes.
509 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
510  // Do it the easy way for now.
511  int master_id1 = MasterDestinationIndex(shape_id1);
512  int master_id2 = MasterDestinationIndex(shape_id2);
513  Shape combined_shape(*shape_table_[master_id1]);
514  combined_shape.AddShape(*shape_table_[master_id2]);
515  return combined_shape.size();
516 }
517 
518 // Merges two shape_ids, leaving shape_id2 marked as merged.
519 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
520  int master_id1 = MasterDestinationIndex(shape_id1);
521  int master_id2 = MasterDestinationIndex(shape_id2);
522  // Point master_id2 (and all merged shapes) to master_id1.
523  shape_table_[master_id2]->set_destination_index(master_id1);
524  // Add all the shapes of master_id2 to master_id1.
525  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
526 }
527 
528 // Swaps two shape_ids.
529 void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
530  Shape* tmp = shape_table_[shape_id1];
531  shape_table_[shape_id1] = shape_table_[shape_id2];
532  shape_table_[shape_id2] = tmp;
533 }
534 
535 // Returns the destination of this shape, (if merged), taking into account
536 // the fact that the destination may itself have been merged.
537 int ShapeTable::MasterDestinationIndex(int shape_id) const {
538  int dest_id = shape_table_[shape_id]->destination_index();
539  if (dest_id == shape_id || dest_id < 0)
540  return shape_id; // Is master already.
541  int master_id = shape_table_[dest_id]->destination_index();
542  if (master_id == dest_id || master_id < 0)
543  return dest_id; // Dest is the master and shape_id points to it.
544  master_id = MasterDestinationIndex(master_id);
545  return master_id;
546 }
547 
548 // Returns false if the unichars in neither shape is a subset of the other.
549 bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
550  const Shape& shape1 = GetShape(shape_id1);
551  const Shape& shape2 = GetShape(shape_id2);
552  int c1, c2;
553  for (c1 = 0; c1 < shape1.size(); ++c1) {
554  int unichar_id1 = shape1[c1].unichar_id;
555  if (!shape2.ContainsUnichar(unichar_id1))
556  break;
557  }
558  for (c2 = 0; c2 < shape2.size(); ++c2) {
559  int unichar_id2 = shape2[c2].unichar_id;
560  if (!shape1.ContainsUnichar(unichar_id2))
561  break;
562  }
563  return c1 == shape1.size() || c2 == shape2.size();
564 }
565 
566 // Returns false if the unichars in neither shape is a subset of the other.
567 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
568  int shape_id) const {
569  const Shape& merge1 = GetShape(merge_id1);
570  const Shape& merge2 = GetShape(merge_id2);
571  const Shape& shape = GetShape(shape_id);
572  int cm1, cm2, cs;
573  for (cs = 0; cs < shape.size(); ++cs) {
574  int unichar_id = shape[cs].unichar_id;
575  if (!merge1.ContainsUnichar(unichar_id) &&
576  !merge2.ContainsUnichar(unichar_id))
577  break; // Shape is not a subset of the merge.
578  }
579  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
580  int unichar_id1 = merge1[cm1].unichar_id;
581  if (!shape.ContainsUnichar(unichar_id1))
582  break; // Merge is not a subset of shape
583  }
584  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
585  int unichar_id2 = merge2[cm2].unichar_id;
586  if (!shape.ContainsUnichar(unichar_id2))
587  break; // Merge is not a subset of shape
588  }
589  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
590 }
591 
592 // Returns true if the unichar sets are equal between the shapes.
593 bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
594  const Shape& shape1 = GetShape(shape_id1);
595  const Shape& shape2 = GetShape(shape_id2);
596  for (int c1 = 0; c1 < shape1.size(); ++c1) {
597  int unichar_id1 = shape1[c1].unichar_id;
598  if (!shape2.ContainsUnichar(unichar_id1))
599  return false;
600  }
601  for (int c2 = 0; c2 < shape2.size(); ++c2) {
602  int unichar_id2 = shape2[c2].unichar_id;
603  if (!shape1.ContainsUnichar(unichar_id2))
604  return false;
605  }
606  return true;
607 }
608 
609 // Returns true if the unichar sets are equal between the shapes.
610 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
611  int shape_id) const {
612  const Shape& merge1 = GetShape(merge_id1);
613  const Shape& merge2 = GetShape(merge_id2);
614  const Shape& shape = GetShape(shape_id);
615  for (int cs = 0; cs < shape.size(); ++cs) {
616  int unichar_id = shape[cs].unichar_id;
617  if (!merge1.ContainsUnichar(unichar_id) &&
618  !merge2.ContainsUnichar(unichar_id))
619  return false; // Shape has a unichar that appears in neither merge.
620  }
621  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
622  int unichar_id1 = merge1[cm1].unichar_id;
623  if (!shape.ContainsUnichar(unichar_id1))
624  return false; // Merge has a unichar that is not in shape.
625  }
626  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
627  int unichar_id2 = merge2[cm2].unichar_id;
628  if (!shape.ContainsUnichar(unichar_id2))
629  return false; // Merge has a unichar that is not in shape.
630  }
631  return true;
632 }
633 
634 // Returns true if there is a common unichar between the shapes.
635 bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
636  const Shape& shape1 = GetShape(shape_id1);
637  const Shape& shape2 = GetShape(shape_id2);
638  for (int c1 = 0; c1 < shape1.size(); ++c1) {
639  int unichar_id1 = shape1[c1].unichar_id;
640  if (shape2.ContainsUnichar(unichar_id1))
641  return true;
642  }
643  return false;
644 }
645 
646 // Returns true if there is a common font id between the shapes.
647 bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
648  const Shape& shape1 = GetShape(shape_id1);
649  const Shape& shape2 = GetShape(shape_id2);
650  for (int c1 = 0; c1 < shape1.size(); ++c1) {
651  const GenericVector<int>& font_list1 = shape1[c1].font_ids;
652  for (int f = 0; f < font_list1.size(); ++f) {
653  if (shape2.ContainsFont(font_list1[f]))
654  return true;
655  }
656  }
657  return false;
658 }
659 
660 // Appends the master shapes from other to this.
661 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
663  GenericVector<int>* shape_map) {
664  if (shape_map != NULL)
665  shape_map->init_to_size(other.NumShapes(), -1);
666  for (int s = 0; s < other.shape_table_.size(); ++s) {
667  if (other.shape_table_[s]->destination_index() < 0) {
668  int index = AddShape(*other.shape_table_[s]);
669  if (shape_map != NULL)
670  (*shape_map)[s] = index;
671  }
672  }
673 }
674 
675 // Returns the number of master shapes remaining after merging.
677  int num_shapes = 0;
678  for (int s = 0; s < shape_table_.size(); ++s) {
679  if (shape_table_[s]->destination_index() < 0)
680  ++num_shapes;
681  }
682  return num_shapes;
683 }
684 
685 
686 // Adds the unichars of the given shape_id to the vector of results. Any
687 // unichar_id that is already present just has the fonts added to the
688 // font set for that result without adding a new entry in the vector.
689 // NOTE: it is assumed that the results are given to this function in order
690 // of decreasing rating.
691 // The unichar_map vector indicates the index of the results entry containing
692 // each unichar, or -1 if the unichar is not yet included in results.
694  GenericVector<int>* unichar_map,
695  GenericVector<UnicharRating>* results)const {
696  if (shape_rating.joined) {
697  AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map,
698  results);
699  }
700  if (shape_rating.broken) {
701  AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map,
702  results);
703  }
704  const Shape& shape = GetShape(shape_rating.shape_id);
705  for (int u = 0; u < shape.size(); ++u) {
706  int result_index = AddUnicharToResults(shape[u].unichar_id,
707  shape_rating.rating,
708  unichar_map, results);
709  for (int f = 0; f < shape[u].font_ids.size(); ++f) {
710  (*results)[result_index].fonts.push_back(
711  ScoredFont(shape[u].font_ids[f],
712  IntCastRounded(shape_rating.rating * MAX_INT16)));
713  }
714  }
715 }
716 
717 // Adds the given unichar_id to the results if needed, updating unichar_map
718 // and returning the index of unichar in results.
719 int ShapeTable::AddUnicharToResults(
720  int unichar_id, float rating, GenericVector<int>* unichar_map,
721  GenericVector<UnicharRating>* results) const {
722  int result_index = unichar_map->get(unichar_id);
723  if (result_index < 0) {
724  UnicharRating result(unichar_id, rating);
725  result_index = results->push_back(result);
726  (*unichar_map)[unichar_id] = result_index;
727  }
728  return result_index;
729 }
730 
731 
732 } // namespace tesseract
double u[max]
bool operator==(const Shape &other) const
Definition: shapetable.cpp:202
const UNICHARSET & unicharset() const
Definition: shapetable.h:278
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:222
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:662
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:81
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:246
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:74
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:419
int NumMasterShapes() const
Definition: shapetable.cpp:676
int MasterUnicharCount(int shape_id) const
Definition: shapetable.cpp:492
static int FirstResultWithUnichar(const GenericVector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:97
bool SubsetUnichar(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:549
void init_to_size(int size, T t)
int UNICHAR_ID
Definition: unichar.h:33
STRING SummaryStr() const
Definition: shapetable.cpp:319
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:88
bool CommonFont(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:647
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:509
voidpf void uLong size
Definition: ioapi.h:39
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:450
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:287
int MaxNumUnichars() const
Definition: shapetable.cpp:461
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:519
bool EqualUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:593
#define MAX_INT16
Definition: host.h:61
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:207
bool ContainsFontProperties(const FontInfoTable &font_table, uinT32 properties) const
Definition: shapetable.cpp:174
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:187
int push_back(T object)
bool CommonUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:635
bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:610
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:252
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:97
int MasterFontCount(int shape_id) const
Definition: shapetable.cpp:498
int IntCastRounded(double x)
Definition: helpers.h:179
int NumShapes() const
Definition: shapetable.h:275
int size() const
Definition: genericvector.h:72
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:410
bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:567
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:375
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:537
void AddShapeToShape(int shape_id, const Shape &other)
Definition: shapetable.cpp:382
uint32_t uinT32
Definition: host.h:39
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
static int FirstResultWithUnichar(const GenericVector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:56
Definition: strngs.h:45
void ForceFontMerges(int start, int end)
Definition: shapetable.cpp:474
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:162
T & get(int index) const
void ReMapClassIds(const GenericVector< int > &unicharset_map)
Definition: shapetable.cpp:277
#define MAX(x, y)
Definition: ndminx.h:24
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:106
GenericVector< inT32 > font_ids
Definition: shapetable.h:175
void AddShape(const Shape &other)
Definition: shapetable.cpp:125
uint8_t uinT8
Definition: host.h:35
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:136
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:152
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:342
int size() const
Definition: shapetable.h:200
void AddShapeToResults(const ShapeRating &shape_rating, GenericVector< int > *unichar_map, GenericVector< UnicharRating > *results) const
Definition: shapetable.cpp:693
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:209
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:392
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:67
bool AlreadyMerged(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:445
void SwapShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:529
void DeleteShape(int shape_id)
Definition: shapetable.cpp:367
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:108