tesseract  4.00.00dev
unicharset.cpp
Go to the documentation of this file.
1 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "unicharset.h"
21 
22 #include <assert.h>
23 #include <stdio.h>
24 #include <string.h>
25 
26 #include "params.h"
27 #include "serialis.h"
28 #include "tesscallback.h"
29 #include "tprintf.h"
30 #include "unichar.h"
31 
// Special character used in representing character fragments.
static const char kSeparator = '|';
// Special character used in representing 'natural' character fragments.
static const char kNaturalFlag = 'n';

// One bit per boolean character property; get_properties() ORs these
// together into a single unsigned value.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate threshold for determining cap-height vs x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// Let C be the number of alpha chars whose tops all exceed
// kMeanlineThreshold, and X the number of alpha chars whose tops are all
// below it. The unicharset "has x-height" if
//   X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction,
// or if more than half the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
55 
56 /*static */
57 const char* UNICHARSET::kCustomLigatures[][2] = {
58  {"ct", "\uE003"}, // c + t -> U+E003
59  {"ſh", "\uE006"}, // long-s + h -> U+E006
60  {"ſi", "\uE007"}, // long-s + i -> U+E007
61  {"ſl", "\uE008"}, // long-s + l -> U+E008
62  {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
63  {NULL, NULL}
64 };
65 
66 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
// NOTE(review): the declaration line that opens this initializer list is
// absent from this listing; only the entries and the closing brace are
// visible. The entries presumably line up with UNICHAR_SPACE, UNICHAR_JOINED
// and UNICHAR_BROKEN in that order - confirm against the enum.
68  " ",
69  "Joined",
70  "|Broken|0|1"
71 };
72 
73 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
74  Init();
75 }
76 
77 // Initialize all properties to sensible default values.
// Every boolean character-class flag starts out false, the ranges are reset
// through SetRangesOpen(), the id-valued fields are zeroed, the normed string
// is emptied and no fragment is attached.
// NOTE(review): listing line 91 is absent from this view, so one assignment
// may be missing between `normed` and `fragment` - confirm against the full
// source.
78 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
79  isalpha = false;
80  islower = false;
81  isupper = false;
82  isdigit = false;
83  ispunctuation = false;
84  isngram = false;
85  enabled = false;
86  SetRangesOpen();  // Top/bottom limits wide open, width/bearing/advance stats zero.
87  script_id = 0;
88  other_case = 0;
89  mirror = 0;
90  normed = "";
92  fragment = NULL;
93 }
94 
95 // Sets all ranges wide open. Initialization default in case there are
96 // no useful values available.
97 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
98  min_bottom = 0;
99  max_bottom = MAX_UINT8;
100  min_top = 0;
101  max_top = MAX_UINT8;
102  width = 0.0f;
103  width_sd = 0.0f;
104  bearing = 0.0f;
105  bearing_sd = 0.0f;
106  advance = 0.0f;
107  advance_sd = 0.0f;
108 }
109 
110 // Sets all ranges to empty. Used before expanding with font-based data.
111 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
112  min_bottom = MAX_UINT8;
113  max_bottom = 0;
114  min_top = MAX_UINT8;
115  max_top = 0;
116  width = 0.0f;
117  width_sd = 0.0f;
118  bearing = 0.0f;
119  bearing_sd = 0.0f;
120  advance = 0.0f;
121  advance_sd = 0.0f;
122 }
123 
124 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
125 // is emtpy.
126 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
127  return width == 0.0f || advance == 0.0f;
128 }
129 
130 // Expands the ranges with the ranges from the src properties.
131 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
132  const UNICHAR_PROPERTIES& src) {
133  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
134  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
135  UpdateRange(src.min_top, &min_top, &max_top);
136  UpdateRange(src.max_top, &min_top, &max_top);
137  if (src.width_sd > width_sd) {
138  width = src.width;
139  width_sd = src.width_sd;
140  }
141  if (src.bearing_sd > bearing_sd) {
142  bearing = src.bearing;
143  bearing_sd = src.bearing_sd;
144  }
145  if (src.advance_sd > advance_sd) {
146  advance = src.advance;
147  advance_sd = src.advance_sd;
148  }
149 }
150 
151 // Copies the properties from src into this.
152 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
153  // Apart from the fragment, everything else can be done with a default copy.
154  CHAR_FRAGMENT* saved_fragment = fragment;
155  *this = src; // Bitwise copy.
156  fragment = saved_fragment;
157 }
158 
// NOTE(review): the line that opens this definition (presumably the
// UNICHARSET constructor header) and listing line 169 are absent from this
// view. What is visible: the member initializers, a clear() call, and a loop
// over the special unichar codes that flags UNICHAR_JOINED as an ngram.
160  unichars(NULL),
161  ids(),
162  size_used(0),
163  size_reserved(0),
164  script_table(NULL),
165  script_table_size_used(0),
166  null_script("NULL") {
167  clear();
168  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
170  if (i == UNICHAR_JOINED)
171  set_isngram(i, true);
172  }
173 }
174 
// NOTE(review): the header of this definition is absent from this view; it
// is presumably the UNICHARSET destructor - confirm. The visible body only
// calls clear().
176  clear();
177 }
178 
179 void UNICHARSET::reserve(int unichars_number) {
180  if (unichars_number > size_reserved) {
181  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
182  for (int i = 0; i < size_used; ++i)
183  unichars_new[i] = unichars[i];
184  for (int j = size_used; j < unichars_number; ++j) {
185  unichars_new[j].properties.script_id = add_script(null_script);
186  }
187  delete[] unichars;
188  unichars = unichars_new;
189  size_reserved = unichars_number;
190  }
191 }
192 
194 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
195  return ids.contains(unichar_repr) ?
196  ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
197 }
198 
199 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
200  int length) const {
201  assert(length > 0 && length <= UNICHAR_LEN);
202  return ids.contains(unichar_repr, length) ?
203  ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
204 }
205 
206 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
207 // while leaving the rest of the string encodable. Returns 0 if the
208 // beginning of the string is not encodable.
209 // WARNING: this function now encodes the whole string for precision.
210 // Use encode_string in preference to repeatedly calling step.
211 int UNICHARSET::step(const char* str) const {
212  GenericVector<UNICHAR_ID> encoding;
213  GenericVector<char> lengths;
214  encode_string(str, true, &encoding, &lengths, NULL);
215  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
216  return lengths[0];
217 }
218 
219 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
220 // If not encodable, write the first byte offset which cannot be converted
221 // into the second (return) argument.
222 bool UNICHARSET::encodable_string(const char *str,
223  int *first_bad_position) const {
224  GenericVector<UNICHAR_ID> encoding;
225  return encode_string(str, true, &encoding, NULL, first_bad_position);
226 }
227 
228 // Encodes the given UTF-8 string with this UNICHARSET.
229 // Returns true if the encoding succeeds completely, false if there is at
230 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
231 // the rest of the string is still encoded.
232 // If lengths is not NULL, then it is filled with the corresponding
233 // byte length of each encoded UNICHAR_ID.
234 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
235  GenericVector<UNICHAR_ID>* encoding,
236  GenericVector<char>* lengths,
237  int* encoded_length) const {
238  GenericVector<UNICHAR_ID> working_encoding;
239  GenericVector<char> working_lengths;
240  GenericVector<char> best_lengths;
241  encoding->truncate(0); // Just in case str is empty.
242  int str_length = strlen(str);
243  int str_pos = 0;
244  bool perfect = true;
245  while (str_pos < str_length) {
246  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
247  &str_pos, encoding, &best_lengths);
248  if (str_pos < str_length) {
249  // This is a non-match. Skip one utf-8 character.
250  perfect = false;
251  if (give_up_on_failure) break;
252  int step = UNICHAR::utf8_step(str + str_pos);
253  if (step == 0) step = 1;
254  encoding->push_back(INVALID_UNICHAR_ID);
255  best_lengths.push_back(step);
256  str_pos += step;
257  working_encoding = *encoding;
258  working_lengths = best_lengths;
259  }
260  }
261  if (lengths != NULL) *lengths = best_lengths;
262  if (encoded_length != NULL) *encoded_length = str_pos;
263  return perfect;
264 }
265 
266 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
267  if (id == INVALID_UNICHAR_ID) {
268  return INVALID_UNICHAR;
269  }
270  ASSERT_HOST(id < this->size());
271  return unichars[id].representation;
272 }
273 
// NOTE(review): the header of this definition is absent from this view.
// Visible behavior: returns INVALID_UNICHAR for INVALID_UNICHAR_ID; for an id
// in the private use area, returns the plain sequence (column 0) of the
// kCustomLigatures row whose code point (column 1) equals the stored
// representation; otherwise returns the stored representation.
275  if (id == INVALID_UNICHAR_ID) {
276  return INVALID_UNICHAR;
277  }
278  ASSERT_HOST(id < this->size());
279  // Resolve from the kCustomLigatures table if this is a private encoding.
280  if (get_isprivate(id)) {
281  const char* ch = id_to_unichar(id);
282  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {  // Table ends at the NULL row.
283  if (!strcmp(ch, kCustomLigatures[i][1])) {
284  return kCustomLigatures[i][0];
285  }
286  }
287  }
288  // Otherwise return the stored representation.
289  return unichars[id].representation;
290 }
291 
292 // Return a STRING that reformats the utf8 str into the str followed
293 // by its hex unicodes.
// The result has the form `<str> [<hex> <hex> ... ]`, one hex value per
// UTF-8 character of str.
// NOTE(review): the signature line is absent from this view; the body reads
// a nul-terminated `str`.
295  STRING result = str;
296  result += " [";
297  int step = 1;
298  // Chop into unicodes and code each as hex.
299  for (int i = 0; str[i] != '\0'; i += step) {
300  char hex[sizeof(int) * 2 + 1];  // Two hex digits per byte of an int, plus the terminator.
301  step = UNICHAR::utf8_step(str + i);
302  if (step == 0) {
// utf8_step returned 0 for this position: emit the single byte as-is and
// advance by one so the loop always makes progress.
// NOTE(review): str[i] is a plain char; where char is signed, a byte >= 0x80
// is printed sign-extended - confirm that is intended.
303  step = 1;
304  sprintf(hex, "%x", str[i]);
305  } else {
306  UNICHAR ch(str + i, step);
307  sprintf(hex, "%x", ch.first_uni());
308  }
309  result += hex;
310  result += " ";
311  }
312  result += "]";
313  return result;
314 }
315 
316 // Return a STRING containing debug information on the unichar, including
317 // the id_to_unichar, its hex unicodes and the properties.
// For INVALID_UNICHAR_ID the result is just id_to_unichar(id); for a fragment
// it is the fragment's to_string(). Otherwise it is debug_utf8_str() of the
// representation followed by one-letter property tags.
// NOTE(review): the signature line is absent from this view.
319  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
320  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
321  if (fragment) {
322  return fragment->to_string();
323  }
324  const char* str = id_to_unichar(id);
325  STRING result = debug_utf8_str(str);
326  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
327  if (get_isalpha(id)) {
328  if (get_islower(id))
329  result += "a";
330  else if (get_isupper(id))
331  result += "A";
332  else
333  result += "x";
334  }
335  // Append 0 if a digit.
336  if (get_isdigit(id)) {
337  result += "0";
338  }
339  // Append p if a punctuation symbol.
340  if (get_ispunctuation(id)) {
341  result += "p";
342  }
343  return result;
344 }
345 
346 // Sets the normed_ids vector from the normed string. normed_ids is not
347 // stored in the file, and needs to be set when the UNICHARSET is loaded.
// NOTE(review): the signature line is absent from this view; the body uses a
// `unichar_id` parameter.
349  unichars[unichar_id].properties.normed_ids.truncate(0);
350  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
// A space in the space slot is its own normalization.
351  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
352  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
353  true, &unichars[unichar_id].properties.normed_ids,
354  NULL, NULL)) {
// The normed string could not be fully encoded with this set: discard the
// partial result and fall back to the id itself.
355  unichars[unichar_id].properties.normed_ids.truncate(0);
356  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
357  }
358 }
359 
360 // Returns whether the unichar id represents a unicode value in the private use
361 // area. We use this range only internally to represent uncommon ligatures
362 // (eg. 'ct') that do not have regular unicode values.
363 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
364  UNICHAR uc(id_to_unichar(unichar_id), -1);
365  int uni = uc.first_uni();
366  return (uni >= 0xE000 && uni <= 0xF8FF);
367 }
368 
369 
370 // Sets all ranges to empty, so they can be expanded to set the values.
// Applies UNICHAR_PROPERTIES::SetRangesEmpty() to every id in use.
// NOTE(review): the signature line is absent from this view.
372  for (int id = 0; id < size_used; ++id) {
373  unichars[id].properties.SetRangesEmpty();
374  }
375 }
376 
377 // Sets all the properties for this unicharset given a src unicharset with
378 // everything set. The unicharsets don't have to be the same, and graphemes
379 // are correctly accounted for.
// Only ids from start_index upwards are processed. An id whose string has no
// properties in src is left untouched.
// NOTE(review): the first line of the signature is absent from this view;
// the body uses a `start_index` value alongside `src`.
381  const UNICHARSET& src) {
382  for (int ch = start_index; ch < size_used; ++ch) {
383  const char* utf8 = id_to_unichar(ch);
384  UNICHAR_PROPERTIES properties;
385  if (src.GetStrProperties(utf8, &properties)) {
386  // Setup the script_id, other_case, and mirror properly.
// GetStrProperties returns ids that refer to src, so each one is translated
// into this set through its string form.
387  const char* script = src.get_script_from_script_id(properties.script_id);
388  properties.script_id = add_script(script);
389  const char* other_case = src.id_to_unichar(properties.other_case);
390  if (contains_unichar(other_case)) {
391  properties.other_case = unichar_to_id(other_case);
392  } else {
393  properties.other_case = ch;  // No counterpart here: the char is its own other case.
394  }
395  const char* mirror_str = src.id_to_unichar(properties.mirror);
396  if (contains_unichar(mirror_str)) {
397  properties.mirror = unichar_to_id(mirror_str);
398  } else {
399  properties.mirror = ch;  // No counterpart here: the char is its own mirror.
400  }
401  unichars[ch].properties.CopyFrom(properties);  // Leaves the fragment pointer as it was.
402  set_normed_ids(ch);
403  }
404  }
405 }
406 
407 // Expands the tops and bottoms and widths for this unicharset given a
408 // src unicharset with ranges in it. The unicharsets don't have to be the
409 // same, and graphemes are correctly accounted for.
// An id whose string has no properties in src is left untouched.
// NOTE(review): the signature line is absent from this view; the body uses
// a `src` unicharset.
411  for (int ch = 0; ch < size_used; ++ch) {
412  const char* utf8 = id_to_unichar(ch);
413  UNICHAR_PROPERTIES properties;
414  if (src.GetStrProperties(utf8, &properties)) {
415  // Expand just the ranges from properties.
416  unichars[ch].properties.ExpandRangesFrom(properties);
417  }
418  }
419 }
420 
421 // Makes this a copy of src. Clears this completely first, so the automatic
422 // ids will not be present in this if not in src. Does NOT reorder the set!
// NOTE(review): the signature line and listing line 433 are absent from this
// view; line 433 is presumably the call that performs the property copy
// announced by the comment just before it - confirm.
424  clear();
425  for (int ch = 0; ch < src.size_used; ++ch) {
426  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
427  const char* utf8 = src.id_to_unichar(ch);
428  unichar_insert(utf8);
429  unichars[ch].properties.ExpandRangesFrom(src_props);
430  }
431  // Set properties, including mirror and other_case, WITHOUT reordering
432  // the unicharset.
434 }
435 
436 // For each id in src, if it does not occur in this, add it, as in
437 // SetPropertiesFromOther, otherwise expand the ranges, as in
438 // ExpandRangesFromOther.
// Entries of src past the special codes whose width or advance is zero are
// reported with tprintf and skipped.
// NOTE(review): the signature line is absent from this view; the body uses
// a `src` unicharset.
440  int initial_used = size_used;  // Ids from here on are the ones added below.
441  for (int ch = 0; ch < src.size_used; ++ch) {
442  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
443  const char* utf8 = src.id_to_unichar(ch);
444  if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
445  // Only use fully valid entries.
446  tprintf("Bad properties for index %d, char %s: "
447  "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
448  ch, utf8, src_props.min_bottom, src_props.max_bottom,
449  src_props.min_top, src_props.max_top,
450  src_props.width, src_props.width_sd,
451  src_props.bearing, src_props.bearing_sd,
452  src_props.advance, src_props.advance_sd);
453  continue;
454  }
455  int id = size_used;  // Index a newly inserted unichar would receive.
456  if (contains_unichar(utf8)) {
457  id = unichar_to_id(utf8);
458  // Just expand current ranges.
459  unichars[id].properties.ExpandRangesFrom(src_props);
460  } else {
461  unichar_insert(utf8);
462  unichars[id].properties.SetRangesEmpty();
463  }
464  }
465  // Set properties, including mirror and other_case, WITHOUT reordering
466  // the unicharset.
467  PartialSetPropertiesFromOther(initial_used, src);
468 }
469 
470 // Returns true if the acceptable ranges of the tops of the characters do
471 // not overlap, making their x-height calculations distinct.
// The overlap is the smaller max_top minus the larger min_top; a value of
// zero or less counts as distinct.
// NOTE(review): the signature line is absent from this view; the body uses
// two ids, `id1` and `id2`.
473  int overlap = MIN(unichars[id1].properties.max_top,
474  unichars[id2].properties.max_top) -
475  MAX(unichars[id1].properties.min_top,
476  unichars[id2].properties.min_top);
477  return overlap <= 0;
478 }
479 
480 // Internal recursive version of encode_string above.
481 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
482 // each UNICHAR_ID uses the least possible part of the utf8 str.
483 // It does this by depth-first tail recursion on increasing length matches
484 // to the UNICHARSET, saving the first encountered result that encodes the
485 // maximum total length of str. It stops on a failure to encode to make
486 // the overall process of encoding a partially failed string more efficient.
487 // See unicharset.h for definition of the args.
// encoding/lengths hold the candidate being built on the current path;
// best_encoding/best_lengths/*best_total_length hold the longest result
// found so far and are only overwritten by a strictly longer one.
488 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
489  GenericVector<UNICHAR_ID>* encoding,
490  GenericVector<char>* lengths,
491  int* best_total_length,
492  GenericVector<UNICHAR_ID>* best_encoding,
493  GenericVector<char>* best_lengths) const {
494  if (str_index > *best_total_length) {
495  // This is the best result so far.
496  *best_total_length = str_index;
497  *best_encoding = *encoding;
498  if (best_lengths != NULL)
499  *best_lengths = *lengths;
500  }
501  if (str_index == str_length) return;  // Whole string consumed on this path.
502  int encoding_index = encoding->size();  // Size to truncate back to when backtracking.
503  // Find the length of the first matching unicharset member.
504  int length = ids.minmatch(str + str_index);
505  if (length == 0 || str_index + length > str_length) return;
506  do {
507  if (ids.contains(str + str_index, length)) {
508  // Successful encoding so far.
509  UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
510  encoding->push_back(id);
511  lengths->push_back(length);
512  encode_string(str, str_index + length, str_length, encoding, lengths,
513  best_total_length, best_encoding, best_lengths);
514  if (*best_total_length == str_length)
515  return; // Tail recursion success!
516  // Failed with that length, truncate back and try again.
517  encoding->truncate(encoding_index);
518  lengths->truncate(encoding_index);
519  }
// Lengthen the candidate by one more utf-8 character (at least one byte).
520  int step = UNICHAR::utf8_step(str + str_index + length);
521  if (step == 0) step = 1;
522  length += step;
523  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
524 }
525 
526 // Gets the properties for a grapheme string, combining properties for
527 // multiple characters in a meaningful way where possible.
528 // Returns false if no valid match was found in the unicharset.
529 // NOTE that script_id, mirror, and other_case refer to this unicharset on
530 // return and will need translation if the target unicharset is different.
531 bool UNICHARSET::GetStrProperties(const char* utf8_str,
532  UNICHAR_PROPERTIES* props) const {
533  props->Init();
534  props->SetRangesEmpty();
535  int total_unicodes = 0;
536  GenericVector<UNICHAR_ID> encoding;
537  if (!encode_string(utf8_str, true, &encoding, NULL, NULL))
538  return false; // Some part was invalid.
539  for (int i = 0; i < encoding.size(); ++i) {
540  int id = encoding[i];
541  const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
542  // Logical OR all the bools.
543  if (src_props.isalpha) props->isalpha = true;
544  if (src_props.islower) props->islower = true;
545  if (src_props.isupper) props->isupper = true;
546  if (src_props.isdigit) props->isdigit = true;
547  if (src_props.ispunctuation) props->ispunctuation = true;
548  if (src_props.isngram) props->isngram = true;
549  if (src_props.enabled) props->enabled = true;
550  // Min/max the tops/bottoms.
551  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
552  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
553  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
554  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
555  float bearing = props->advance + src_props.bearing;
556  if (total_unicodes == 0 || bearing < props->bearing) {
557  props->bearing = bearing;
558  props->bearing_sd = props->advance_sd + src_props.bearing_sd;
559  }
560  props->advance += src_props.advance;
561  props->advance_sd += src_props.advance_sd;
562  // With a single width, just use the widths stored in the unicharset.
563  props->width = src_props.width;
564  props->width_sd = src_props.width_sd;
565  // Use the first script id, other_case, mirror, direction.
566  // Note that these will need translation, except direction.
567  if (total_unicodes == 0) {
568  props->script_id = src_props.script_id;
569  props->other_case = src_props.other_case;
570  props->mirror = src_props.mirror;
571  props->direction = src_props.direction;
572  }
573  // The normed string for the compound character is the concatenation of
574  // the normed versions of the individual characters.
575  props->normed += src_props.normed;
576  ++total_unicodes;
577  }
578  if (total_unicodes > 1) {
579  // Estimate the total widths from the advance - bearing.
580  props->width = props->advance - props->bearing;
581  props->width_sd = props->advance_sd + props->bearing_sd;
582  }
583  return total_unicodes > 0;
584 }
585 
586 // TODO(rays) clean-up the order of functions to match unicharset.h.
587 
588 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
589  unsigned int properties = 0;
590  if (this->get_isalpha(id))
591  properties |= ISALPHA_MASK;
592  if (this->get_islower(id))
593  properties |= ISLOWER_MASK;
594  if (this->get_isupper(id))
595  properties |= ISUPPER_MASK;
596  if (this->get_isdigit(id))
597  properties |= ISDIGIT_MASK;
598  if (this->get_ispunctuation(id))
599  properties |= ISPUNCTUATION_MASK;
600  return properties;
601 }
602 
// Returns a one-character class tag for id: 'A' upper case, 'a' lower case,
// 'x' other alpha, '0' digit, 'p' punctuation, or 0 if none apply. The tests
// run in that order, so the first matching class wins.
// NOTE(review): the signature line is absent from this view.
604  if (this->get_isupper(id)) return 'A';
605  if (this->get_islower(id)) return 'a';
606  if (this->get_isalpha(id)) return 'x';
607  if (this->get_isdigit(id)) return '0';
608  if (this->get_ispunctuation(id)) return 'p';
609  return 0;
610 }
611 
612 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
613  if (!ids.contains(unichar_repr)) {
614  if (strlen(unichar_repr) > UNICHAR_LEN) {
615  fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
616  int(strlen(unichar_repr)), unichar_repr);
617  return;
618  }
619  if (size_used == size_reserved) {
620  if (size_used == 0)
621  reserve(8);
622  else
623  reserve(2 * size_used);
624  }
625 
626  strcpy(unichars[size_used].representation, unichar_repr);
627  this->set_script(size_used, null_script);
628  // If the given unichar_repr represents a fragmented character, set
629  // fragment property to a pointer to CHAR_FRAGMENT class instance with
630  // information parsed from the unichar representation. Use the script
631  // of the base unichar for the fragmented character if possible.
632  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
633  this->unichars[size_used].properties.fragment = frag;
634  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
635  this->unichars[size_used].properties.script_id =
636  this->get_script(frag->get_unichar());
637  }
638  this->unichars[size_used].properties.enabled = true;
639  ids.insert(unichar_repr, size_used);
640  ++size_used;
641  }
642 }
643 
644 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
645  return ids.contains(unichar_repr);
646 }
647 
648 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
649  int length) const {
650  if (length == 0) {
651  return false;
652  }
653  return ids.contains(unichar_repr, length);
654 }
655 
656 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
657  const char* const unichar_repr) const {
658  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
659 }
660 
// Writes the set into *str as text: first a line with the number of
// unichars, then one line per id. The space unichar is written as "NULL"
// with only its properties, script and other_case; every other unichar gets
// the full record followed by a tab and a '#' debug comment.
// Always returns true.
// NOTE(review): the signature line is absent from this view; the body writes
// through a `str` pointer.
662  const int kFileBufSize = 1024;
663  char buffer[kFileBufSize + 1];
664  snprintf(buffer, kFileBufSize, "%d\n", this->size());
665  *str = buffer;
666  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
667  int min_bottom, max_bottom, min_top, max_top;
668  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
669  float width, width_sd;
670  get_width_stats(id, &width, &width_sd);
671  float bearing, bearing_sd;
672  get_bearing_stats(id, &bearing, &bearing_sd);
673  float advance, advance_sd;
674  get_advance_stats(id, &advance, &advance_sd);
675  unsigned int properties = this->get_properties(id);
676  if (strcmp(this->id_to_unichar(id), " ") == 0) {
// load_via_fgets maps the "NULL" token back to " ".
677  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
678  this->get_script_from_script_id(this->get_script(id)),
679  this->get_other_case(id));
680  } else {
681  snprintf(buffer, kFileBufSize,
682  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
683  this->id_to_unichar(id), properties,
684  min_bottom, max_bottom, min_top, max_top, width, width_sd,
685  bearing, bearing_sd, advance, advance_sd,
686  this->get_script_from_script_id(this->get_script(id)),
687  this->get_other_case(id), this->get_direction(id),
688  this->get_mirror(id), this->get_normed_unichar(id),
689  this->debug_str(id).string());
690  }
691  *str += buffer;
692  }
693  return true;
694 }
695 
// TODO(rays) Replace with TFile everywhere.
// Read cursor over a caller-supplied memory buffer, offering a line-at-a-time
// fgets() in the style of the C library function. The buffer is not copied,
// so it has to outlive this object.
class InMemoryFilePointer {
 public:
  // memory points at mem_size readable bytes; reading starts at the first.
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes into orig_dst until a newline has been copied, size - 1
  // bytes have been copied, or the buffer is exhausted, then nul-terminates.
  // Returns orig_dst, or NULL if no byte was copied. With size < 1 nothing is
  // written and the result only says whether unread data remains.
  char *fgets(char *orig_dst, int size) {
    const char *src_end = memory_ + mem_size_;
    if (size < 1) {
      return fgets_ptr_ < src_end ? orig_dst : NULL;
    }
    // Formed only after the size check, so a non-positive size never produces
    // a pointer in front of orig_dst.
    char *dst_end = orig_dst + size - 1;

    char *dst = orig_dst;
    char ch = '^';  // Any value other than '\n' lets the loop start.
    while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
      ch = *dst++ = *fgets_ptr_++;
    }
    *dst = 0;
    return (dst == orig_dst) ? NULL : orig_dst;
  }

 private:
  const char *memory_;     // Start of the buffer.
  const char *fgets_ptr_;  // Next byte to hand out.
  const int mem_size_;     // Number of bytes in the buffer.
};
723 
// Loads the set from a text image held in memory by wrapping it in an
// InMemoryFilePointer and handing a callback to load_via_fgets. Returns the
// result of load_via_fgets.
// NOTE(review): the first line of the signature and the lines that create
// fgets_cb are absent from this view.
725  const char *memory, int mem_size, bool skip_fragments) {
726  InMemoryFilePointer mem_fp(memory, mem_size);
729  bool success = load_via_fgets(fgets_cb, skip_fragments);
730  delete fgets_cb;  // The callback object is released here, after its only use.
731  return success;
732 }
733 
// Thin adapter that gives a FILE* stream the same fgets(dst, size) member
// as InMemoryFilePointer. The stream is borrowed: it is neither opened nor
// closed here.
class LocalFilePointer {
 public:
  // explicit: a FILE* should not silently convert into an adapter.
  explicit LocalFilePointer(FILE *stream) : fp_(stream) {}
  // Forwards to the C library fgets on the wrapped stream.
  char *fgets(char *dst, int size) {
    return ::fgets(dst, size, fp_);
  }
 private:
  FILE *fp_;  // Stream being read.
};
743 
// Loads the set from an open stdio stream by wrapping it in a
// LocalFilePointer and handing a callback to load_via_fgets. Returns the
// result of load_via_fgets. The stream is not closed here.
// NOTE(review): the lines that create fgets_cb are absent from this view.
744 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
745  LocalFilePointer lfp(file);
748  bool success = load_via_fgets(fgets_cb, skip_fragments);
749  delete fgets_cb;  // The callback object is released here, after its only use.
750  return success;
751 }
752 
// Loads the set from a tesseract::TFile by handing a callback to
// load_via_fgets. Returns the result of load_via_fgets.
// NOTE(review): the lines that create fgets_cb are absent from this view.
753 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
756  bool success = load_via_fgets(fgets_cb, skip_fragments);
757  delete fgets_cb;  // The callback object is released here, after its only use.
758  return success;
759 }
760 
// Rebuilds this set from its text form, read one line at a time through
// fgets_cb. The first line holds the number of entries; each following line
// describes one unichar. Several line layouts are accepted, from the full
// 17-field record down to just "<unichar> <properties>"; fields a line does
// not supply keep the defaults assigned at the top of the loop.
// When skip_fragments is true, entries that parse as fragments with more
// than one piece are not inserted.
// Returns false if a line cannot be read or parsed, true otherwise.
// NOTE(review): the parameter line declaring fgets_cb and listing line 791
// (presumably the declaration of `direction`) are absent from this view.
761 bool UNICHARSET::load_via_fgets(
763  bool skip_fragments) {
764  int unicharset_size;
765  char buffer[256];
766 
767  this->clear();
768  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
769  sscanf(buffer, "%d", &unicharset_size) != 1) {
770  return false;
771  }
772  this->reserve(unicharset_size);
773  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
774  char unichar[256];
775  unsigned int properties;
776  char script[64];
777 
778  strcpy(script, null_script);
779  int min_bottom = 0;
780  int max_bottom = MAX_UINT8;
781  int min_top = 0;
782  int max_top = MAX_UINT8;
783  float width = 0.0f;
784  float width_sd = 0.0f;
785  float bearing = 0.0f;
786  float bearing_sd = 0.0f;
787  float advance = 0.0f;
788  float advance_sd = 0.0f;
789  // TODO(eger): check that this default it ok
790  // after enabling BiDi iterator for Arabic+Cube.
792  UNICHAR_ID other_case = id;
793  UNICHAR_ID mirror = id;
794  char normed[64];
795  int v = -1;
// Try the layouts from richest to poorest. v ends up as the field count of
// the first layout that matched completely, and is used further down to
// decide which optional fields were actually read.
796  if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
797  ((v = sscanf(buffer,
798  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
799  unichar, &properties,
800  &min_bottom, &max_bottom, &min_top, &max_top,
801  &width, &width_sd, &bearing, &bearing_sd,
802  &advance, &advance_sd, script, &other_case,
803  &direction, &mirror, normed)) != 17 &&
804  (v = sscanf(buffer,
805  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
806  unichar, &properties,
807  &min_bottom, &max_bottom, &min_top, &max_top,
808  &width, &width_sd, &bearing, &bearing_sd,
809  &advance, &advance_sd, script, &other_case,
810  &direction, &mirror)) != 16 &&
811  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
812  unichar, &properties,
813  &min_bottom, &max_bottom, &min_top, &max_top,
814  script, &other_case, &direction, &mirror)) != 10 &&
815  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
816  &min_bottom, &max_bottom, &min_top, &max_top,
817  script, &other_case)) != 8 &&
818  (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
819  script, &other_case)) != 4 &&
820  (v = sscanf(buffer, "%s %x %63s",
821  unichar, &properties, script)) != 3 &&
822  (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
823  return false;
824  }
825 
826  // Skip fragments if needed.
827  CHAR_FRAGMENT *frag = NULL;
828  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
829  int num_pieces = frag->get_total();
830  delete frag;
831  // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
// NOTE(review): after a skip the loop still increments id while
// unichar_insert keeps appending at size_used, so for later entries the
// set_*(id, ...) calls below address a different slot than the one just
// inserted - confirm this is intended.
832  if (num_pieces > 1)
833  continue;
834  }
835  // Insert unichar into unicharset and set its properties.
// The token "NULL" stands for the space unichar.
836  if (strcmp(unichar, "NULL") == 0)
837  this->unichar_insert(" ");
838  else
839  this->unichar_insert(unichar);
840 
841  this->set_isalpha(id, properties & ISALPHA_MASK);
842  this->set_islower(id, properties & ISLOWER_MASK);
843  this->set_isupper(id, properties & ISUPPER_MASK);
844  this->set_isdigit(id, properties & ISDIGIT_MASK);
845  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
846  this->set_isngram(id, false);
847  this->set_script(id, script);
848  this->unichars[id].properties.enabled = true;
849  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
850  this->set_width_stats(id, width, width_sd);
851  this->set_bearing_stats(id, bearing, bearing_sd);
852  this->set_advance_stats(id, advance, advance_sd);
853  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
854  ASSERT_HOST(other_case < unicharset_size);
855  this->set_other_case(id, (v>3) ? other_case : id);  // other_case is only read by layouts with at least 4 fields.
856  ASSERT_HOST(mirror < unicharset_size);
857  this->set_mirror(id, (v>8) ? mirror : id);  // mirror is only read by layouts with at least 10 fields.
858  this->set_normed(id, (v>16) ? normed : unichar);  // normed is only read by the 17-field layout.
859  }
860  post_load_setup();
861  return true;
862 }
863 
864 // Sets up internal data after loading the file, based on the char
865 // properties. Called from load_from_file, but also needs to be run
866 // during set_unicharset_properties.
// Derives top_bottom_set_, script_has_upper_lower_ and script_has_xheight_,
// refreshes normed_ids for every id, caches the ids of well-known scripts
// and picks default_sid_.
// NOTE(review): the signature line is absent from this view.
868  // Number of alpha chars with the case property minus those without,
869  // in order to determine that half the alpha chars have case.
870  int net_case_alphas = 0;
871  int x_height_alphas = 0;
872  int cap_height_alphas = 0;
873  top_bottom_set_ = false;
874  for (UNICHAR_ID id = 0; id < size_used; ++id) {
875  int min_bottom = 0;
876  int max_bottom = MAX_UINT8;
877  int min_top = 0;
878  int max_top = MAX_UINT8;
879  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
880  if (min_top > 0)
881  top_bottom_set_ = true;  // Any non-zero min_top is taken as evidence that real values were loaded.
882  if (get_isalpha(id)) {
883  if (get_islower(id) || get_isupper(id))
884  ++net_case_alphas;
885  else
886  --net_case_alphas;
887  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
888  ++x_height_alphas;
889  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
890  ++cap_height_alphas;
891  }
892  set_normed_ids(id);
893  }
894 
895  script_has_upper_lower_ = net_case_alphas > 0;
896  script_has_xheight_ = script_has_upper_lower_ ||
897  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
898  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
899 
900  null_sid_ = get_script_id_from_name(null_script);
901  ASSERT_HOST(null_sid_ == 0);
902  common_sid_ = get_script_id_from_name("Common");
903  latin_sid_ = get_script_id_from_name("Latin");
904  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
905  greek_sid_ = get_script_id_from_name("Greek");
906  han_sid_ = get_script_id_from_name("Han");
907  hiragana_sid_ = get_script_id_from_name("Hiragana");
908  katakana_sid_ = get_script_id_from_name("Katakana");
909  thai_sid_ = get_script_id_from_name("Thai");
910  hangul_sid_ = get_script_id_from_name("Hangul");
911 
912  // Compute default script. Use the highest-counting alpha script, that is
913  // not the common script, as that still contains some "alphas".
914  int* script_counts = new int[script_table_size_used];
915  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
916  for (int id = 0; id < size_used; ++id) {
917  if (get_isalpha(id)) {
918  ++script_counts[get_script(id)];
919  }
920  }
921  default_sid_ = 0;
// Strict '>' keeps the lowest script id among equal counts.
922  for (int s = 1; s < script_table_size_used; ++s) {
923  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
924  default_sid_ = s;
925  }
926  delete [] script_counts;
927 }
928 
929 // Returns true if right_to_left scripts are significant in the unicharset,
930 // but without being so sensitive that "universal" unicharsets containing
931 // characters from many scripts, like orientation and script detection,
932 // look like they are right_to_left.
934  int ltr_count = 0;
935  int rtl_count = 0;
936  for (int id = 0; id < size_used; ++id) {
937  int dir = get_direction(id);
938  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
939  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
941  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
942  }
943  return rtl_count > ltr_count;
944 }
945 
946 // Set a whitelist and/or blacklist of characters to recognize.
947 // An empty or NULL whitelist enables everything (minus any blacklist).
948 // An empty or NULL blacklist disables nothing.
949 // An empty or NULL blacklist has no effect.
950 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
951  const char* whitelist,
952  const char* unblacklist) {
953  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
954  // Set everything to default
955  for (int ch = 0; ch < size_used; ++ch)
956  unichars[ch].properties.enabled = def_enabled;
957  if (!def_enabled) {
958  // Enable the whitelist.
959  GenericVector<UNICHAR_ID> encoding;
960  encode_string(whitelist, false, &encoding, NULL, NULL);
961  for (int i = 0; i < encoding.size(); ++i) {
962  if (encoding[i] != INVALID_UNICHAR_ID)
963  unichars[encoding[i]].properties.enabled = true;
964  }
965  }
966  if (blacklist != NULL && blacklist[0] != '\0') {
967  // Disable the blacklist.
968  GenericVector<UNICHAR_ID> encoding;
969  encode_string(blacklist, false, &encoding, NULL, NULL);
970  for (int i = 0; i < encoding.size(); ++i) {
971  if (encoding[i] != INVALID_UNICHAR_ID)
972  unichars[encoding[i]].properties.enabled = false;
973  }
974  }
975  if (unblacklist != NULL && unblacklist[0] != '\0') {
976  // Re-enable the unblacklist.
977  GenericVector<UNICHAR_ID> encoding;
978  encode_string(unblacklist, false, &encoding, NULL, NULL);
979  for (int i = 0; i < encoding.size(); ++i) {
980  if (encoding[i] != INVALID_UNICHAR_ID)
981  unichars[encoding[i]].properties.enabled = true;
982  }
983  }
984 }
985 
986 // Returns true if there are any repeated unicodes in the normalized
987 // text of any unichar-id in the unicharset.
989  int start_id = 0;
991  for (int id = start_id; id < size_used; ++id) {
992  // Convert to unicodes.
993  GenericVector<int> unicodes;
994  if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
995  unicodes.size() > 1) {
996  for (int u = 1; u < unicodes.size(); ++u) {
997  if (unicodes[u - 1] == unicodes[u]) return true;
998  }
999  }
1000  }
1001  return false;
1002 }
1003 
1004 int UNICHARSET::add_script(const char* script) {
1005  for (int i = 0; i < script_table_size_used; ++i) {
1006  if (strcmp(script, script_table[i]) == 0)
1007  return i;
1008  }
1009  if (script_table_size_reserved == 0) {
1010  script_table_size_reserved = 8;
1011  script_table = new char*[script_table_size_reserved];
1012  } else if (script_table_size_used >= script_table_size_reserved) {
1013  assert(script_table_size_used == script_table_size_reserved);
1014  script_table_size_reserved += script_table_size_reserved;
1015  char** new_script_table = new char*[script_table_size_reserved];
1016  memcpy(new_script_table, script_table, script_table_size_used * sizeof(char*));
1017  delete[] script_table;
1018  script_table = new_script_table;
1019  }
1020  script_table[script_table_size_used] = new char[strlen(script) + 1];
1021  strcpy(script_table[script_table_size_used], script);
1022  return script_table_size_used++;
1023 }
1024 
1025 // Returns the string that represents a fragment
1026 // with the given unichar, pos and total.
1027 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1028  bool natural) {
1029  if (total == 1) return STRING(unichar);
1030  STRING result = "";
1031  result += kSeparator;
1032  result += unichar;
1033  char buffer[kMaxLen];
1034  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1035  natural ? kNaturalFlag : kSeparator, total);
1036  result += buffer;
1037  return result;
1038 }
1039 
1041  const char *ptr = string;
1042  int len = strlen(string);
1043  if (len < kMinLen || *ptr != kSeparator) {
1044  return NULL; // this string can not represent a fragment
1045  }
1046  ptr++; // move to the next character
1047  int step = 0;
1048  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1049  step += UNICHAR::utf8_step(ptr + step);
1050  }
1051  if (step == 0 || step > UNICHAR_LEN) {
1052  return NULL; // no character for unichar or the character is too long
1053  }
1054  char unichar[UNICHAR_LEN + 1];
1055  strncpy(unichar, ptr, step);
1056  unichar[step] = '\0'; // null terminate unichar
1057  ptr += step; // move to the next fragment separator
1058  int pos = 0;
1059  int total = 0;
1060  bool natural = false;
1061  char *end_ptr = NULL;
1062  for (int i = 0; i < 2; i++) {
1063  if (ptr > string + len || *ptr != kSeparator) {
1064  if (i == 1 && *ptr == kNaturalFlag)
1065  natural = true;
1066  else
1067  return NULL; // Failed to parse fragment representation.
1068  }
1069  ptr++; // move to the next character
1070  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1071  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1072  ptr = end_ptr;
1073  }
1074  if (ptr != string + len) {
1075  return NULL; // malformed fragment representation
1076  }
1077  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
1078  fragment->set_all(unichar, pos, total, natural);
1079  return fragment;
1080 }
1081 
1082 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
1083  for (int i = 0; i < script_table_size_used; ++i) {
1084  if (strcmp(script_name, script_table[i]) == 0)
1085  return i;
1086  }
1087  return 0; // 0 is always the null_script
1088 }
double u[max]
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:144
STRING to_string() const
Definition: unicharset.h:73
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:401
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:52
InMemoryFilePointer(const char *memory, int mem_size)
Definition: unicharset.cpp:699
voidpf stream
Definition: ioapi.h:39
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:567
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int UNICHAR_ID
Definition: unichar.h:33
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
const char * get_unichar() const
Definition: unicharset.h:64
#define MAX_UINT8
Definition: host.h:63
void reserve(int unichars_number)
Definition: unicharset.cpp:179
voidpf void uLong size
Definition: ioapi.h:39
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:736
virtual R Run(A1, A2)=0
const double kMinCapHeightFraction
Definition: unicharset.cpp:54
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:814
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
bool empty() const
Definition: genericvector.h:90
void truncate(int size)
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:222
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:294
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:442
int size() const
Definition: genericvector.h:72
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:661
static STRING to_string(const char *unichar, int pos, int total, bool natural)
#define UNICHAR_LEN
Definition: unichar.h:30
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:650
#define ASSERT_HOST(x)
Definition: errcode.h:84
void clear()
Definition: unicharset.h:265
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:556
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:600
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:411
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:432
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
void set_ranges_empty()
Definition: unicharset.cpp:371
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:274
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:590
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:643
char * fgets(char *dst, int size)
Definition: unicharset.cpp:737
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
void post_load_setup()
Definition: unicharset.cpp:867
Definition: strngs.h:45
bool has_special_codes() const
Definition: unicharset.h:682
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:657
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:348
int get_total() const
Definition: unicharset.h:66
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:702
int add_script(const char *script)
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:588
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:950
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:603
int first_uni() const
Definition: unichar.cpp:97
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:140
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:416
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:76
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:573
#define MAX(x, y)
Definition: ndminx.h:24
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
int step(const char *str) const
Definition: unicharset.cpp:211
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:422
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
bool AnyRepeatedUnicodes() const
Definition: unicharset.cpp:988
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:36
#define MIN(x, y)
Definition: ndminx.h:28
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:410
int get_script_id_from_name(const char *script_name) const
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:427
int size() const
Definition: unicharset.h:299
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:391
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:86
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:363
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:437
double v[max]
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:132
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:724
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:542
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:788
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
const double kMinXHeightFraction
Definition: unicharset.cpp:53
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:406
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:583
bool major_right_to_left() const
Definition: unicharset.cpp:933
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:396
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:147