tesseract  4.00.00dev
UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, int *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
STRING debug_str (UNICHAR_ID id) const
 
STRING debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
int size () const
 
void reserve (int unichars_number)
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (STRING *str) const
 
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
 
bool load_from_inmemory_file (const char *const memory, int mem_size)
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const GenericVector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static STRING debug_utf8_str (const char *str)
 

Static Public Attributes

static TESS_API const char * kCustomLigatures [][2]
 
static const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 139 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 150 of file unicharset.h.

150  {
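151  U_LEFT_TO_RIGHT = 0,
152  U_RIGHT_TO_LEFT = 1,
153  U_EUROPEAN_NUMBER = 2,
154  U_EUROPEAN_NUMBER_SEPARATOR = 3,
155  U_EUROPEAN_NUMBER_TERMINATOR = 4,
156  U_ARABIC_NUMBER = 5,
157  U_COMMON_NUMBER_SEPARATOR = 6,
158  U_BLOCK_SEPARATOR = 7,
159  U_SEGMENT_SEPARATOR = 8,
160  U_WHITE_SPACE_NEUTRAL = 9,
161  U_OTHER_NEUTRAL = 10,
162  U_LEFT_TO_RIGHT_EMBEDDING = 11,
163  U_LEFT_TO_RIGHT_OVERRIDE = 12,
164  U_RIGHT_TO_LEFT_ARABIC = 13,
165  U_RIGHT_TO_LEFT_EMBEDDING = 14,
166  U_RIGHT_TO_LEFT_OVERRIDE = 15,
167  U_POP_DIRECTIONAL_FORMAT = 16,
168  U_DIR_NON_SPACING_MARK = 17,
169  U_BOUNDARY_NEUTRAL = 18,
170  U_CHAR_DIRECTION_COUNT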
171  };

Constructor & Destructor Documentation

◆ UNICHARSET()

UNICHARSET::UNICHARSET ( )

Definition at line 159 of file unicharset.cpp.

159  :
160  unichars(NULL),
161  ids(),
162  size_used(0),
163  size_reserved(0),
164  script_table(NULL),
165  script_table_size_used(0),
166  null_script("NULL") {
167  clear();
168  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
169  unichar_insert(kSpecialUnicharCodes[i]);
170  if (i == UNICHAR_JOINED)
171  set_isngram(i, true);
172  }
173 }
void clear()
Definition: unicharset.h:265
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:416
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:147

◆ ~UNICHARSET()

UNICHARSET::~UNICHARSET ( )

Definition at line 175 of file unicharset.cpp.

175  {
176  clear();
177 }
void clear()
Definition: unicharset.h:265

Member Function Documentation

◆ add_script()

int UNICHARSET::add_script ( const char *  script)

Definition at line 1004 of file unicharset.cpp.

1004  {
1005  for (int i = 0; i < script_table_size_used; ++i) {
1006  if (strcmp(script, script_table[i]) == 0)
1007  return i;
1008  }
1009  if (script_table_size_reserved == 0) {
1010  script_table_size_reserved = 8;
1011  script_table = new char*[script_table_size_reserved];
1012  } else if (script_table_size_used >= script_table_size_reserved) {
1013  assert(script_table_size_used == script_table_size_reserved);
1014  script_table_size_reserved += script_table_size_reserved;
1015  char** new_script_table = new char*[script_table_size_reserved];
1016  memcpy(new_script_table, script_table, script_table_size_used * sizeof(char*));
1017  delete[] script_table;
1018  script_table = new_script_table;
1019  }
1020  script_table[script_table_size_used] = new char[strlen(script) + 1];
1021  strcpy(script_table[script_table_size_used], script);
1022  return script_table_size_used++;
1023 }

◆ AnyRepeatedUnicodes()

bool UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 988 of file unicharset.cpp.

988  {
989  int start_id = 0;
990  if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
991  for (int id = start_id; id < size_used; ++id) {
992  // Convert to unicodes.
993  GenericVector<int> unicodes;
994  if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
995  unicodes.size() > 1) {
996  for (int u = 1; u < unicodes.size(); ++u) {
997  if (unicodes[u - 1] == unicodes[u]) return true;
998  }
999  }
1000  }
1001  return false;
1002 }
double u[max]
int size() const
Definition: genericvector.h:72
bool has_special_codes() const
Definition: unicharset.h:682
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:788

◆ AppendOtherUnicharset()

void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 439 of file unicharset.cpp.

439  {
440  int initial_used = size_used;
441  for (int ch = 0; ch < src.size_used; ++ch) {
442  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
443  const char* utf8 = src.id_to_unichar(ch);
444  if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
445  // Only use fully valid entries.
446  tprintf("Bad properties for index %d, char %s: "
447  "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
448  ch, utf8, src_props.min_bottom, src_props.max_bottom,
449  src_props.min_top, src_props.max_top,
450  src_props.width, src_props.width_sd,
451  src_props.bearing, src_props.bearing_sd,
452  src_props.advance, src_props.advance_sd);
453  continue;
454  }
455  int id = size_used;
456  if (contains_unichar(utf8)) {
457  id = unichar_to_id(utf8);
458  // Just expand current ranges.
459  unichars[id].properties.ExpandRangesFrom(src_props);
460  } else {
461  unichar_insert(utf8);
462  unichars[id].properties.SetRangesEmpty();
463  }
464  }
465  // Set properties, including mirror and other_case, WITHOUT reordering
466  // the unicharset.
467  PartialSetPropertiesFromOther(initial_used, src);
468 }
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
#define tprintf(...)
Definition: tprintf.h:31
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612

◆ clear()

void UNICHARSET::clear ( )
inline

Definition at line 265 of file unicharset.h.

265  {
266  if (script_table != NULL) {
267  for (int i = 0; i < script_table_size_used; ++i)
268  delete[] script_table[i];
269  delete[] script_table;
270  script_table = NULL;
271  script_table_size_used = 0;
272  }
273  if (unichars != NULL) {
274  delete_pointers_in_unichars();
275  delete[] unichars;
276  unichars = NULL;
277  }
278  script_table_size_reserved = 0;
279  size_reserved = 0;
280  size_used = 0;
281  ids.clear();
282  top_bottom_set_ = false;
283  script_has_upper_lower_ = false;
284  script_has_xheight_ = false;
285  null_sid_ = 0;
286  common_sid_ = 0;
287  latin_sid_ = 0;
288  cyrillic_sid_ = 0;
289  greek_sid_ = 0;
290  han_sid_ = 0;
291  hiragana_sid_ = 0;
292  katakana_sid_ = 0;
293  thai_sid_ = 0;
294  hangul_sid_ = 0;
295  default_sid_ = 0;
296  }
void delete_pointers_in_unichars()
Definition: unicharset.h:255
void clear()
Definition: unicharmap.cpp:154

◆ common_sid()

int UNICHARSET::common_sid ( ) const
inline

Definition at line 844 of file unicharset.h.

844 { return common_sid_; }

◆ contains_unichar() [1/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 644 of file unicharset.cpp.

644  {
645  return ids.contains(unichar_repr);
646 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101

◆ contains_unichar() [2/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 648 of file unicharset.cpp.

649  {
650  if (length == 0) {
651  return false;
652  }
653  return ids.contains(unichar_repr, length);
654 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101

◆ contains_unichar_id()

bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 241 of file unicharset.h.

241  {
242  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
243  unichar_id >= 0;
244  }

◆ CopyFrom()

void UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 423 of file unicharset.cpp.

423  {
424  clear();
425  for (int ch = 0; ch < src.size_used; ++ch) {
426  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
427  const char* utf8 = src.id_to_unichar(ch);
428  unichar_insert(utf8);
429  unichars[ch].properties.ExpandRangesFrom(src_props);
430  }
431  // Set properties, including mirror and other_case, WITHOUT reordering
432  // the unicharset.
433  PartialSetPropertiesFromOther(0, src);
434 }
void clear()
Definition: unicharset.h:265
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612

◆ cyrillic_sid()

int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 846 of file unicharset.h.

846 { return cyrillic_sid_; }

◆ debug_str() [1/2]

STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 318 of file unicharset.cpp.

318  {
319  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
320  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
321  if (fragment) {
322  return fragment->to_string();
323  }
324  const char* str = id_to_unichar(id);
325  STRING result = debug_utf8_str(str);
326  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
327  if (get_isalpha(id)) {
328  if (get_islower(id))
329  result += "a";
330  else if (get_isupper(id))
331  result += "A";
332  else
333  result += "x";
334  }
335  // Append 0 if a digit.
336  if (get_isdigit(id)) {
337  result += "0";
338  }
339  // Append p is a punctuation symbol.
340  if (get_ispunctuation(id)) {
341  result += "p";
342  }
343  return result;
344 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:294
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
Definition: strngs.h:45
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458

◆ debug_str() [2/2]

STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 232 of file unicharset.h.

232  {
233  return debug_str(unichar_to_id(unichar_repr));
234  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318

◆ debug_utf8_str()

STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 294 of file unicharset.cpp.

294  {
295  STRING result = str;
296  result += " [";
297  int step = 1;
298  // Chop into unicodes and code each as hex.
299  for (int i = 0; str[i] != '\0'; i += step) {
300  char hex[sizeof(int) * 2 + 1];
301  step = UNICHAR::utf8_step(str + i);
302  if (step == 0) {
303  step = 1;
304  sprintf(hex, "%x", str[i]);
305  } else {
306  UNICHAR ch(str + i, step);
307  sprintf(hex, "%x", ch.first_uni());
308  }
309  result += hex;
310  result += " ";
311  }
312  result += "]";
313  return result;
314 }
Definition: strngs.h:45
int step(const char *str) const
Definition: unicharset.cpp:211
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ default_sid()

int UNICHARSET::default_sid ( ) const
inline

Definition at line 853 of file unicharset.h.

853 { return default_sid_; }

◆ delete_pointers_in_unichars()

void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 255 of file unicharset.h.

255  {
256  for (int i = 0; i < size_used; ++i) {
257  if (unichars[i].properties.fragment != NULL) {
258  delete unichars[i].properties.fragment;
259  unichars[i].properties.fragment = NULL;
260  }
261  }
262  }

◆ encodable_string()

bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 222 of file unicharset.cpp.

223  {
224  GenericVector<UNICHAR_ID> encoding;
225  return encode_string(str, true, &encoding, NULL, first_bad_position);
226 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234

◆ encode_string()

bool UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
GenericVector< UNICHAR_ID > *  encoding,
GenericVector< char > *  lengths,
int *  encoded_length 
) const

Definition at line 234 of file unicharset.cpp.

237  {
238  GenericVector<UNICHAR_ID> working_encoding;
239  GenericVector<char> working_lengths;
240  GenericVector<char> best_lengths;
241  encoding->truncate(0); // Just in case str is empty.
242  int str_length = strlen(str);
243  int str_pos = 0;
244  bool perfect = true;
245  while (str_pos < str_length) {
246  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
247  &str_pos, encoding, &best_lengths);
248  if (str_pos < str_length) {
249  // This is a non-match. Skip one utf-8 character.
250  perfect = false;
251  if (give_up_on_failure) break;
252  int step = UNICHAR::utf8_step(str + str_pos);
253  if (step == 0) step = 1;
254  encoding->push_back(INVALID_UNICHAR_ID);
255  best_lengths.push_back(step);
256  str_pos += step;
257  working_encoding = *encoding;
258  working_lengths = best_lengths;
259  }
260  }
261  if (lengths != NULL) *lengths = best_lengths;
262  if (encoded_length != NULL) *encoded_length = str_pos;
263  return perfect;
264 }
int push_back(T object)
void truncate(int size)
int step(const char *str) const
Definition: unicharset.cpp:211
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ eq()

bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 656 of file unicharset.cpp.

657  {
658  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
659 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

◆ ExpandRangesFromOther()

void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 410 of file unicharset.cpp.

410  {
411  for (int ch = 0; ch < size_used; ++ch) {
412  const char* utf8 = id_to_unichar(ch);
413  UNICHAR_PROPERTIES properties;
414  if (src.GetStrProperties(utf8, &properties)) {
415  // Expand just the ranges from properties.
416  unichars[ch].properties.ExpandRangesFrom(properties);
417  }
418  }
419 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

◆ get_advance_stats()

void UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 590 of file unicharset.h.

591  {
592  if (INVALID_UNICHAR_ID == unichar_id) {
593  *advance = *advance_sd = 0;
594  return;
595  }
596  ASSERT_HOST(contains_unichar_id(unichar_id));
597  *advance = unichars[unichar_id].properties.advance;
598  *advance_sd = unichars[unichar_id].properties.advance_sd;
599  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_bearing_stats()

void UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 573 of file unicharset.h.

574  {
575  if (INVALID_UNICHAR_ID == unichar_id) {
576  *bearing = *bearing_sd = 0.0f;
577  return;
578  }
579  ASSERT_HOST(contains_unichar_id(unichar_id));
580  *bearing = unichars[unichar_id].properties.bearing;
581  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
582  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_chartype() [1/2]

char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 603 of file unicharset.cpp.

603  {
604  if (this->get_isupper(id)) return 'A';
605  if (this->get_islower(id)) return 'a';
606  if (this->get_isalpha(id)) return 'x';
607  if (this->get_isdigit(id)) return '0';
608  if (this->get_ispunctuation(id)) return 'p';
609  return 0;
610 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458

◆ get_chartype() [2/2]

char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 731 of file unicharset.h.

731  {
732  return get_chartype(unichar_to_id(unichar_repr));
733  }
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:603
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_direction()

Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 650 of file unicharset.h.

650  {
651  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
652  ASSERT_HOST(contains_unichar_id(unichar_id));
653  return unichars[unichar_id].properties.direction;
654  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_enabled()

bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 838 of file unicharset.h.

838  {
839  return unichars[unichar_id].properties.enabled;
840  }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 694 of file unicharset.h.

694  {
695  if (INVALID_UNICHAR_ID == unichar_id) return NULL;
696  ASSERT_HOST(contains_unichar_id(unichar_id));
697  return unichars[unichar_id].properties.fragment;
698  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_fragment() [2/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 744 of file unicharset.h.

744  {
745  if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
746  !ids.contains(unichar_repr)) {
747  return NULL;
748  }
749  return get_fragment(unichar_to_id(unichar_repr));
750  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isalpha() [1/3]

bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 451 of file unicharset.h.

451  {
452  if (INVALID_UNICHAR_ID == unichar_id) return false;
453  ASSERT_HOST(contains_unichar_id(unichar_id));
454  return unichars[unichar_id].properties.isalpha;
455  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_isalpha() [2/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 701 of file unicharset.h.

701  {
702  return get_isalpha(unichar_to_id(unichar_repr));
703  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isalpha() [3/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 754 of file unicharset.h.

755  {
756  return get_isalpha(unichar_to_id(unichar_repr, length));
757  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isdigit() [1/3]

bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 472 of file unicharset.h.

472  {
473  if (INVALID_UNICHAR_ID == unichar_id) return false;
474  ASSERT_HOST(contains_unichar_id(unichar_id));
475  return unichars[unichar_id].properties.isdigit;
476  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_isdigit() [2/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 716 of file unicharset.h.

716  {
717  return get_isdigit(unichar_to_id(unichar_repr));
718  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isdigit() [3/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 775 of file unicharset.h.

776  {
777  return get_isdigit(unichar_to_id(unichar_repr, length));
778  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_islower() [1/3]

bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 458 of file unicharset.h.

458  {
459  if (INVALID_UNICHAR_ID == unichar_id) return false;
460  ASSERT_HOST(contains_unichar_id(unichar_id));
461  return unichars[unichar_id].properties.islower;
462  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_islower() [2/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 706 of file unicharset.h.

706  {
707  return get_islower(unichar_to_id(unichar_repr));
708  }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_islower() [3/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 761 of file unicharset.h.

762  {
763  return get_islower(unichar_to_id(unichar_repr, length));
764  }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isngram()

bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 486 of file unicharset.h.

486  {
487  if (INVALID_UNICHAR_ID == unichar_id) return false;
488  ASSERT_HOST(contains_unichar_id(unichar_id));
489  return unichars[unichar_id].properties.isngram;
490  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_isprivate()

bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 363 of file unicharset.cpp.

363  {
364  UNICHAR uc(id_to_unichar(unichar_id), -1);
365  int uni = uc.first_uni();
366  return (uni >= 0xE000 && uni <= 0xF8FF);
367 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

◆ get_ispunctuation() [1/3]

bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 479 of file unicharset.h.

479  {
480  if (INVALID_UNICHAR_ID == unichar_id) return false;
481  ASSERT_HOST(contains_unichar_id(unichar_id));
482  return unichars[unichar_id].properties.ispunctuation;
483  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_ispunctuation() [2/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 721 of file unicharset.h.

721  {
722  return get_ispunctuation(unichar_to_id(unichar_repr));
723  }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_ispunctuation() [3/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 782 of file unicharset.h.

783  {
784  return get_ispunctuation(unichar_to_id(unichar_repr, length));
785  }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isupper() [1/3]

bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 465 of file unicharset.h.

465  {
466  if (INVALID_UNICHAR_ID == unichar_id) return false;
467  ASSERT_HOST(contains_unichar_id(unichar_id));
468  return unichars[unichar_id].properties.isupper;
469  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_isupper() [2/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 711 of file unicharset.h.

711  {
712  return get_isupper(unichar_to_id(unichar_repr));
713  }
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_isupper() [3/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 768 of file unicharset.h.

769  {
770  return get_isupper(unichar_to_id(unichar_repr, length));
771  }
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_mirror()

UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 657 of file unicharset.h.

657  {
658  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
659  ASSERT_HOST(contains_unichar_id(unichar_id));
660  return unichars[unichar_id].properties.mirror;
661  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_normed_unichar()

const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 788 of file unicharset.h.

788  {
789  if (unichar_id == UNICHAR_SPACE) return " ";
790  return unichars[unichar_id].properties.normed.string();
791  }

◆ get_other_case()

UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 643 of file unicharset.h.

643  {
644  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
645  ASSERT_HOST(contains_unichar_id(unichar_id));
646  return unichars[unichar_id].properties.other_case;
647  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_properties() [1/2]

unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 588 of file unicharset.cpp.

588  {
589  unsigned int properties = 0;
590  if (this->get_isalpha(id))
591  properties |= ISALPHA_MASK;
592  if (this->get_islower(id))
593  properties |= ISLOWER_MASK;
594  if (this->get_isupper(id))
595  properties |= ISUPPER_MASK;
596  if (this->get_isdigit(id))
597  properties |= ISDIGIT_MASK;
598  if (this->get_ispunctuation(id))
599  properties |= ISPUNCTUATION_MASK;
600  return properties;
601 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458

◆ get_properties() [2/2]

unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 727 of file unicharset.h.

727  {
728  return get_properties(unichar_to_id(unichar_repr));
729  }
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:588
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_script() [1/3]

int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 623 of file unicharset.h.

623  {
624  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
625  ASSERT_HOST(contains_unichar_id(unichar_id));
626  return unichars[unichar_id].properties.script_id;
627  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_script() [2/3]

int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 738 of file unicharset.h.

738  {
739  return get_script(unichar_to_id(unichar_repr));
740  }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_script() [3/3]

int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 803 of file unicharset.h.

804  {
805  return get_script(unichar_to_id(unichar_repr, length));
806  }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ get_script_from_script_id()

const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 814 of file unicharset.h.

814  {
815  if (id >= script_table_size_used || id < 0)
816  return null_script;
817  return script_table[id];
818  }

◆ get_script_id_from_name()

int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1082 of file unicharset.cpp.

1082  {
1083  for (int i = 0; i < script_table_size_used; ++i) {
1084  if (strcmp(script_name, script_table[i]) == 0)
1085  return i;
1086  }
1087  return 0; // 0 is always the null_script
1088 }

◆ get_script_table_size()

int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 809 of file unicharset.h.

809  {
810  return script_table_size_used;
811  }

◆ get_top_bottom()

void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 528 of file unicharset.h.

530  {
531  if (INVALID_UNICHAR_ID == unichar_id) {
532  *min_bottom = *min_top = 0;
533  *max_bottom = *max_top = 256; // kBlnCellHeight
534  return;
535  }
536  ASSERT_HOST(contains_unichar_id(unichar_id));
537  *min_bottom = unichars[unichar_id].properties.min_bottom;
538  *max_bottom = unichars[unichar_id].properties.max_bottom;
539  *min_top = unichars[unichar_id].properties.min_top;
540  *max_top = unichars[unichar_id].properties.max_top;
541  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ get_width_stats()

void UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 556 of file unicharset.h.

557  {
558  if (INVALID_UNICHAR_ID == unichar_id) {
559  *width = 0.0f;
560  *width_sd = 0.0f;;
561  return;
562  }
563  ASSERT_HOST(contains_unichar_id(unichar_id));
564  *width = unichars[unichar_id].properties.width;
565  *width_sd = unichars[unichar_id].properties.width_sd;
566  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ greek_sid()

int UNICHARSET::greek_sid ( ) const
inline

Definition at line 847 of file unicharset.h.

847 { return greek_sid_; }

◆ han_sid()

int UNICHARSET::han_sid ( ) const
inline

Definition at line 848 of file unicharset.h.

848 { return han_sid_; }

◆ hangul_sid()

int UNICHARSET::hangul_sid ( ) const
inline

Definition at line 852 of file unicharset.h.

852 { return hangul_sid_; }

◆ has_special_codes()

bool UNICHARSET::has_special_codes ( ) const
inline

Definition at line 682 of file unicharset.h.

682  {
683  return get_fragment(UNICHAR_BROKEN) != NULL &&
684  strcmp(id_to_unichar(UNICHAR_BROKEN),
685  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
686  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:147

◆ hiragana_sid()

int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 849 of file unicharset.h.

849 { return hiragana_sid_; }

◆ id_to_unichar()

const char * UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 266 of file unicharset.cpp.

266  {
267  if (id == INVALID_UNICHAR_ID) {
268  return INVALID_UNICHAR;
269  }
270  ASSERT_HOST(id < this->size());
271  return unichars[id].representation;
272 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
int size() const
Definition: unicharset.h:299

◆ id_to_unichar_ext()

const char * UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 274 of file unicharset.cpp.

274  {
275  if (id == INVALID_UNICHAR_ID) {
276  return INVALID_UNICHAR;
277  }
278  ASSERT_HOST(id < this->size());
279  // Resolve from the kCustomLigatures table if this is a private encoding.
280  if (get_isprivate(id)) {
281  const char* ch = id_to_unichar(id);
282  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
283  if (!strcmp(ch, kCustomLigatures[i][1])) {
284  return kCustomLigatures[i][0];
285  }
286  }
287  }
288  // Otherwise return the stored representation.
289  return unichars[id].representation;
290 }
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:144
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
int size() const
Definition: unicharset.h:299
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:363

◆ is_null_script()

bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 828 of file unicharset.h.

828  {
829  return script == null_script;
830  }

◆ IsSpaceDelimited()

bool UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 612 of file unicharset.h.

612  {
613  if (INVALID_UNICHAR_ID == unichar_id) return true;
614  int script_id = get_script(unichar_id);
615  return script_id != han_sid_ && script_id != thai_sid_ &&
616  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
617  script_id != katakana_sid_;
618  }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623

◆ katakana_sid()

int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 850 of file unicharset.h.

850 { return katakana_sid_; }

◆ latin_sid()

int UNICHARSET::latin_sid ( ) const
inline

Definition at line 845 of file unicharset.h.

845 { return latin_sid_; }

◆ load_from_file() [1/5]

bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 348 of file unicharset.h.

348  {
349  FILE* file = fopen(filename, "rb");
350  if (file == NULL) return false;
351  bool result = load_from_file(file, skip_fragments);
352  fclose(file);
353  return result;
354  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
const char * filename
Definition: ioapi.h:38

◆ load_from_file() [2/5]

bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 356 of file unicharset.h.

356  {
357  return load_from_file(filename, false);
358  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
const char * filename
Definition: ioapi.h:38

◆ load_from_file() [3/5]

bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 744 of file unicharset.cpp.

744  {
745  LocalFilePointer lfp(file);
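746  TessResultCallback2<char *, char *, int> *fgets_cb =
747  NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
748  bool success = load_via_fgets(fgets_cb, skip_fragments);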
749  delete fgets_cb;
750  return success;
751 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
char * fgets(char *dst, int size)
Definition: unicharset.cpp:737

◆ load_from_file() [4/5]

bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 363 of file unicharset.h.

363 { return load_from_file(file, false); }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348

◆ load_from_file() [5/5]

bool UNICHARSET::load_from_file ( tesseract::TFile *  file,
bool  skip_fragments 
)

Definition at line 753 of file unicharset.cpp.

753  {
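754  TessResultCallback2<char *, char *, int> *fgets_cb =
755  NewPermanentTessCallback(file, &tesseract::TFile::FGets);
756  bool success = load_via_fgets(fgets_cb, skip_fragments);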
757  delete fgets_cb;
758  return success;
759 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:86

◆ load_from_inmemory_file() [1/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 724 of file unicharset.cpp.

725  {
726  InMemoryFilePointer mem_fp(memory, mem_size);
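727  TessResultCallback2<char *, char *, int> *fgets_cb =
728  NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
729  bool success = load_via_fgets(fgets_cb, skip_fragments);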
730  delete fgets_cb;
731  return success;
732 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:702

◆ load_from_inmemory_file() [2/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 341 of file unicharset.h.

341  {
342  return load_from_inmemory_file(memory, mem_size, false);
343  }
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:724

◆ major_right_to_left()

bool UNICHARSET::major_right_to_left ( ) const

Definition at line 933 of file unicharset.cpp.

933  {
934  int ltr_count = 0;
935  int rtl_count = 0;
936  for (int id = 0; id < size_used; ++id) {
937  int dir = get_direction(id);
938  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
939  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
940  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
941  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
942  }
943  return rtl_count > ltr_count;
944 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:650

◆ normed_ids()

const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 795 of file unicharset.h.

795  {
796  return unichars[unichar_id].properties.normed_ids;
797  }

◆ null_sid()

int UNICHARSET::null_sid ( ) const
inline

Definition at line 843 of file unicharset.h.

843 { return null_sid_; }

◆ PartialSetPropertiesFromOther()

void UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET &  src 
)

Definition at line 380 of file unicharset.cpp.

381  {
382  for (int ch = start_index; ch < size_used; ++ch) {
383  const char* utf8 = id_to_unichar(ch);
384  UNICHAR_PROPERTIES properties;
385  if (src.GetStrProperties(utf8, &properties)) {
386  // Setup the script_id, other_case, and mirror properly.
387  const char* script = src.get_script_from_script_id(properties.script_id);
388  properties.script_id = add_script(script);
389  const char* other_case = src.id_to_unichar(properties.other_case);
390  if (contains_unichar(other_case)) {
391  properties.other_case = unichar_to_id(other_case);
392  } else {
393  properties.other_case = ch;
394  }
395  const char* mirror_str = src.id_to_unichar(properties.mirror);
396  if (contains_unichar(mirror_str)) {
397  properties.mirror = unichar_to_id(mirror_str);
398  } else {
399  properties.mirror = ch;
400  }
401  unichars[ch].properties.CopyFrom(properties);
402  set_normed_ids(ch);
403  }
404  }
405 }
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:814
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:348
int add_script(const char *script)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ post_load_setup()

void UNICHARSET::post_load_setup ( )

Definition at line 867 of file unicharset.cpp.

867  {
868  // Number of alpha chars with the case property minus those without,
869  // in order to determine that half the alpha chars have case.
870  int net_case_alphas = 0;
871  int x_height_alphas = 0;
872  int cap_height_alphas = 0;
873  top_bottom_set_ = false;
874  for (UNICHAR_ID id = 0; id < size_used; ++id) {
875  int min_bottom = 0;
876  int max_bottom = MAX_UINT8;
877  int min_top = 0;
878  int max_top = MAX_UINT8;
879  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
880  if (min_top > 0)
881  top_bottom_set_ = true;
882  if (get_isalpha(id)) {
883  if (get_islower(id) || get_isupper(id))
884  ++net_case_alphas;
885  else
886  --net_case_alphas;
887  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
888  ++x_height_alphas;
889  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
890  ++cap_height_alphas;
891  }
892  set_normed_ids(id);
893  }
894 
895  script_has_upper_lower_ = net_case_alphas > 0;
896  script_has_xheight_ = script_has_upper_lower_ ||
897  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
898  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
899 
900  null_sid_ = get_script_id_from_name(null_script);
901  ASSERT_HOST(null_sid_ == 0);
902  common_sid_ = get_script_id_from_name("Common");
903  latin_sid_ = get_script_id_from_name("Latin");
904  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
905  greek_sid_ = get_script_id_from_name("Greek");
906  han_sid_ = get_script_id_from_name("Han");
907  hiragana_sid_ = get_script_id_from_name("Hiragana");
908  katakana_sid_ = get_script_id_from_name("Katakana");
909  thai_sid_ = get_script_id_from_name("Thai");
910  hangul_sid_ = get_script_id_from_name("Hangul");
911 
912  // Compute default script. Use the highest-counting alpha script, that is
913  // not the common script, as that still contains some "alphas".
914  int* script_counts = new int[script_table_size_used];
915  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
916  for (int id = 0; id < size_used; ++id) {
917  if (get_isalpha(id)) {
918  ++script_counts[get_script(id)];
919  }
920  }
921  default_sid_ = 0;
922  for (int s = 1; s < script_table_size_used; ++s) {
923  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
924  default_sid_ = s;
925  }
926  delete [] script_counts;
927 }
int UNICHAR_ID
Definition: unichar.h:33
#define MAX_UINT8
Definition: host.h:63
const double kMinCapHeightFraction
Definition: unicharset.cpp:54
#define ASSERT_HOST(x)
Definition: errcode.h:84
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:348
int get_script_id_from_name(const char *script_name) const
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
const double kMinXHeightFraction
Definition: unicharset.cpp:53

◆ PropertiesIncomplete()

bool UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 606 of file unicharset.h.

606  {
607  return unichars[unichar_id].properties.AnyRangeEmpty();
608  }

◆ reserve()

void UNICHARSET::reserve ( int  unichars_number)

Definition at line 179 of file unicharset.cpp.

179  {
180  if (unichars_number > size_reserved) {
181  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
182  for (int i = 0; i < size_used; ++i)
183  unichars_new[i] = unichars[i];
184  for (int j = size_used; j < unichars_number; ++j) {
185  unichars_new[j].properties.script_id = add_script(null_script);
186  }
187  delete[] unichars;
188  unichars = unichars_new;
189  size_reserved = unichars_number;
190  }
191 }
int add_script(const char *script)

◆ save_to_file() [1/3]

bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 308 of file unicharset.h.

308  {
309  FILE* file = fopen(filename, "w+b");
310  if (file == NULL) return false;
311  bool result = save_to_file(file);
312  fclose(file);
313  return result;
314  }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:308
const char * filename
Definition: ioapi.h:38

◆ save_to_file() [2/3]

bool UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 318 of file unicharset.h.

318  {
319  STRING str;
320  if (!save_to_string(&str)) return false;
321  if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
322  return true;
323  }
inT32 length() const
Definition: strngs.cpp:193
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:661
Definition: strngs.h:45

◆ save_to_file() [3/3]

bool UNICHARSET::save_to_file ( tesseract::TFile *  file) const
inline

Definition at line 324 of file unicharset.h.

324  {
325  STRING str;
326  if (!save_to_string(&str)) return false;
327  if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
328  return true;
329  }
inT32 length() const
Definition: strngs.cpp:193
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:661
Definition: strngs.h:45
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148

◆ save_to_string()

bool UNICHARSET::save_to_string ( STRING *  str) const

Definition at line 661 of file unicharset.cpp.

661  {
662  const int kFileBufSize = 1024;
663  char buffer[kFileBufSize + 1];
664  snprintf(buffer, kFileBufSize, "%d\n", this->size());
665  *str = buffer;
666  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
667  int min_bottom, max_bottom, min_top, max_top;
668  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
669  float width, width_sd;
670  get_width_stats(id, &width, &width_sd);
671  float bearing, bearing_sd;
672  get_bearing_stats(id, &bearing, &bearing_sd);
673  float advance, advance_sd;
674  get_advance_stats(id, &advance, &advance_sd);
675  unsigned int properties = this->get_properties(id);
676  if (strcmp(this->id_to_unichar(id), " ") == 0) {
677  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
678  this->get_script_from_script_id(this->get_script(id)),
679  this->get_other_case(id));
680  } else {
681  snprintf(buffer, kFileBufSize,
682  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
683  this->id_to_unichar(id), properties,
684  min_bottom, max_bottom, min_top, max_top, width, width_sd,
685  bearing, bearing_sd, advance, advance_sd,
686  this->get_script_from_script_id(this->get_script(id)),
687  this->get_other_case(id), this->get_direction(id),
688  this->get_mirror(id), this->get_normed_unichar(id),
689  this->debug_str(id).string());
690  }
691  *str += buffer;
692  }
693  return true;
694 }
int UNICHAR_ID
Definition: unichar.h:33
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:814
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:650
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:556
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:590
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:643
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:657
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:588
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:573
int size() const
Definition: unicharset.h:299
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:788
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318

◆ script_has_upper_lower()

bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 856 of file unicharset.h.

856  {
857  return script_has_upper_lower_;
858  }

◆ script_has_xheight()

bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 863 of file unicharset.h.

863  {
864  return script_has_xheight_;
865  }

◆ set_advance_stats()

void UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 600 of file unicharset.h.

601  {
602  unichars[unichar_id].properties.advance = advance;
603  unichars[unichar_id].properties.advance_sd = advance_sd;
604  }

◆ set_bearing_stats()

void UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 583 of file unicharset.h.

584  {
585  unichars[unichar_id].properties.bearing = bearing;
586  unichars[unichar_id].properties.bearing_sd = bearing_sd;
587  }

◆ set_black_and_whitelist()

void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 950 of file unicharset.cpp.

952  {
953  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
954  // Set everything to default
955  for (int ch = 0; ch < size_used; ++ch)
956  unichars[ch].properties.enabled = def_enabled;
957  if (!def_enabled) {
958  // Enable the whitelist.
959  GenericVector<UNICHAR_ID> encoding;
960  encode_string(whitelist, false, &encoding, NULL, NULL);
961  for (int i = 0; i < encoding.size(); ++i) {
962  if (encoding[i] != INVALID_UNICHAR_ID)
963  unichars[encoding[i]].properties.enabled = true;
964  }
965  }
966  if (blacklist != NULL && blacklist[0] != '\0') {
967  // Disable the blacklist.
968  GenericVector<UNICHAR_ID> encoding;
969  encode_string(blacklist, false, &encoding, NULL, NULL);
970  for (int i = 0; i < encoding.size(); ++i) {
971  if (encoding[i] != INVALID_UNICHAR_ID)
972  unichars[encoding[i]].properties.enabled = false;
973  }
974  }
975  if (unblacklist != NULL && unblacklist[0] != '\0') {
976  // Re-enable the unblacklist.
977  GenericVector<UNICHAR_ID> encoding;
978  encode_string(unblacklist, false, &encoding, NULL, NULL);
979  for (int i = 0; i < encoding.size(); ++i) {
980  if (encoding[i] != INVALID_UNICHAR_ID)
981  unichars[encoding[i]].properties.enabled = true;
982  }
983  }
984 }
int size() const
Definition: genericvector.h:72
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234

◆ set_direction()

void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 432 of file unicharset.h.

432  {
433  unichars[unichar_id].properties.direction = value;
434  }

◆ set_isalpha()

void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 391 of file unicharset.h.

391  {
392  unichars[unichar_id].properties.isalpha = value;
393  }

◆ set_isdigit()

void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 406 of file unicharset.h.

406  {
407  unichars[unichar_id].properties.isdigit = value;
408  }

◆ set_islower()

void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 396 of file unicharset.h.

396  {
397  unichars[unichar_id].properties.islower = value;
398  }

◆ set_isngram()

void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 416 of file unicharset.h.

416  {
417  unichars[unichar_id].properties.isngram = value;
418  }

◆ set_ispunctuation()

void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 411 of file unicharset.h.

411  {
412  unichars[unichar_id].properties.ispunctuation = value;
413  }

◆ set_isupper()

void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 401 of file unicharset.h.

401  {
402  unichars[unichar_id].properties.isupper = value;
403  }

◆ set_mirror()

void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 437 of file unicharset.h.

437  {
438  unichars[unichar_id].properties.mirror = mirror;
439  }

◆ set_normed()

void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 442 of file unicharset.h.

442  {
443  unichars[unichar_id].properties.normed = normed;
444  unichars[unichar_id].properties.normed_ids.truncate(0);
445  }

◆ set_normed_ids()

void UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 348 of file unicharset.cpp.

348  {
349  unichars[unichar_id].properties.normed_ids.truncate(0);
350  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
351  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
352  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
353  true, &unichars[unichar_id].properties.normed_ids,
354  NULL, NULL)) {
355  unichars[unichar_id].properties.normed_ids.truncate(0);
356  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
357  }
358 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234

◆ set_other_case()

void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 427 of file unicharset.h.

427  {
428  unichars[unichar_id].properties.other_case = other_case;
429  }

◆ set_ranges_empty()

void UNICHARSET::set_ranges_empty ( )

Definition at line 371 of file unicharset.cpp.

371  {
372  for (int id = 0; id < size_used; ++id) {
373  unichars[id].properties.SetRangesEmpty();
374  }
375 }

◆ set_script()

void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 422 of file unicharset.h.

422  {
423  unichars[unichar_id].properties.script_id = add_script(value);
424  }
int add_script(const char *script)

◆ set_top_bottom()

void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 542 of file unicharset.h.

544  {
545  unichars[unichar_id].properties.min_bottom =
546  static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
547  unichars[unichar_id].properties.max_bottom =
548  static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
549  unichars[unichar_id].properties.min_top =
550  static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
551  unichars[unichar_id].properties.max_top =
552  static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
553  }
#define MAX_UINT8
Definition: host.h:63
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
uint8_t uinT8
Definition: host.h:35

◆ set_width_stats()

void UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 567 of file unicharset.h.

567  {
568  unichars[unichar_id].properties.width = width;
569  unichars[unichar_id].properties.width_sd = width_sd;
570  }

◆ SetPropertiesFromOther()

void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src)
inline

Definition at line 505 of file unicharset.h.

505  {
506  PartialSetPropertiesFromOther(0, src);
507  }
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380

◆ size()

int UNICHARSET::size ( ) const
inline

Definition at line 299 of file unicharset.h.

299  {
300  return size_used;
301  }

◆ SizesDistinct()

bool UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 472 of file unicharset.cpp.

472  {
473  int overlap = MIN(unichars[id1].properties.max_top,
474  unichars[id2].properties.max_top) -
475  MAX(unichars[id1].properties.min_top,
476  unichars[id2].properties.min_top);
477  return overlap <= 0;
478 }
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28

◆ step()

int UNICHARSET::step ( const char *  str) const

Definition at line 211 of file unicharset.cpp.

211  {
212  GenericVector<UNICHAR_ID> encoding;
213  GenericVector<char> lengths;
214  encode_string(str, true, &encoding, &lengths, NULL);
215  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
216  return lengths[0];
217 }
bool empty() const
Definition: genericvector.h:90
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234

◆ thai_sid()

int UNICHARSET::thai_sid ( ) const
inline

Definition at line 851 of file unicharset.h.

851 { return thai_sid_; }

◆ to_lower()

UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 664 of file unicharset.h.

664  {
665  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
666  ASSERT_HOST(contains_unichar_id(unichar_id));
667  if (unichars[unichar_id].properties.islower) return unichar_id;
668  return unichars[unichar_id].properties.other_case;
669  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ to_upper()

UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 672 of file unicharset.h.

672  {
673  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
674  ASSERT_HOST(contains_unichar_id(unichar_id));
675  if (unichars[unichar_id].properties.isupper) return unichar_id;
676  return unichars[unichar_id].properties.other_case;
677  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ top_bottom_useful()

bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 497 of file unicharset.h.

497  {
498  return top_bottom_set_;
499  }

◆ unichar_insert()

void UNICHARSET::unichar_insert ( const char *const  unichar_repr)

Definition at line 612 of file unicharset.cpp.

612  {
613  if (!ids.contains(unichar_repr)) {
614  if (strlen(unichar_repr) > UNICHAR_LEN) {
615  fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
616  int(strlen(unichar_repr)), unichar_repr);
617  return;
618  }
619  if (size_used == size_reserved) {
620  if (size_used == 0)
621  reserve(8);
622  else
623  reserve(2 * size_used);
624  }
625 
626  strcpy(unichars[size_used].representation, unichar_repr);
627  this->set_script(size_used, null_script);
628  // If the given unichar_repr represents a fragmented character, set
629  // fragment property to a pointer to CHAR_FRAGMENT class instance with
630  // information parsed from the unichar representation. Use the script
631  // of the base unichar for the fragmented character if possible.
632  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
633  this->unichars[size_used].properties.fragment = frag;
634  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
635  this->unichars[size_used].properties.script_id =
636  this->get_script(frag->get_unichar());
637  }
638  this->unichars[size_used].properties.enabled = true;
639  ids.insert(unichar_repr, size_used);
640  ++size_used;
641  }
642 }
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
const char * get_unichar() const
Definition: unicharset.h:64
void reserve(int unichars_number)
Definition: unicharset.cpp:179
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
#define UNICHAR_LEN
Definition: unichar.h:30
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:623
static CHAR_FRAGMENT * parse_from_string(const char *str)
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:76
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:422

◆ unichar_to_id() [1/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 194 of file unicharset.cpp.

194  {
195  return ids.contains(unichar_repr) ?
196  ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
197 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:36

◆ unichar_to_id() [2/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 199 of file unicharset.cpp.

200  {
201  assert(length > 0 && length <= UNICHAR_LEN);
202  return ids.contains(unichar_repr, length) ?
203  ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
204 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
#define UNICHAR_LEN
Definition: unichar.h:30
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:36

Member Data Documentation

◆ kCustomLigatures

const char * UNICHARSET::kCustomLigatures[][2]
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{NULL, NULL}
}

Definition at line 144 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * UNICHARSET::kSpecialUnicharCodes
static
Initial value:
= {
" ",
"Joined",
"|Broken|0|1"
}

Definition at line 147 of file unicharset.h.


The documentation for this class was generated from the following files: