tesseract  4.00.00dev
tesseract::UnicharCompress Class Reference

#include <unicharcompress.h>

Public Member Functions

 UnicharCompress ()
 
 UnicharCompress (const UnicharCompress &src)
 
 ~UnicharCompress ()
 
UnicharCompress & operator= (const UnicharCompress &src)
 
bool ComputeEncoding (const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
 
void SetupPassThrough (const UNICHARSET &unicharset)
 
void SetupDirect (const GenericVector< RecodedCharID > &codes)
 
int code_range () const
 
int EncodeUnichar (int unichar_id, RecodedCharID *code) const
 
int DecodeUnichar (const RecodedCharID &code) const
 
bool IsValidFirstCode (int code) const
 
const GenericVector< int > * GetNextCodes (const RecodedCharID &code) const
 
const GenericVector< int > * GetFinalCodes (const RecodedCharID &code) const
 
bool Serialize (TFile *fp) const
 
bool DeSerialize (TFile *fp)
 
STRING GetEncodingAsString (const UNICHARSET &unicharset) const
 

Static Public Member Functions

static bool DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)
 

Static Public Attributes

static const int kFirstHangul = 0xac00
 
static const int kNumHangul = 11172
 
static const int kLCount = 19
 
static const int kVCount = 21
 
static const int kTCount = 28
 

Detailed Description

Definition at line 139 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 92 of file unicharcompress.cpp.

92 : code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src)

Definition at line 93 of file unicharcompress.cpp.

93 { *this = src; }

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 94 of file unicharcompress.cpp.

94 { Cleanup(); }

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const
inline

Definition at line 172 of file unicharcompress.h.

172 { return code_range_; }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding ( const UNICHARSET & unicharset,
int  null_id,
STRING * radical_stroke_table 
)

Definition at line 107 of file unicharcompress.cpp.

108  {
109  RSMap radical_map;
110  if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
111  return false;
112  encoder_.clear();
113  UNICHARSET direct_set;
114  UNICHARSET radicals;
115  // To avoid unused codes, clear the special codes from the unicharsets.
116  direct_set.clear();
117  radicals.clear();
118  // Always keep space as 0;
119  direct_set.unichar_insert(" ");
120  // Null char is next if we have one.
121  if (null_id >= 0) {
122  direct_set.unichar_insert(kNullChar);
123  }
124  RSCounts radical_counts;
125  // In the initial map, codes [0, unicharset.size()) are
126  // reserved for non-han/hangul sequences of 1 or more unicodes.
127  int hangul_offset = unicharset.size();
128  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
129  const int kTotalJamos = kLCount + kVCount + kTCount;
130  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
131  // to measure the number of radicals and strokes, initially we use the same
132  // code range for all 3 Han code positions, and fix them after.
133  int han_offset = hangul_offset + kTotalJamos;
134  int max_num_strokes = -1;
135  for (int u = 0; u <= unicharset.size(); ++u) {
136  bool self_normalized = false;
137  // We special-case allow null_id to be equal to unicharset.size() in case
138  // there is no space in unicharset for it.
139  if (u == unicharset.size()) {
140  if (u == null_id) {
141  self_normalized = true;
142  } else {
143  break; // Finished.
144  }
145  } else {
146  self_normalized = strcmp(unicharset.id_to_unichar(u),
147  unicharset.get_normed_unichar(u)) == 0;
148  }
149  RecodedCharID code;
150  // Convert to unicodes.
151  GenericVector<int> unicodes;
152  if (u < unicharset.size() &&
153  UNICHAR::UTF8ToUnicode(unicharset.get_normed_unichar(u), &unicodes) &&
154  unicodes.size() == 1) {
155  // Check single unicodes for Hangul/Han and encode if so.
156  int unicode = unicodes[0];
157  int leading, vowel, trailing;
158  auto it = radical_map.find(unicode);
159  if (it != radical_map.end()) {
160  // This is Han. Convert to radical, stroke, index.
161  if (!radicals.contains_unichar(it->second.radical.string())) {
162  radicals.unichar_insert(it->second.radical.string());
163  }
164  int radical = radicals.unichar_to_id(it->second.radical.string());
165  int num_strokes = it->second.num_strokes;
166  int num_samples = radical_counts[it->second]++;
167  if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
168  code.Set3(radical + han_offset, num_strokes + han_offset,
169  num_samples + han_offset);
170  } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
171  // This is Hangul. Since we know the exact size of each part at compile
172  // time, it gets the bottom set of codes.
173  code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
174  trailing + kLCount + kVCount + hangul_offset);
175  }
176  }
177  // If the code is still empty, it wasn't Han or Hangul.
178  if (code.length() == 0) {
179  // Special cases.
180  if (u == UNICHAR_SPACE) {
181  code.Set(0, 0); // Space.
182  } else if (u == null_id || (unicharset.has_special_codes() &&
183  u < SPECIAL_UNICHAR_CODES_COUNT)) {
184  code.Set(0, direct_set.unichar_to_id(kNullChar));
185  } else {
186  // Add the direct_set unichar-ids of the unicodes in sequence to the
187  // code.
188  for (int i = 0; i < unicodes.size(); ++i) {
189  int position = code.length();
190  if (position >= RecodedCharID::kMaxCodeLen) {
191  tprintf("Unichar %d=%s->%s is too long to encode!!\n", u,
192  unicharset.id_to_unichar(u),
193  unicharset.get_normed_unichar(u));
194  return false;
195  }
196  int uni = unicodes[i];
197  UNICHAR unichar(uni);
198  char* utf8 = unichar.utf8_str();
199  if (!direct_set.contains_unichar(utf8))
200  direct_set.unichar_insert(utf8);
201  code.Set(position, direct_set.unichar_to_id(utf8));
202  delete[] utf8;
203  if (direct_set.size() > unicharset.size()) {
204  // Code space got bigger!
205  tprintf("Code space expanded from original unicharset!!\n");
206  return false;
207  }
208  }
209  }
210  }
211  code.set_self_normalized(self_normalized);
212  encoder_.push_back(code);
213  }
214  // Now renumber Han to make all codes unique. We already added han_offset to
215  // all Han. Now separate out the radical, stroke, and count codes for Han.
216  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
217  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
218  // code uses the rest for the max number of duplicated radical/stroke combos.
219  int num_radicals = radicals.size();
220  for (int u = 0; u < unicharset.size(); ++u) {
221  RecodedCharID* code = &encoder_[u];
222  if ((*code)(0) >= han_offset) {
223  code->Set(1, (*code)(1) + num_radicals);
224  code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
225  }
226  }
227  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
228  SetupDecoder();
229  return true;
230 }
double u[max]
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
std::unordered_map< int, RadicalStroke > RSMap
static const int kMaxCodeLen
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
void clear()
Definition: unicharset.h:265
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const char * kNullChar
bool has_special_codes() const
Definition: unicharset.h:682
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
int size() const
Definition: unicharset.h:299
std::unordered_map< RadicalStroke, int, RadicalStrokedHash > RSCounts
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:788
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code) const

Definition at line 304 of file unicharcompress.cpp.

304  {
305  int len = code.length();
306  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
307  auto it = decoder_.find(code);
308  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
309  return it->second;
310 }
static const int kMaxCodeLen

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul ( int  unicode,
int * leading,
int * vowel,
int * trailing 
)
static

Definition at line 361 of file unicharcompress.cpp.

362  {
363  if (unicode < kFirstHangul) return false;
364  int offset = unicode - kFirstHangul;
365  if (offset >= kNumHangul) return false;
366  const int kNCount = kVCount * kTCount;
367  *leading = offset / kNCount;
368  *vowel = (offset % kNCount) / kTCount;
369  *trailing = offset % kTCount;
370  return true;
371 }
voidpf uLong offset
Definition: ioapi.h:42
static const int kFirstHangul

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp)

Definition at line 318 of file unicharcompress.cpp.

318  {
319  if (!encoder_.DeSerializeClasses(fp)) return false;
320  ComputeCodeRange();
321  SetupDecoder();
322  return true;
323 }

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar ( int  unichar_id,
RecodedCharID * code 
) const

Definition at line 296 of file unicharcompress.cpp.

296  {
297  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
298  *code = encoder_[unichar_id];
299  return code->length();
300 }

◆ GetEncodingAsString()

STRING tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset) const

Definition at line 332 of file unicharcompress.cpp.

333  {
334  STRING encoding;
335  for (int c = 0; c < encoder_.size(); ++c) {
336  const RecodedCharID& code = encoder_[c];
337  if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
338  // Don't show the duplicate entry.
339  continue;
340  }
341  encoding.add_str_int("", code(0));
342  for (int i = 1; i < code.length(); ++i) {
343  encoding.add_str_int(",", code(i));
344  }
345  encoding += "\t";
346  if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
347  unicharset.has_special_codes())) {
348  encoding += kNullChar;
349  } else {
350  encoding += unicharset.id_to_unichar(c);
351  }
352  encoding += "\n";
353  }
354  return encoding;
355 }
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const char * kNullChar
Definition: strngs.h:45
bool has_special_codes() const
Definition: unicharset.h:682
int size() const
Definition: unicharset.h:299

◆ GetFinalCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code) const
inline

Definition at line 195 of file unicharcompress.h.

195  {
196  auto it = final_codes_.find(code);
197  return it == final_codes_.end() ? NULL : it->second;
198  }

◆ GetNextCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code) const
inline

Definition at line 189 of file unicharcompress.h.

189  {
190  auto it = next_codes_.find(code);
191  return it == next_codes_.end() ? NULL : it->second;
192  }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int  code) const
inline

Definition at line 186 of file unicharcompress.h.

186 { return is_valid_start_[code]; }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src)

Definition at line 95 of file unicharcompress.cpp.

95  {
96  Cleanup();
97  encoder_ = src.encoder_;
98  code_range_ = src.code_range_;
99  SetupDecoder();
100  return *this;
101 }

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp) const

Definition at line 313 of file unicharcompress.cpp.

313  {
314  return encoder_.SerializeClasses(fp);
315 }

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const GenericVector< RecodedCharID > &  codes)

Definition at line 246 of file unicharcompress.cpp.

246  {
247  encoder_ = codes;
248  ComputeCodeRange();
249  SetupDecoder();
250 }

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset)

Definition at line 234 of file unicharcompress.cpp.

234  {
235  GenericVector<RecodedCharID> codes;
236  for (int u = 0; u < unicharset.size(); ++u) {
237  RecodedCharID code;
238  code.Set(0, u);
239  codes.push_back(code);
240  }
241  SetupDirect(codes);
242 }
double u[max]
int push_back(T object)
void SetupDirect(const GenericVector< RecodedCharID > &codes)
int size() const
Definition: unicharset.h:299

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00
static

Definition at line 147 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19
static

Definition at line 152 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172
static

Definition at line 149 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28
static

Definition at line 154 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21
static

Definition at line 153 of file unicharcompress.h.


The documentation for this class was generated from the following files: