unicharcompress.cpp
// File:        unicharcompress.cpp
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
// Created:     Wed Mar 04 14:45:01 PST 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "unicharcompress.h"
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";

// Local struct used only for processing the radical-stroke table.
struct RadicalStroke {
  RadicalStroke() : num_strokes(0) {}
  RadicalStroke(const STRING& r, int s) : radical(r), num_strokes(s) {}

  bool operator==(const RadicalStroke& other) const {
    return radical == other.radical && num_strokes == other.num_strokes;
  }

  // The radical is encoded as a string because its format is an int with
  // an optional ' mark to indicate a simplified shape. To treat these as
  // distinct, we use a string and a UNICHARSET to do the integer mapping.
  STRING radical;
  // The number of strokes we treat as dense and just take the face value from
  // the table.
  int num_strokes;
};

// Hash functor for RadicalStroke.
struct RadicalStrokedHash {
  size_t operator()(const RadicalStroke& rs) const {
    size_t result = rs.num_strokes;
    for (int i = 0; i < rs.radical.length(); ++i) {
      result ^= rs.radical[i] << (6 * i + 8);
    }
    return result;
  }
};

// A hash map to convert unicodes to radical,stroke pair.
typedef std::unordered_map<int, RadicalStroke> RSMap;
// A hash map to count occurrences of each radical,stroke pair.
typedef std::unordered_map<RadicalStroke, int, RadicalStrokedHash> RSCounts;

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalStrokeTable(STRING* radical_stroke_table,
                                     RSMap* radical_map) {
  GenericVector<STRING> lines;
  radical_stroke_table->split('\n', &lines);
  for (int i = 0; i < lines.size(); ++i) {
    if (lines[i].length() == 0 || lines[i][0] == '#') continue;
    int unicode, radical, strokes;
    STRING str_radical;
    if (sscanf(lines[i].string(), "%x\t%d.%d", &unicode, &radical, &strokes) ==
        3) {
      str_radical.add_str_int("", radical);
    } else if (sscanf(lines[i].string(), "%x\t%d'.%d", &unicode, &radical,
                      &strokes) == 3) {
      str_radical.add_str_int("'", radical);
    } else {
      tprintf("Invalid format in radical stroke table at line %d: %s\n", i,
              lines[i].string());
      return false;
    }
    (*radical_map)[unicode] = RadicalStroke(str_radical, strokes);
  }
  return true;
}
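
The two sscanf patterns handle entries in the style of the Unihan radical-stroke data, where an apostrophe after the radical number marks a simplified radical form. Below is a self-contained sketch of the same parsing on hypothetical table lines (the entries are illustrative, not taken from the real radical-stroke.txt):

#include <cstdio>

int main() {
  // Hypothetical entries: hex codepoint, tab, radical (optional '), strokes.
  const char* lines[] = {"4E2D\t2.3", "8BA2\t149'.2"};
  for (const char* line : lines) {
    int unicode, radical, strokes;
    // The plain pattern stops at the apostrophe (returning 2, not 3), so
    // the simplified-radical pattern is tried second, exactly as above.
    if (sscanf(line, "%x\t%d.%d", &unicode, &radical, &strokes) == 3) {
      printf("U+%04X -> radical %d, %d strokes\n", unicode, radical, strokes);
    } else if (sscanf(line, "%x\t%d'.%d", &unicode, &radical, &strokes) == 3) {
      printf("U+%04X -> simplified radical %d', %d strokes\n", unicode,
             radical, strokes);
    }
  }
  return 0;
}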

UnicharCompress::UnicharCompress() : code_range_(0) {}
UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; }
UnicharCompress::~UnicharCompress() { Cleanup(); }
UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                                      STRING* radical_stroke_table) {
  RSMap radical_map;
  if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
    return false;
  encoder_.clear();
  UNICHARSET direct_set;
  UNICHARSET radicals;
  // To avoid unused codes, clear the special codes from the unicharsets.
  direct_set.clear();
  radicals.clear();
  // Always keep space as 0.
  direct_set.unichar_insert(" ");
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  int max_num_strokes = -1;
  for (int u = 0; u <= unicharset.size(); ++u) {
    bool self_normalized = false;
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size()) {
      if (u == null_id) {
        self_normalized = true;
      } else {
        break;  // Finished.
      }
    } else {
      self_normalized = strcmp(unicharset.id_to_unichar(u),
                               unicharset.get_normed_unichar(u)) == 0;
    }
    RecodedCharID code;
    // Convert to unicodes.
    GenericVector<int> unicodes;
    if (u < unicharset.size() &&
        UNICHAR::UTF8ToUnicode(unicharset.get_normed_unichar(u), &unicodes) &&
        unicodes.size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Convert to radical, stroke, index.
        if (!radicals.contains_unichar(it->second.radical.string())) {
          radicals.unichar_insert(it->second.radical.string());
        }
        int radical = radicals.unichar_to_id(it->second.radical.string());
        int num_strokes = it->second.num_strokes;
        int num_samples = radical_counts[it->second]++;
        if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
        code.Set3(radical + han_offset, num_strokes + han_offset,
                  num_samples + han_offset);
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.length() == 0) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0);  // Space.
      } else if (u == null_id || (unicharset.has_special_codes() &&
                                  u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int i = 0; i < unicodes.size(); ++i) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s->%s is too long to encode!!\n", u,
                    unicharset.id_to_unichar(u),
                    unicharset.get_normed_unichar(u));
            return false;
          }
          int uni = unicodes[i];
          UNICHAR unichar(uni);
          char* utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8))
            direct_set.unichar_insert(utf8);
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() > unicharset.size()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    code.set_self_normalized(self_normalized);
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
  // code uses the rest for the max number of duplicated radical/stroke combos.
  int num_radicals = radicals.size();
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID* code = &encoder_[u];
    if ((*code)(0) >= han_offset) {
      code->Set(1, (*code)(1) + num_radicals);
      code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
    }
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
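
The sketch below shows how a caller might drive ComputeEncoding and EncodeUnichar together. It is a minimal sketch, not code from the training pipeline: PrintCodeForUnichar is a hypothetical helper, and the caller is assumed to have loaded a trained UNICHARSET and the contents of radical-stroke.txt into rs_table.

#include "tprintf.h"
#include "unicharcompress.h"
#include "unicharset.h"

namespace tesseract {

// Hypothetical helper: builds the encoding, then prints the code sequence
// for one unichar. Passing unicharset.size() as null_id relies on the
// special case documented in ComputeEncoding. Note that ComputeEncoding
// consumes (splits) rs_table.
static bool PrintCodeForUnichar(const UNICHARSET& unicharset,
                                STRING* rs_table, const char* utf8) {
  UnicharCompress compress;
  if (!compress.ComputeEncoding(unicharset, unicharset.size(), rs_table))
    return false;
  RecodedCharID code;
  int len = compress.EncodeUnichar(unicharset.unichar_to_id(utf8), &code);
  for (int i = 0; i < len; ++i) tprintf("%d ", code(i));
  tprintf("\n");
  return len > 0;
}

}  // namespace tesseract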

// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) {
  GenericVector<RecodedCharID> codes;
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  SetupDirect(codes);
}

// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const GenericVector<RecodedCharID>& codes) {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

// Renumbers codes to eliminate unused values.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that
  // not all numbers of strokes are used.
  ComputeCodeRange();
  GenericVector<int> offsets;
  offsets.init_to_size(code_range_, 0);
  // Find which codes are used.
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (int i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    if (offsets[i] == 0 || i == encoded_null) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end (which is
    // offsets.size() + offsets.back()), for the benefit of TensorFlow.
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (int c = 0; c < encoder_.size(); ++c) {
    RecodedCharID* code = &encoder_[c];
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  ComputeCodeRange();
}
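
To make the two offset passes concrete, here is a small self-contained replica of the computation with illustrative values and no encoded_null: used codes {0, 2, 5} in a range of 6 compact to {0, 1, 2}.

#include <cstdio>
#include <vector>

int main() {
  const int kRange = 6;
  std::vector<int> used_values = {0, 2, 5};
  std::vector<int> offsets(kRange, 0);
  for (int v : used_values) offsets[v] = 1;  // Mark used codes.
  int offset = 0;
  for (int i = 0; i < kRange; ++i) {
    if (offsets[i] == 0) {
      --offset;  // Unused value: everything above shifts down one more.
    } else {
      offsets[i] = offset;  // Used value: record its downward shift.
    }
  }
  for (int v : used_values)
    printf("%d -> %d\n", v, v + offsets[v]);  // Prints 0->0, 2->1, 5->2.
  return 0;
}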

// Encodes a single unichar_id. Returns the length of the code, or zero if
// the input is invalid, and returns the encoding itself in code.
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
  *code = encoder_[unichar_id];
  return code->length();
}

// Decodes code, returning the original unichar-id, or
// INVALID_UNICHAR_ID if the input is invalid.
int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
  auto it = decoder_.find(code);
  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
  return it->second;
}
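
Encoding and decoding round-trip for unichars with unambiguous codes; RoundTrips below is a hypothetical check, not part of the class. When several unichars share a code, SetupDecoder keeps the self-normalized one, so the equality can legitimately fail for the duplicates.

// Hypothetical check that a unichar_id survives an encode/decode cycle.
bool RoundTrips(const UnicharCompress& compress, int unichar_id) {
  RecodedCharID code;
  if (compress.EncodeUnichar(unichar_id, &code) == 0) return false;
  return compress.DecodeUnichar(code) == unichar_id;
}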

// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile* fp) const {
  return encoder_.SerializeClasses(fp);
}

// Reads from the given file. Returns false in case of error.
bool UnicharCompress::DeSerialize(TFile* fp) {
  if (!encoder_.DeSerializeClasses(fp)) return false;
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

// Returns a STRING containing a text file that describes the encoding thus:
// <index>[,<index>]*<tab><UTF8-str><newline>
// In words, a comma-separated list of one or more indices, followed by a tab
// and the UTF-8 string that the code represents per line. Most simple scripts
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
// and the Indic scripts will contain a many-to-many mapping.
// See the class comment above for details.
STRING UnicharCompress::GetEncodingAsString(
    const UNICHARSET& unicharset) const {
  STRING encoding;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding.add_str_int("", code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding.add_str_int(",", code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                   unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}
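
For illustration only (the indices below are invented; real values depend on the unicharset and on DefragmentCodeValues), the returned text might contain lines such as:

102<tab>a            (simple script: one code per unichar)
103,27<tab>ﬁ         (ligature dissected into two direct codes)
310,413,521<tab>好   (Han: radical, stroke and index codes)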

// Helper decomposes a Hangul unicode into 3 parts: leading, vowel, trailing.
// Note that the returned values are 0-based indices, NOT unicode Jamo.
// Returns false if the input is not in the Hangul unicode range.
/* static */
bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
                                      int* trailing) {
  if (unicode < kFirstHangul) return false;
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) return false;
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}
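
A worked example, using the standard Unicode Hangul composition constants declared in the header (kFirstHangul = 0xAC00, kVCount = 21, kTCount = 28):

// For U+D55C (한):
//   offset   = 0xD55C - 0xAC00    = 10588
//   kNCount  = 21 * 28            = 588
//   leading  = 10588 / 588        = 18  (ㅎ, a 0-based jamo table index)
//   vowel    = (10588 % 588) / 28 = 0   (ㅏ)
//   trailing = 10588 % 28         = 4   (ㄴ)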

// Computes the value of code_range_ from the encoder_.
void UnicharCompress::ComputeCodeRange() {
  code_range_ = -1;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      if (code(i) > code_range_) code_range_ = code(i);
    }
  }
  ++code_range_;
}

// Initializes the decoding hash_map from the encoding array.
void UnicharCompress::SetupDecoder() {
  Cleanup();
  is_valid_start_.init_to_size(code_range_, false);
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (code.self_normalized() || decoder_.find(code) == decoder_.end())
      decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!next_it->second->contains(code(len)))
            next_it->second->push_back(code(len));
          break;  // This prefix has been processed.
        }
      }
    } else {
      if (!final_it->second->contains(code(len)))
        final_it->second->push_back(code(len));
    }
  }
}
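
Together, next_codes_ and final_codes_ act as a prefix trie over all codes: for a given prefix, next_codes_ lists the values that can extend it, while final_codes_ lists the values that complete a whole code. With illustrative codes (3), (5,2) and (5,7), the loop above builds:

// final_codes_: empty prefix -> [3]     (3 by itself is a complete code)
//               prefix (5)   -> [2, 7]  ((5,2) and (5,7) are complete codes)
// next_codes_:  empty prefix -> [5]     (5 can start a longer code)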

// Frees allocated memory.
void UnicharCompress::Cleanup() {
  decoder_.clear();
  is_valid_start_.clear();
  for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
    delete it->second;
  }
  for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
    delete it->second;
  }
  next_codes_.clear();
  final_codes_.clear();
}

}  // namespace tesseract.