tesseract  4.00.00dev
unicharcompress.h
Go to the documentation of this file.
1 // File: unicharcompress.h
3 // Description: Unicode re-encoding using a sequence of smaller numbers in
4 // place of a single large code for CJK, similarly for Indic,
5 // and dissection of ligatures for other scripts.
6 // Author: Ray Smith
7 // Created: Wed Mar 04 14:45:01 PST 2015
8 //
9 // (C) Copyright 2015, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
23 #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
24 
25 #include <unordered_map>
26 
27 #include "serialis.h"
28 #include "strngs.h"
29 #include "unicharset.h"
30 
31 namespace tesseract {
32 
33 // Trivial class to hold the code for a recoded unichar-id.
35  public:
36  // The maximum length of a code.
37  static const int kMaxCodeLen = 9;
38 
39  RecodedCharID() : self_normalized_(0), length_(0) {
40  memset(code_, 0, sizeof(code_));
41  }
42  void Truncate(int length) { length_ = length; }
43  // Sets the code value at the given index in the code.
44  void Set(int index, int value) {
45  code_[index] = value;
46  if (length_ <= index) length_ = index + 1;
47  }
48  // Shorthand for setting codes of length 3, as all Hangul and Han codes are
49  // length 3.
50  void Set3(int code0, int code1, int code2) {
51  length_ = 3;
52  code_[0] = code0;
53  code_[1] = code1;
54  code_[2] = code2;
55  }
56  // Accessors
57  bool self_normalized() const { return self_normalized_ != 0; }
58  void set_self_normalized(bool value) { self_normalized_ = value; }
59  int length() const { return length_; }
60  int operator()(int index) const { return code_[index]; }
61 
62  // Writes to the given file. Returns false in case of error.
63  bool Serialize(TFile* fp) const {
64  if (fp->FWrite(&self_normalized_, sizeof(self_normalized_), 1) != 1)
65  return false;
66  if (fp->FWrite(&length_, sizeof(length_), 1) != 1) return false;
67  if (fp->FWrite(code_, sizeof(code_[0]), length_) != length_) return false;
68  return true;
69  }
70  // Reads from the given file. Returns false in case of error.
71  // If swap is true, assumes a big/little-endian swap is needed.
72  bool DeSerialize(TFile* fp) {
73  if (fp->FRead(&self_normalized_, sizeof(self_normalized_), 1) != 1)
74  return false;
75  if (fp->FReadEndian(&length_, sizeof(length_), 1) != 1) return false;
76  if (fp->FReadEndian(code_, sizeof(code_[0]), length_) != length_)
77  return false;
78  return true;
79  }
80  bool operator==(const RecodedCharID& other) const {
81  if (length_ != other.length_) return false;
82  for (int i = 0; i < length_; ++i) {
83  if (code_[i] != other.code_[i]) return false;
84  }
85  return true;
86  }
87  // Hash functor for RecodedCharID.
89  size_t operator()(const RecodedCharID& code) const {
90  size_t result = 0;
91  for (int i = 0; i < code.length_; ++i) {
92  result ^= code(i) << (7 * i);
93  }
94  return result;
95  }
96  };
97 
98  private:
99  // True if this code is self-normalizing, ie is the master entry for indices
100  // that map to the same code. Has boolean value, but inT8 for serialization.
101  inT8 self_normalized_;
102  // The number of elements in use in code_;
103  inT32 length_;
104  // The re-encoded form of the unichar-id to which this RecodedCharID relates.
105  inT32 code_[kMaxCodeLen];
106 };
107 
108 // Class holds a "compression" of a unicharset to simplify the learning problem
109 // for a neural-network-based classifier.
110 // Objectives:
111 // 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
112 // a sequence of 3 codes with much fewer values.
113 // This is achieved using the Jamo coding for Hangul and the Unicode
114 // Radical-Stroke-index for Han.
115 // 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
116 // as the unicode sequence (but coded in a more compact space).
117 // 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
118 // and not significantly distinct shapes (quotes) togther, ie
119 // represent the fi ligature as the f-i pair, and fold u+2019 and
120 // friends all onto ascii single '
121 // 4 The null character and mapping to target activations:
122 // To save horizontal coding space, the compressed codes are generally mapped
123 // to target network activations without intervening null characters, BUT
124 // in the case of ligatures, such as ff, null characters have to be included
125 // so existence of repeated codes is detected at codebook-building time, and
126 // null characters are embedded directly into the codes, so the rest of the
127 // system doesn't need to worry about the problem (much). There is still an
128 // effect on the range of ways in which the target activations can be
129 // generated.
130 //
131 // The computed code values are compact (no unused values), and, for CJK,
132 // unique (each code position uses a disjoint set of values from each other code
133 // position). For non-CJK, the same code value CAN be used in multiple
134 // positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
135 // is the same code as is used for the single f.
136 // NOTE that an intended consequence of using the normalized text from the
137 // unicharset is that the fancy quotes all map to a single code, so round-trip
138 // conversion doesn't work for all unichar-ids.
140  public:
141  UnicharCompress();
142  UnicharCompress(const UnicharCompress& src);
143  ~UnicharCompress();
144  UnicharCompress& operator=(const UnicharCompress& src);
145 
146  // The 1st Hangul unicode.
147  static const int kFirstHangul = 0xac00;
148  // The number of Hangul unicodes.
149  static const int kNumHangul = 11172;
150  // The number of Jamos for each of the 3 parts of a Hangul character, being
151  // the Leading consonant, Vowel and Trailing consonant.
152  static const int kLCount = 19;
153  static const int kVCount = 21;
154  static const int kTCount = 28;
155 
156  // Computes the encoding for the given unicharset. It is a requirement that
157  // the file training/langdata/radical-stroke.txt have been read into the
158  // input string radical_stroke_table.
159  // Returns false if the encoding cannot be constructed.
160  bool ComputeEncoding(const UNICHARSET& unicharset, int null_id,
161  STRING* radical_stroke_table);
162  // Sets up an encoder that doesn't change the unichars at all, so it just
163  // passes them through unchanged.
164  void SetupPassThrough(const UNICHARSET& unicharset);
165  // Sets up an encoder directly using the given encoding vector, which maps
166  // unichar_ids to the given codes.
167  void SetupDirect(const GenericVector<RecodedCharID>& codes);
168 
169  // Returns the number of different values that can be used in a code, ie
170  // 1 + the maximum value that will ever be used by an RecodedCharID code in
171  // any position in its array.
172  int code_range() const { return code_range_; }
173 
174  // Encodes a single unichar_id. Returns the length of the code, (or zero if
175  // invalid input), and the encoding itself in code.
176  int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
177  // Decodes code, returning the original unichar-id, or
178  // INVALID_UNICHAR_ID if the input is invalid. Note that this is not a perfect
179  // inverse of EncodeUnichar, since the unichar-id of U+2019 (curly single
180  // quote), for example, will have the same encoding as the unichar-id of
181  // U+0027 (ascii '). The foldings are obtained from the input unicharset,
182  // which in turn obtains them from NormalizeUTF8String in normstrngs.cpp,
183  // and include NFKC normalization plus others like quote and dash folding.
184  int DecodeUnichar(const RecodedCharID& code) const;
185  // Returns true if the given code is a valid start or single code.
186  bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
187  // Returns a list of valid non-final next codes for a given prefix code,
188  // which may be empty.
189  const GenericVector<int>* GetNextCodes(const RecodedCharID& code) const {
190  auto it = next_codes_.find(code);
191  return it == next_codes_.end() ? NULL : it->second;
192  }
193  // Returns a list of valid final codes for a given prefix code, which may
194  // be empty.
195  const GenericVector<int>* GetFinalCodes(const RecodedCharID& code) const {
196  auto it = final_codes_.find(code);
197  return it == final_codes_.end() ? NULL : it->second;
198  }
199 
200  // Writes to the given file. Returns false in case of error.
201  bool Serialize(TFile* fp) const;
202  // Reads from the given file. Returns false in case of error.
203 
204  bool DeSerialize(TFile* fp);
205 
206  // Returns a STRING containing a text file that describes the encoding thus:
207  // <index>[,<index>]*<tab><UTF8-str><newline>
208  // In words, a comma-separated list of one or more indices, followed by a tab
209  // and the UTF-8 string that the code represents per line. Most simple scripts
210  // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
211  // and the Indic scripts will contain a many-to-many mapping.
212  // See the class comment above for details.
213  STRING GetEncodingAsString(const UNICHARSET& unicharset) const;
214 
215  // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
216  // Note that the returned values are 0-based indices, NOT unicode Jamo.
217  // Returns false if the input is not in the Hangul unicode range.
218  static bool DecomposeHangul(int unicode, int* leading, int* vowel,
219  int* trailing);
220 
221  private:
222  // Renumbers codes to eliminate unused values.
223  void DefragmentCodeValues(int encoded_null);
224  // Computes the value of code_range_ from the encoder_.
225  void ComputeCodeRange();
226  // Initializes the decoding hash_map from the encoder_ array.
227  void SetupDecoder();
228  // Frees allocated memory.
229  void Cleanup();
230 
231  // The encoder that maps a unichar-id to a sequence of small codes.
232  // encoder_ is the only part that is serialized. The rest is computed on load.
234  // Decoder converts the output of encoder back to a unichar-id.
235  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash>
236  decoder_;
237  // True if the index is a valid single or start code.
238  GenericVector<bool> is_valid_start_;
239  // Maps a prefix code to a list of valid next codes.
240  // The map owns the vectors.
241  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
243  next_codes_;
244  // Maps a prefix code to a list of valid final codes.
245  // The map owns the vectors.
246  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
248  final_codes_;
249  // Max of any value in encoder_ + 1.
250  int code_range_;
251 };
252 
253 } // namespace tesseract.
254 
255 #endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
int32_t inT32
Definition: host.h:38
bool IsValidFirstCode(int code) const
bool operator==(const RecodedCharID &other) const
const GenericVector< int > * GetNextCodes(const RecodedCharID &code) const
static const int kMaxCodeLen
const GenericVector< int > * GetFinalCodes(const RecodedCharID &code) const
void Set3(int code0, int code1, int code2)
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:97
Definition: strngs.h:45
bool DeSerialize(TFile *fp)
void set_self_normalized(bool value)
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148
int8_t inT8
Definition: host.h:34
void Set(int index, int value)
size_t operator()(const RecodedCharID &code) const
void Truncate(int length)
int operator()(int index) const
bool Serialize(TFile *fp) const
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:108