53 result ^= rs.
radical[i] << (6 * i + 8);
60 typedef std::unordered_map<int, RadicalStroke>
RSMap;
62 typedef std::unordered_map<RadicalStroke, int, RadicalStrokedHash>
RSCounts;
68 static bool DecodeRadicalStrokeTable(
STRING* radical_stroke_table,
71 radical_stroke_table->
split(
'\n', &lines);
72 for (
int i = 0; i < lines.
size(); ++i) {
73 if (lines[i].length() == 0 || lines[i][0] ==
'#')
continue;
76 if (sscanf(lines[i].
string(),
"%x\t%d.%d", &unicode, &radical, &strokes) ==
79 }
else if (sscanf(lines[i].
string(),
"%x\t%d'.%d", &unicode, &radical,
83 tprintf(
"Invalid format in radical stroke table at line %d: %s\n", i,
87 (*radical_map)[unicode] =
RadicalStroke(str_radical, strokes);
97 encoder_ = src.encoder_;
98 code_range_ = src.code_range_;
108 STRING* radical_stroke_table) {
110 if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
124 RSCounts radical_counts;
127 int hangul_offset = unicharset.
size();
133 int han_offset = hangul_offset + kTotalJamos;
134 int max_num_strokes = -1;
135 for (
int u = 0;
u <= unicharset.
size(); ++
u) {
136 bool self_normalized =
false;
139 if (
u == unicharset.
size()) {
141 self_normalized =
true;
152 if (
u < unicharset.
size() &&
154 unicodes.
size() == 1) {
156 int unicode = unicodes[0];
157 int leading, vowel, trailing;
158 auto it = radical_map.find(unicode);
159 if (it != radical_map.end()) {
164 int radical = radicals.
unichar_to_id(it->second.radical.string());
165 int num_strokes = it->second.num_strokes;
166 int num_samples = radical_counts[it->second]++;
167 if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
168 code.Set3(radical + han_offset, num_strokes + han_offset,
169 num_samples + han_offset);
173 code.Set3(leading + hangul_offset, vowel +
kLCount + hangul_offset,
178 if (code.length() == 0) {
188 for (
int i = 0; i < unicodes.
size(); ++i) {
189 int position = code.length();
191 tprintf(
"Unichar %d=%s->%s is too long to encode!!\n",
u,
196 int uni = unicodes[i];
203 if (direct_set.
size() > unicharset.
size()) {
205 tprintf(
"Code space expanded from original unicharset!!\n");
211 code.set_self_normalized(self_normalized);
212 encoder_.push_back(code);
219 int num_radicals = radicals.
size();
220 for (
int u = 0;
u < unicharset.
size(); ++
u) {
222 if ((*code)(0) >= han_offset) {
223 code->
Set(1, (*code)(1) + num_radicals);
224 code->
Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
227 DefragmentCodeValues(null_id >= 0 ? 1 : -1);
236 for (
int u = 0;
u < unicharset.
size(); ++
u) {
253 void UnicharCompress::DefragmentCodeValues(
int encoded_null) {
261 for (
int c = 0; c < encoder_.size(); ++c) {
263 for (
int i = 0; i < code.
length(); ++i) {
264 offsets[code(i)] = 1;
269 for (
int i = 0; i < offsets.
size(); ++i) {
272 if (offsets[i] == 0 || i == encoded_null) {
278 if (encoded_null >= 0) {
281 offsets[encoded_null] = offsets.
size() + offsets.
back() - encoded_null;
284 for (
int c = 0; c < encoder_.size(); ++c) {
286 for (
int i = 0; i < code->
length(); ++i) {
287 int value = (*code)(i);
288 code->
Set(i, value + offsets[value]);
297 if (unichar_id < 0 || unichar_id >= encoder_.size())
return 0;
298 *code = encoder_[unichar_id];
307 auto it = decoder_.find(code);
308 if (it == decoder_.end())
return INVALID_UNICHAR_ID;
314 return encoder_.SerializeClasses(fp);
319 if (!encoder_.DeSerializeClasses(fp))
return false;
335 for (
int c = 0; c < encoder_.size(); ++c) {
342 for (
int i = 1; i < code.
length(); ++i) {
367 *leading = offset / kNCount;
368 *vowel = (offset % kNCount) / kTCount;
374 void UnicharCompress::ComputeCodeRange() {
376 for (
int c = 0; c < encoder_.size(); ++c) {
378 for (
int i = 0; i < code.
length(); ++i) {
379 if (code(i) > code_range_) code_range_ = code(i);
386 void UnicharCompress::SetupDecoder() {
389 for (
int c = 0; c < encoder_.size(); ++c) {
393 is_valid_start_[code(0)] =
true;
395 int len = code.
length() - 1;
397 auto final_it = final_codes_.find(prefix);
398 if (final_it == final_codes_.end()) {
401 final_codes_[prefix] = code_list;
404 auto next_it = next_codes_.find(prefix);
405 if (next_it == next_codes_.end()) {
408 next_codes_[prefix] = code_list;
412 if (!next_it->second->contains(code(len)))
418 if (!final_it->second->contains(code(len)))
419 final_it->second->push_back(code(len));
425 void UnicharCompress::Cleanup() {
427 is_valid_start_.
clear();
428 for (
auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
431 for (
auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
435 final_codes_.clear();
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
void add_str_int(const char *str, int number)
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
UnicharCompress & operator=(const UnicharCompress &src)
void init_to_size(int size, T t)
bool contains_unichar(const char *const unichar_repr) const
std::unordered_map< int, RadicalStroke > RSMap
bool self_normalized() const
static const int kMaxCodeLen
static const int kFirstHangul
RadicalStroke(const STRING &r, int s)
const char * id_to_unichar(UNICHAR_ID id) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
bool Serialize(TFile *fp) const
bool has_special_codes() const
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
int DecodeUnichar(const RecodedCharID &code) const
void SetupDirect(const GenericVector< RecodedCharID > &codes)
void SetupPassThrough(const UNICHARSET &unicharset)
bool operator==(const RadicalStroke &other) const
void Set(int index, int value)
bool DeSerialize(TFile *fp)
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
static const int kNumHangul
size_t operator()(const RadicalStroke &rs) const
std::unordered_map< RadicalStroke, int, RadicalStrokedHash > RSCounts
void Truncate(int length)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
void split(const char c, GenericVector< STRING > *splited)
void unichar_insert(const char *const unichar_repr)