unicharcompress.cpp
// File:        unicharcompress.cpp
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
// Created:     Wed Mar 04 14:45:01 PST 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "unicharcompress.h"
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";

// Local struct used only for processing the radical-stroke table.
struct RadicalStroke {
  RadicalStroke() : num_strokes(0) {}
  RadicalStroke(const STRING& r, int s) : radical(r), num_strokes(s) {}

  bool operator==(const RadicalStroke& other) const {
    return radical == other.radical && num_strokes == other.num_strokes;
  }

  // The radical is encoded as a string because its format is an int with
  // an optional ' mark to indicate a simplified shape. To treat these as
  // distinct, we use a string and a UNICHARSET to do the integer mapping.
  STRING radical;
  // The number of strokes we treat as dense and just take the face value from
  // the table.
  int num_strokes;
};

// Hash functor for RadicalStroke.
struct RadicalStrokedHash {
  size_t operator()(const RadicalStroke& rs) const {
    size_t result = rs.num_strokes;
    for (int i = 0; i < rs.radical.length(); ++i) {
      result ^= rs.radical[i] << (6 * i + 8);
    }
    return result;
  }
};

// A hash map to convert unicodes to radical,stroke pair.
typedef std::unordered_map<int, RadicalStroke> RSMap;
// A hash map to count occurrences of each radical,stroke pair.
typedef std::unordered_map<RadicalStroke, int, RadicalStrokedHash> RSCounts;

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalStrokeTable(STRING* radical_stroke_table,
                                     RSMap* radical_map) {
  GenericVector<STRING> lines;
  radical_stroke_table->split('\n', &lines);
  for (int i = 0; i < lines.size(); ++i) {
    if (lines[i].length() == 0 || lines[i][0] == '#') continue;
    int unicode, radical, strokes;
    STRING str_radical;
    if (sscanf(lines[i].string(), "%x\t%d.%d", &unicode, &radical, &strokes) ==
        3) {
      str_radical.add_str_int("", radical);
    } else if (sscanf(lines[i].string(), "%x\t%d'.%d", &unicode, &radical,
                      &strokes) == 3) {
      str_radical.add_str_int("'", radical);
    } else {
      tprintf("Invalid format in radical stroke table at line %d: %s\n", i,
              lines[i].string());
      return false;
    }
    (*radical_map)[unicode] = RadicalStroke(str_radical, strokes);
  }
  return true;
}
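
The two sscanf patterns handle entries in the style of the Unihan radical-stroke data, where an apostrophe after the radical number marks a simplified radical form. Below is a self-contained sketch of the same parsing on hypothetical table lines (the entries are illustrative, not taken from the real radical-stroke.txt):

#include <cstdio>

int main() {
  // Hypothetical entries: hex codepoint, tab, radical (optional '), strokes.
  const char* lines[] = {"4E2D\t2.3", "8BA2\t149'.2"};
  for (const char* line : lines) {
    int unicode, radical, strokes;
    // The plain pattern stops at the apostrophe (returning 2, not 3), so
    // the simplified-radical pattern is tried second, exactly as above.
    if (sscanf(line, "%x\t%d.%d", &unicode, &radical, &strokes) == 3) {
      printf("U+%04X -> radical %d, %d strokes\n", unicode, radical, strokes);
    } else if (sscanf(line, "%x\t%d'.%d", &unicode, &radical, &strokes) == 3) {
      printf("U+%04X -> simplified radical %d', %d strokes\n", unicode,
             radical, strokes);
    }
  }
  return 0;
}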

UnicharCompress::UnicharCompress() : code_range_(0) {}
UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; }
UnicharCompress::~UnicharCompress() { Cleanup(); }
UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                                      STRING* radical_stroke_table) {
  RSMap radical_map;
  if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
    return false;
  encoder_.clear();
  UNICHARSET direct_set;
  UNICHARSET radicals;
  // To avoid unused codes, clear the special codes from the unicharsets.
  direct_set.clear();
  radicals.clear();
  // Always keep space as 0.
  direct_set.unichar_insert(" ");
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  int max_num_strokes = -1;
  for (int u = 0; u <= unicharset.size(); ++u) {
    bool self_normalized = false;
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size()) {
      if (u == null_id) {
        self_normalized = true;
      } else {
        break;  // Finished.
      }
    } else {
      self_normalized = strcmp(unicharset.id_to_unichar(u),
                               unicharset.get_normed_unichar(u)) == 0;
    }
    RecodedCharID code;
    // Convert to unicodes.
    GenericVector<int> unicodes;
    if (u < unicharset.size() &&
        UNICHAR::UTF8ToUnicode(unicharset.get_normed_unichar(u), &unicodes) &&
        unicodes.size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Convert to radical, stroke, index.
        if (!radicals.contains_unichar(it->second.radical.string())) {
          radicals.unichar_insert(it->second.radical.string());
        }
        int radical = radicals.unichar_to_id(it->second.radical.string());
        int num_strokes = it->second.num_strokes;
        int num_samples = radical_counts[it->second]++;
        if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
        code.Set3(radical + han_offset, num_strokes + han_offset,
                  num_samples + han_offset);
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.length() == 0) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0);  // Space.
      } else if (u == null_id || (unicharset.has_special_codes() &&
                                  u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int i = 0; i < unicodes.size(); ++i) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s->%s is too long to encode!!\n", u,
                    unicharset.id_to_unichar(u),
                    unicharset.get_normed_unichar(u));
            return false;
          }
          int uni = unicodes[i];
          UNICHAR unichar(uni);
          char* utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8))
            direct_set.unichar_insert(utf8);
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() > unicharset.size()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    code.set_self_normalized(self_normalized);
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
  // code uses the rest for the max number of duplicated radical/stroke combos.
  int num_radicals = radicals.size();
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID* code = &encoder_[u];
    if ((*code)(0) >= han_offset) {
      code->Set(1, (*code)(1) + num_radicals);
      code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
    }
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
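
The sketch below shows how a caller might drive ComputeEncoding and EncodeUnichar together. It is a minimal sketch, not code from the training pipeline: PrintCodeForUnichar is a hypothetical helper, and the caller is assumed to have loaded a trained UNICHARSET and the contents of radical-stroke.txt into rs_table.

#include "tprintf.h"
#include "unicharcompress.h"
#include "unicharset.h"

namespace tesseract {

// Hypothetical helper: builds the encoding, then prints the code sequence
// for one unichar. Passing unicharset.size() as null_id relies on the
// special case documented in ComputeEncoding. Note that ComputeEncoding
// consumes (splits) rs_table.
static bool PrintCodeForUnichar(const UNICHARSET& unicharset,
                                STRING* rs_table, const char* utf8) {
  UnicharCompress compress;
  if (!compress.ComputeEncoding(unicharset, unicharset.size(), rs_table))
    return false;
  RecodedCharID code;
  int len = compress.EncodeUnichar(unicharset.unichar_to_id(utf8), &code);
  for (int i = 0; i < len; ++i) tprintf("%d ", code(i));
  tprintf("\n");
  return len > 0;
}

}  // namespace tesseract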

// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) {
  GenericVector<RecodedCharID> codes;
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  SetupDirect(codes);
}

// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const GenericVector<RecodedCharID>& codes) {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

// Renumbers codes to eliminate unused values.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that
  // not all numbers of strokes are used.
  ComputeCodeRange();
  GenericVector<int> offsets;
  offsets.init_to_size(code_range_, 0);
  // Find which codes are used.
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (int i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    if (offsets[i] == 0 || i == encoded_null) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end (which is
    // offsets.size() + offsets.back()), for the benefit of TensorFlow.
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (int c = 0; c < encoder_.size(); ++c) {
    RecodedCharID* code = &encoder_[c];
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  ComputeCodeRange();
}
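
To make the two offset passes concrete, here is a small self-contained replica of the computation with illustrative values and no encoded_null: used codes {0, 2, 5} in a range of 6 compact to {0, 1, 2}.

#include <cstdio>
#include <vector>

int main() {
  const int kRange = 6;
  std::vector<int> used_values = {0, 2, 5};
  std::vector<int> offsets(kRange, 0);
  for (int v : used_values) offsets[v] = 1;  // Mark used codes.
  int offset = 0;
  for (int i = 0; i < kRange; ++i) {
    if (offsets[i] == 0) {
      --offset;  // Unused value: everything above shifts down one more.
    } else {
      offsets[i] = offset;  // Used value: record its downward shift.
    }
  }
  for (int v : used_values)
    printf("%d -> %d\n", v, v + offsets[v]);  // Prints 0->0, 2->1, 5->2.
  return 0;
}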

// Encodes a single unichar_id. Returns the length of the code, or zero if
// the input is invalid, and returns the encoding itself in code.
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
  *code = encoder_[unichar_id];
  return code->length();
}

// Decodes code, returning the original unichar-id, or
// INVALID_UNICHAR_ID if the input is invalid.
int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
  auto it = decoder_.find(code);
  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
  return it->second;
}
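
Encoding and decoding round-trip for unichars with unambiguous codes; RoundTrips below is a hypothetical check, not part of the class. When several unichars share a code, SetupDecoder keeps the self-normalized one, so the equality can legitimately fail for the duplicates.

// Hypothetical check that a unichar_id survives an encode/decode cycle.
bool RoundTrips(const UnicharCompress& compress, int unichar_id) {
  RecodedCharID code;
  if (compress.EncodeUnichar(unichar_id, &code) == 0) return false;
  return compress.DecodeUnichar(code) == unichar_id;
}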

// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile* fp) const {
  return encoder_.SerializeClasses(fp);
}

// Reads from the given file. Returns false in case of error.
bool UnicharCompress::DeSerialize(TFile* fp) {
  if (!encoder_.DeSerializeClasses(fp)) return false;
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

// Returns a STRING containing a text file that describes the encoding thus:
// <index>[,<index>]*<tab><UTF8-str><newline>
// In words, a comma-separated list of one or more indices, followed by a tab
// and the UTF-8 string that the code represents per line. Most simple scripts
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
// and the Indic scripts will contain a many-to-many mapping.
// See the class comment above for details.
STRING UnicharCompress::GetEncodingAsString(
    const UNICHARSET& unicharset) const {
  STRING encoding;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding.add_str_int("", code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding.add_str_int(",", code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                   unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}
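
For illustration only (the indices below are invented; real values depend on the unicharset and on DefragmentCodeValues), the returned text might contain lines such as:

102<tab>a            (simple script: one code per unichar)
103,27<tab>ﬁ         (ligature dissected into two direct codes)
310,413,521<tab>好   (Han: radical, stroke and index codes)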

// Helper decomposes a Hangul unicode into 3 parts: leading, vowel, trailing.
// Note that the returned values are 0-based indices, NOT unicode Jamo.
// Returns false if the input is not in the Hangul unicode range.
/* static */
bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
                                      int* trailing) {
  if (unicode < kFirstHangul) return false;
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) return false;
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}
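
A worked example, using the standard Unicode Hangul composition constants declared in the header (kFirstHangul = 0xAC00, kVCount = 21, kTCount = 28):

// For U+D55C (한):
//   offset   = 0xD55C - 0xAC00    = 10588
//   kNCount  = 21 * 28            = 588
//   leading  = 10588 / 588        = 18  (ㅎ, a 0-based jamo table index)
//   vowel    = (10588 % 588) / 28 = 0   (ㅏ)
//   trailing = 10588 % 28         = 4   (ㄴ)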

// Computes the value of code_range_ from the encoder_.
void UnicharCompress::ComputeCodeRange() {
  code_range_ = -1;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      if (code(i) > code_range_) code_range_ = code(i);
    }
  }
  ++code_range_;
}

// Initializes the decoding hash_map from the encoding array.
void UnicharCompress::SetupDecoder() {
  Cleanup();
  is_valid_start_.init_to_size(code_range_, false);
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (code.self_normalized() || decoder_.find(code) == decoder_.end())
      decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!next_it->second->contains(code(len)))
            next_it->second->push_back(code(len));
          break;  // This prefix has been processed.
        }
      }
    } else {
      if (!final_it->second->contains(code(len)))
        final_it->second->push_back(code(len));
    }
  }
}
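
Together, next_codes_ and final_codes_ act as a prefix trie over all codes: for a given prefix, next_codes_ lists the values that can extend it, while final_codes_ lists the values that complete a whole code. With illustrative codes (3), (5,2) and (5,7), the loop above builds:

// final_codes_: empty prefix -> [3]     (3 by itself is a complete code)
//               prefix (5)   -> [2, 7]  ((5,2) and (5,7) are complete codes)
// next_codes_:  empty prefix -> [5]     (5 can start a longer code)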

// Frees allocated memory.
void UnicharCompress::Cleanup() {
  decoder_.clear();
  is_valid_start_.clear();
  for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
    delete it->second;
  }
  for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
    delete it->second;
  }
  next_codes_.clear();
  final_codes_.clear();
}

}  // namespace tesseract.