tesseract  4.00.00dev
unicharset_training_utils.cpp
Go to the documentation of this file.
1 // File: unicharset_training_utils.cpp
3 // Description: Training utilities for UNICHARSET.
4 // Author: Ray Smith
5 // Created: Fri Oct 17 17:09:01 PDT 2014
6 //
7 // (C) Copyright 2014, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
21 
22 #include <stdlib.h>
23 #include <string.h>
24 #include <string>
25 
26 #include "fileio.h"
27 #include "genericvector.h"
28 #include "icuerrorcode.h"
29 #include "normstrngs.h"
30 #include "statistc.h"
31 #include "strngs.h"
32 #include "unicharset.h"
33 #include "unicode/uchar.h" // from libicu
34 #include "unicode/uscript.h" // from libicu
35 
36 namespace tesseract {
37 
38 // Helper sets the character attribute properties and sets up the script table.
39 // Does not set tops and bottoms.
40 void SetupBasicProperties(bool report_errors, bool decompose,
41  UNICHARSET* unicharset) {
42  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
43  // Convert any custom ligatures.
44  const char* unichar_str = unicharset->id_to_unichar(unichar_id);
45  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
46  if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
47  unichar_str = UNICHARSET::kCustomLigatures[i][0];
48  break;
49  }
50  }
51 
52  // Convert the unichar to UTF32 representation
53  GenericVector<char32> uni_vector;
54  tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
55 
56  // Assume that if the property is true for any character in the string,
57  // then it holds for the whole "character".
58  bool unichar_isalpha = false;
59  bool unichar_islower = false;
60  bool unichar_isupper = false;
61  bool unichar_isdigit = false;
62  bool unichar_ispunct = false;
63 
64  for (int i = 0; i < uni_vector.size(); ++i) {
65  if (u_isalpha(uni_vector[i]))
66  unichar_isalpha = true;
67  if (u_islower(uni_vector[i]))
68  unichar_islower = true;
69  if (u_isupper(uni_vector[i]))
70  unichar_isupper = true;
71  if (u_isdigit(uni_vector[i]))
72  unichar_isdigit = true;
73  if (u_ispunct(uni_vector[i]))
74  unichar_ispunct = true;
75  }
76 
77  unicharset->set_isalpha(unichar_id, unichar_isalpha);
78  unicharset->set_islower(unichar_id, unichar_islower);
79  unicharset->set_isupper(unichar_id, unichar_isupper);
80  unicharset->set_isdigit(unichar_id, unichar_isdigit);
81  unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
82 
84  unicharset->set_script(unichar_id, uscript_getName(
85  uscript_getScript(uni_vector[0], err)));
86 
87  const int num_code_points = uni_vector.size();
88  // Obtain the lower/upper case if needed and record it in the properties.
89  unicharset->set_other_case(unichar_id, unichar_id);
90  if (unichar_islower || unichar_isupper) {
91  GenericVector<char32> other_case(num_code_points, 0);
92  for (int i = 0; i < num_code_points; ++i) {
93  // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
94  // However since they deal with UChars (so need a conversion function
95  // from char32 or UTF8string) and require a meaningful locale string,
96  // for now u_tolower()/u_toupper() are used.
97  other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
98  u_tolower(uni_vector[i]);
99  }
100  STRING other_case_uch;
101  tesseract::UTF32ToUTF8(other_case, &other_case_uch);
102  UNICHAR_ID other_case_id =
103  unicharset->unichar_to_id(other_case_uch.c_str());
104  if (other_case_id != INVALID_UNICHAR_ID) {
105  unicharset->set_other_case(unichar_id, other_case_id);
106  } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
107  tprintf("Other case %s of %s is not in unicharset\n",
108  other_case_uch.c_str(), unichar_str);
109  }
110  }
111 
112  // Set RTL property and obtain mirror unichar ID from ICU.
113  GenericVector<char32> mirrors(num_code_points, 0);
114  for (int i = 0; i < num_code_points; ++i) {
115  mirrors[i] = u_charMirror(uni_vector[i]);
116  if (i == 0) { // set directionality to that of the 1st code point
117  unicharset->set_direction(unichar_id,
118  static_cast<UNICHARSET::Direction>(
119  u_charDirection(uni_vector[i])));
120  }
121  }
122  STRING mirror_uch;
123  tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
124  UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
125  if (mirror_uch_id != INVALID_UNICHAR_ID) {
126  unicharset->set_mirror(unichar_id, mirror_uch_id);
127  } else if (report_errors) {
128  tprintf("Mirror %s of %s is not in unicharset\n",
129  mirror_uch.c_str(), unichar_str);
130  }
131 
132  // Record normalized version of this unichar.
133  STRING normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
134  if (unichar_id != 0 && normed_str.length() > 0) {
135  unicharset->set_normed(unichar_id, normed_str.c_str());
136  } else {
137  unicharset->set_normed(unichar_id, unichar_str);
138  }
139  ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
140  }
141  unicharset->post_load_setup();
142 }
143 
144 // Helper to set the properties for an input unicharset file, writes to the
145 // output file. If an appropriate script unicharset can be found in the
146 // script_dir directory, then the tops and bottoms are expanded using the
147 // script unicharset.
148 // If non-empty, xheight data for the fonts are written to the xheights_file.
149 void SetPropertiesForInputFile(const string& script_dir,
150  const string& input_unicharset_file,
151  const string& output_unicharset_file,
152  const string& output_xheights_file) {
153  UNICHARSET unicharset;
154 
155  // Load the input unicharset
156  unicharset.load_from_file(input_unicharset_file.c_str());
157  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
158  input_unicharset_file.c_str());
159 
160  // Set unichar properties
161  tprintf("Setting unichar properties\n");
162  SetupBasicProperties(true, false, &unicharset);
163  string xheights_str;
164  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
165  // Load the unicharset for the script if available.
166  string filename = script_dir + "/" +
167  unicharset.get_script_from_script_id(s) + ".unicharset";
168  UNICHARSET script_set;
169  if (script_set.load_from_file(filename.c_str())) {
170  unicharset.SetPropertiesFromOther(script_set);
171  }
172  // Load the xheights for the script if available.
173  filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
174  ".xheights";
175  string script_heights;
176  if (File::ReadFileToString(filename, &script_heights))
177  xheights_str += script_heights;
178  }
179  if (!output_xheights_file.empty())
180  File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
181  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
182  if (unicharset.PropertiesIncomplete(c)) {
183  tprintf("Warning: properties incomplete for index %d = %s\n",
184  c, unicharset.id_to_unichar(c));
185  }
186  }
187 
188  // Write the output unicharset
189  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
190  unicharset.save_to_file(output_unicharset_file.c_str());
191 }
192 
193 } // namespace tesseract
194 
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:144
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:52
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:401
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
int UNICHAR_ID
Definition: unichar.h:33
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:814
#define tprintf(...)
Definition: tprintf.h:31
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:505
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:442
int size() const
Definition: genericvector.h:72
STRING NormalizeUTF8String(bool decompose, const char *str8)
Definition: normstrngs.cpp:117
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:411
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:432
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:643
int get_script_table_size() const
Definition: unicharset.h:809
void post_load_setup()
Definition: unicharset.cpp:867
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:606
Definition: strngs.h:45
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:32
void SetPropertiesForInputFile(const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:422
bool save_to_file(const char *const filename) const
Definition: unicharset.h:308
const char * filename
Definition: ioapi.h:38
const char * c_str() const
Definition: strngs.cpp:209
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:427
int size() const
Definition: unicharset.h:299
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:391
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:46
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:437
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:406
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:396
static bool ReadFileToString(const string &filename, string *out)
Definition: fileio.cpp:72