tesseract  4.00.00dev
normstrngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: normstrngs.cpp
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  * UTF-8 strings.
5  * Author: Ranjith Unnikrishnan
6  * Created: Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "normstrngs.h"
22 
23 #include <assert.h>
24 #include "icuerrorcode.h"
25 #include "unichar.h"
26 #include "unicode/normalizer2.h" // From libicu
27 #include "unicode/translit.h" // From libicu
28 #include "unicode/unorm2.h" // From libicu
29 
30 namespace tesseract {
31 
32 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32) {
33  str32->clear();
34  str32->reserve(strlen(utf8_str));
35  int len = strlen(utf8_str);
36  int step = 0;
37  for (int ch = 0; ch < len; ch += step) {
38  step = UNICHAR::utf8_step(utf8_str + ch);
39  if (step > 0) {
40  UNICHAR uni_ch(utf8_str + ch, step);
41  (*str32) += uni_ch.first_uni();
42  }
43  }
44 }
45 
46 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str) {
47  utf8_str->ensure(str32.length());
48  utf8_str->assign("", 0);
49  for (int i = 0; i < str32.length(); ++i) {
50  UNICHAR uni_ch(str32[i]);
51  char *utf8 = uni_ch.utf8_str();
52  if (utf8 != nullptr) {
53  (*utf8_str) += utf8;
54  delete[] utf8;
55  }
56  }
57 }
58 
59 bool is_hyphen_punc(const char32 ch) {
60  static const int kNumHyphenPuncUnicodes = 13;
61  static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
62  '-',
63  0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
64  0x207b, // superscript minus
65  0x208b, // subscript minus
66  0x2212, // minus sign
67  0xfe58, // small em dash
68  0xfe63, // small hyphen-minus
69  0xff0d, // fullwidth hyphen-minus
70  };
71  for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
72  if (kHyphenPuncUnicodes[i] == ch)
73  return true;
74  }
75  return false;
76 }
77 
78 bool is_single_quote(const char32 ch) {
79  static const int kNumSingleQuoteUnicodes = 8;
80  static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
81  '\'',
82  '`',
83  0x2018, // left single quotation mark (English, others)
84  0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
85  // We may have to introduce a comma set with 0x201a
86  0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
87  0x2032, // prime
88  0x300C, // left corner bracket (East Asian languages)
89  0xFF07, // fullwidth apostrophe
90  };
91  for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
92  if (kSingleQuoteUnicodes[i] == ch)
93  return true;
94  }
95  return false;
96 }
97 
98 bool is_double_quote(const char32 ch) {
99  static const int kNumDoubleQuoteUnicodes = 8;
100  static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
101  '"',
102  0x201C, // left double quotation mark (English, others)
103  0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
104  0x201F, // double high-reversed-9 quotation mark (PropList.txt)
105  0x2033, // double prime
106  0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
107  0x301E, // close double prime (East Asian languages written horizontally)
108  0xFF02, // fullwidth quotation mark
109  };
110  for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
111  if (kDoubleQuoteUnicodes[i] == ch)
112  return true;
113  }
114  return false;
115 }
116 
117 STRING NormalizeUTF8String(bool decompose, const char* str8) {
118  GenericVector<char32> str32, out_str32, norm_str;
119  UTF8ToUTF32(str8, &str32);
120  for (int i = 0; i < str32.length(); ++i) {
121  norm_str.clear();
122  NormalizeChar32(str32[i], decompose, &norm_str);
123  for (int j = 0; j < norm_str.length(); ++j) {
124  out_str32.push_back(norm_str[j]);
125  }
126  }
127  STRING out_str8;
128  UTF32ToUTF8(out_str32, &out_str8);
129  return out_str8;
130 }
131 
132 void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
133  IcuErrorCode error_code;
134  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
135  nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
136  error_code);
137  error_code.assertSuccess();
138  error_code.reset();
139 
140  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
141  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
142  error_code.assertSuccess();
143 
144  str->clear();
145  for (int i = 0; i < norm_str.length(); ++i) {
146  // If any spaces were added by NFKC, pretend normalization is a nop.
147  if (norm_str[i] == ' ') {
148  str->clear();
149  str->push_back(ch);
150  break;
151  } else {
152  str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
153  }
154  }
155 }
156 
157 // Apply just the OCR-specific normalizations and return the normalized char.
159  if (is_hyphen_punc(ch))
160  return '-';
161  else if (is_single_quote(ch))
162  return '\'';
163  else if (is_double_quote(ch))
164  return '"';
165  return ch;
166 }
167 
168 bool IsOCREquivalent(char32 ch1, char32 ch2) {
169  return OCRNormalize(ch1) == OCRNormalize(ch2);
170 }
171 
172 bool IsValidCodepoint(const char32 ch) {
173  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
174  return (static_cast<uinT32>(ch) < 0xD800)
175  || (ch >= 0xE000 && ch <= 0x10FFFF);
176 }
177 
178 bool IsWhitespace(const char32 ch) {
180  "Invalid Unicode codepoint: 0x%x\n", ch);
181  return u_isUWhiteSpace(static_cast<UChar32>(ch));
182 }
183 
184 bool IsUTF8Whitespace(const char* text) {
185 #if 0 // intent
186  return SpanUTF8Whitespace(text) == strlen(text);
187 #else // avoiding g++ -Wsign-compare warning
188  const int res = SpanUTF8Whitespace(text);
189  assert(0 <= res);
190  return static_cast<unsigned int>(res) == strlen(text);
191 #endif
192 }
193 
194 int SpanUTF8Whitespace(const char* text) {
195  int n_white = 0;
196  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
197  it != UNICHAR::end(text, strlen(text));
198  ++it) {
199  if (!IsWhitespace(*it)) break;
200  n_white += it.utf8_len();
201  }
202  return n_white;
203 }
204 
205 int SpanUTF8NotWhitespace(const char* text) {
206  int n_notwhite = 0;
207  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
208  it != UNICHAR::end(text, strlen(text));
209  ++it) {
210  if (IsWhitespace(*it)) break;
211  n_notwhite += it.utf8_len();
212  }
213  return n_notwhite;
214 }
215 
216 bool IsInterchangeValid(const char32 ch) {
217  return IsValidCodepoint(ch) &&
218  !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
219  !(ch >= 0xFFFE && ch <= 0xFFFF) &&
220  !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
221  !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
222  !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
223  !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
224  !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
225  !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
226  !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
227  !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
228  !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
229  !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
230  !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
231  !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
232  !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
233  !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
234  !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
235  !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
236  (!u_isISOControl(static_cast<UChar32>(ch)) ||
237  ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
238 }
239 
241  return IsValidCodepoint(ch) &&
242  ch <= 128 &&
243  (!u_isISOControl(static_cast<UChar32>(ch)) ||
244  ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
245 }
246 
248  // Return unchanged if not in the fullwidth-halfwidth Unicode block.
249  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
250  if (ch != 0x3000) return ch;
251  }
252  // Special case for fullwidth left and right "white parentheses".
253  if (ch == 0xFF5F) return 0x2985;
254  if (ch == 0xFF60) return 0x2986;
255  // Construct a full-to-half width transliterator.
256  IcuErrorCode error_code;
257  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
258  const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
259  "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
260  error_code.assertSuccess();
261  error_code.reset();
262 
263  fulltohalf->transliterate(uch_str);
264  delete fulltohalf;
265  ASSERT_HOST(uch_str.length() != 0);
266  return uch_str[0];
267 }
268 
269 } // namespace tesseract
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:205
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:240
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
bool is_single_quote(const char32 ch)
Definition: normstrngs.cpp:78
bool is_hyphen_punc(const char32 ch)
Definition: normstrngs.cpp:59
void reserve(int size)
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
signed int char32
Definition: normstrngs.h:27
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:247
int push_back(T object)
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:168
void assign(const char *cstr, int len)
Definition: strngs.cpp:422
void ensure(inT32 min_capacity)
Definition: strngs.h:121
STRING NormalizeUTF8String(bool decompose, const char *str8)
Definition: normstrngs.cpp:117
#define ASSERT_HOST(x)
Definition: errcode.h:84
char * utf8_str() const
Definition: unichar.cpp:125
bool is_double_quote(const char32 ch)
Definition: normstrngs.cpp:98
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:172
Definition: strngs.h:45
void NormalizeChar32(char32 ch, bool decompose, GenericVector< char32 > *str)
Definition: normstrngs.cpp:132
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:32
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:178
int first_uni() const
Definition: unichar.cpp:97
int length() const
Definition: genericvector.h:85
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:184
int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:194
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:46
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:158
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:216