26 #include "unicode/normalizer2.h" 27 #include "unicode/translit.h" 28 #include "unicode/unorm2.h" 34 str32->
reserve(strlen(utf8_str));
35 int len = strlen(utf8_str);
37 for (
int ch = 0; ch < len; ch += step) {
40 UNICHAR uni_ch(utf8_str + ch, step);
49 for (
int i = 0; i < str32.
length(); ++i) {
52 if (utf8 !=
nullptr) {
60 static const int kNumHyphenPuncUnicodes = 13;
61 static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
63 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,
71 for (
int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
72 if (kHyphenPuncUnicodes[i] == ch)
79 static const int kNumSingleQuoteUnicodes = 8;
80 static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
91 for (
int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
92 if (kSingleQuoteUnicodes[i] == ch)
99 static const int kNumDoubleQuoteUnicodes = 8;
100 static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
110 for (
int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
111 if (kDoubleQuoteUnicodes[i] == ch)
120 for (
int i = 0; i < str32.
length(); ++i) {
123 for (
int j = 0; j < norm_str.
length(); ++j) {
134 const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
135 nullptr,
"nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
137 error_code.assertSuccess();
140 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
141 icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
142 error_code.assertSuccess();
145 for (
int i = 0; i < norm_str.length(); ++i) {
147 if (norm_str[i] ==
' ') {
174 return (static_cast<uinT32>(ch) < 0xD800)
175 || (ch >= 0xE000 && ch <= 0x10FFFF);
180 "Invalid Unicode codepoint: 0x%x\n", ch);
181 return u_isUWhiteSpace(static_cast<UChar32>(ch));
187 #else // avoiding g++ -Wsign-compare warning 190 return static_cast<unsigned int>(res) == strlen(text);
200 n_white += it.utf8_len();
211 n_notwhite += it.utf8_len();
218 !(ch >= 0xFDD0 && ch <= 0xFDEF) &&
219 !(ch >= 0xFFFE && ch <= 0xFFFF) &&
220 !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
221 !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
222 !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
223 !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
224 !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
225 !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
226 !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
227 !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
228 !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
229 !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
230 !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
231 !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
232 !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
233 !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
234 !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
235 !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
236 (!u_isISOControl(static_cast<UChar32>(ch)) ||
237 ch ==
'\n' || ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
243 (!u_isISOControl(static_cast<UChar32>(ch)) ||
244 ch ==
'\n' || ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
250 if (ch != 0x3000)
return ch;
253 if (ch == 0xFF5F)
return 0x2985;
254 if (ch == 0xFF60)
return 0x2986;
257 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
258 const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
259 "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
260 error_code.assertSuccess();
263 fulltohalf->transliterate(uch_str);
static const_iterator begin(const char *utf8_str, const int byte_length)
int SpanUTF8NotWhitespace(const char *text)
bool IsInterchangeValid7BitAscii(const char32 ch)
#define ASSERT_HOST_MSG(x,...)
bool is_single_quote(const char32 ch)
bool is_hyphen_punc(const char32 ch)
static const_iterator end(const char *utf8_str, const int byte_length)
char32 FullwidthToHalfwidth(const char32 ch)
bool IsOCREquivalent(char32 ch1, char32 ch2)
void assign(const char *cstr, int len)
void ensure(inT32 min_capacity)
STRING NormalizeUTF8String(bool decompose, const char *str8)
bool is_double_quote(const char32 ch)
bool IsValidCodepoint(const char32 ch)
void NormalizeChar32(char32 ch, bool decompose, GenericVector< char32 > *str)
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
bool IsWhitespace(const char32 ch)
static int utf8_step(const char *utf8_str)
bool IsUTF8Whitespace(const char *text)
int SpanUTF8Whitespace(const char *text)
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
char32 OCRNormalize(char32 ch)
bool IsInterchangeValid(const char32 ch)