tesseract  4.00.00dev
normstrngs.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: normstrngs.h
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  * UTF-8 strings.
5  * Author: Ranjith Unnikrishnan
6  * Created: Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
23 
24 #include "genericvector.h"
25 #include "strngs.h"
26 
27 typedef signed int char32;
28 
29 namespace tesseract {
30 
31 // UTF-8 to UTF-32 conversion function.
32 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
33 
34 // UTF-32 to UTF-8 conversion function.
35 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
36 
37 // Normalize a single char32 using NFKC + OCR-specific transformations.
38 // NOTE that proper NFKC may require multiple characters as input. The
39 // assumption of this function is that the input is already as fully composed
40 // as it can be, but may require some compatibility normalizations or just
41 // OCR evaluation related normalizations.
42 void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);
43 
44 // Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
45 // can contain multiple UTF32 code points.
46 STRING NormalizeUTF8String(bool decompose, const char* str8);
47 // Convenience overload: forwards to NormalizeUTF8String(false, str8), i.e. it
48 // composes by default, until it is proven that decomposing benefits at least one language.
49 inline STRING NormalizeUTF8String(const char* str8) {
50  return NormalizeUTF8String(false, str8);
51 }
52 
53 // Apply just the OCR-specific normalizations and return the normalized char.
54 char32 OCRNormalize(char32 ch);
55 
56 // Returns true if the OCRNormalized ch1 and ch2 are the same.
57 bool IsOCREquivalent(char32 ch1, char32 ch2);
58 
59 // Returns true if the value lies in the range of valid unicodes.
60 bool IsValidCodepoint(const char32 ch);
61 
62 // Returns true if a code point has the White_Space Unicode property.
63 bool IsWhitespace(const char32 ch);
64 // Returns true if every char in the given (null-terminated) string has the
65 // White_Space Unicode property.
66 bool IsUTF8Whitespace(const char* text);
67 
68 // Returns the length in bytes of the prefix of 'text' whose characters have
69 // the White_Space unicode property.
70 int SpanUTF8Whitespace(const char* text);
71 
72 // Returns the length in bytes of the prefix of 'text' whose characters DO NOT
73 // have the White_Space unicode property.
74 int SpanUTF8NotWhitespace(const char* text);
75 
76 // Returns true if the char is interchange valid i.e. no C0 or C1 control codes
77 // (other than CR LF HT FF) and no non-characters.
78 bool IsInterchangeValid(const char32 ch);
79 // Same as above but restricted to 7-bit ASCII.
80 bool IsInterchangeValid7BitAscii(const char32 ch);
81 
82 // Convert a full-width UTF-8 string to half-width.
83 char32 FullwidthToHalfwidth(const char32 ch);
84 
85 } // namespace tesseract
86 
87 #endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:205
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:240
signed int char32
Definition: normstrngs.h:27
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:247
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:168
STRING NormalizeUTF8String(bool decompose, const char *str8)
Definition: normstrngs.cpp:117
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:172
STRING
Definition: strngs.h:45
void NormalizeChar32(char32 ch, bool decompose, GenericVector< char32 > *str)
Definition: normstrngs.cpp:132
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:32
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:178
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:184
int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:194
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:46
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:158
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:216