tesseract  4.00.00dev
unicodes.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: unicodes.cpp
3  * Description: Unicode related machinery
4  * Author: David Eger
5  * Created: Wed Jun 15 16:37:50 PST 2011
6  *
7  * (C) Copyright 2011, Google, Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "unicodes.h"
21 #include "host.h" // for NULL
22 
namespace tesseract {

// Unicode separator and bidirectional-control characters, each stored as a
// NUL-terminated narrow string. The \u escapes are encoded in the compiler's
// execution character set; the names assume that set is UTF-8.
const char *kUTF8LineSeparator = "\u2028";       // "\xe2\x80\xa8"
const char *kUTF8ParagraphSeparator = "\u2029";  // "\xe2\x80\xa9"
const char *kLRM = "\u200E";  // Left-to-Right Mark
const char *kRLM = "\u200F";  // Right-to-Left Mark
// RIGHT-TO-LEFT EMBEDDING is U+202B. U+202A is LEFT-TO-RIGHT EMBEDDING and
// must not be used here.
const char *kRLE = "\u202B";  // Right-to-Left Embedding
const char *kPDF = "\u202C";  // Pop Directional Formatting

// Characters that look like a hyphen or dash. The list is terminated by a
// NULL entry, so it can be walked without knowing its length.
const char *kHyphenLikeUTF8[] = {
  "-",       // ASCII hyphen-minus
  "\u05BE",  // word hyphen in Hebrew
  "\u2010",  // hyphen
  "\u2011",  // non-breaking hyphen
  "\u2012",  // a hyphen the same width as digits
  "\u2013",  // en dash
  "\u2014",  // em dash
  "\u2015",  // horizontal bar
  "\u2212",  // arithmetic minus sign
  "\uFE58",  // small em dash
  "\uFE63",  // small hyphen-minus
  "\uFF0D",  // fullwidth hyphen-minus
  NULL,      // end of our list
};

// Characters that look like an apostrophe. The list is terminated by a NULL
// entry, so it can be walked without knowing its length.
const char *kApostropheLikeUTF8[] = {
  "'",       // ASCII apostrophe
  "`",       // ASCII backtick
  "\u2018",  // opening single quote
  "\u2019",  // closing single quote
  "\u2032",  // mathematical prime mark
  NULL,      // end of our list.
};

}  // namespace tesseract
const char * kPDF
Definition: unicodes.cpp:30
const char * kApostropheLikeUTF8[]
Definition: unicodes.cpp:48
const char * kRLM
Definition: unicodes.cpp:28
const char * kRLE
Definition: unicodes.cpp:29
const char * kLRM
Definition: unicodes.cpp:27
const char * kUTF8ParagraphSeparator
Definition: unicodes.cpp:26
const char * kUTF8LineSeparator
Definition: unicodes.cpp:25
const char * kHyphenLikeUTF8[]
Definition: unicodes.cpp:32