/*
 * Copyright (C) 2006 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17//
18
19#ifndef ANDROID_UNICODE_H
20#define ANDROID_UNICODE_H
21
22#include <stdint.h>
23#include <sys/types.h>
24
/** Code point U+FFFD, exposed under the name REPLACEMENT_CHAR. */
#define REPLACEMENT_CHAR (0xFFFD)
26
// The UChar32 definition below is copied from ICU's umachine.h.
/**
 * UChar32 holds a single Unicode code point.
 * It is a signed 32-bit integer (identical to int32_t).
 *
 * Valid Unicode code points lie in the range 0..0x10ffff.
 * Every other value (negative, or >= 0x110000) is illegal as a code point,
 * which makes such values usable as sentinels meaning "done", "error",
 * or similar non-code-point conditions.
 *
 * @stable ICU 2.4
 */
typedef int32_t UChar32;
40
41namespace android {
42
43 class Encoding;
44 /**
45 * \class Unicode
46 *
47 * Helper class for getting properties of Unicode characters. Characters
48 * can have one of the types listed in CharType and each character can have the
49 * directionality of Direction.
50 */
51 class Unicode
52 {
53 public:
54 /**
55 * Directions specified in the Unicode standard. These directions map directly
56 * to java.lang.Character.
57 */
58 enum Direction {
59 DIRECTIONALITY_UNDEFINED = -1,
60 DIRECTIONALITY_LEFT_TO_RIGHT,
61 DIRECTIONALITY_RIGHT_TO_LEFT,
62 DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
63 DIRECTIONALITY_EUROPEAN_NUMBER,
64 DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
65 DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
66 DIRECTIONALITY_ARABIC_NUMBER,
67 DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
68 DIRECTIONALITY_NONSPACING_MARK,
69 DIRECTIONALITY_BOUNDARY_NEUTRAL,
70 DIRECTIONALITY_PARAGRAPH_SEPARATOR,
71 DIRECTIONALITY_SEGMENT_SEPARATOR,
72 DIRECTIONALITY_WHITESPACE,
73 DIRECTIONALITY_OTHER_NEUTRALS,
74 DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
75 DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
76 DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
77 DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
78 DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
79 };
80
81 /**
82 * Character types as specified in the Unicode standard. These map directly to
83 * java.lang.Character.
84 */
85 enum CharType {
86 CHARTYPE_UNASSIGNED = 0,
87 CHARTYPE_UPPERCASE_LETTER,
88 CHARTYPE_LOWERCASE_LETTER,
89 CHARTYPE_TITLECASE_LETTER,
90 CHARTYPE_MODIFIER_LETTER,
91 CHARTYPE_OTHER_LETTER,
92 CHARTYPE_NON_SPACING_MARK,
93 CHARTYPE_ENCLOSING_MARK,
94 CHARTYPE_COMBINING_SPACING_MARK,
95 CHARTYPE_DECIMAL_DIGIT_NUMBER,
96 CHARTYPE_LETTER_NUMBER,
97 CHARTYPE_OTHER_NUMBER,
98 CHARTYPE_SPACE_SEPARATOR,
99 CHARTYPE_LINE_SEPARATOR,
100 CHARTYPE_PARAGRAPH_SEPARATOR,
101 CHARTYPE_CONTROL,
102 CHARTYPE_FORMAT,
103 CHARTYPE_MISSING_VALUE_FOR_JAVA, /* This is the mysterious missing 17 value from the java constants */
104 CHARTYPE_PRIVATE_USE,
105 CHARTYPE_SURROGATE,
106 CHARTYPE_DASH_PUNCTUATION,
107 CHARTYPE_START_PUNCTUATION,
108 CHARTYPE_END_PUNCTUATION,
109 CHARTYPE_CONNECTOR_PUNCTUATION,
110 CHARTYPE_OTHER_PUNCTUATION,
111 CHARTYPE_MATH_SYMBOL,
112 CHARTYPE_CURRENCY_SYMBOL,
113 CHARTYPE_MODIFIER_SYMBOL,
114 CHARTYPE_OTHER_SYMBOL,
115 CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
116 CHARTYPE_FINAL_QUOTE_PUNCTUATION
117 };
118
119 /**
120 * Decomposition types as described by the unicode standard. These values map to
121 * the same values in uchar.h in ICU.
122 */
123 enum DecompositionType {
124 DECOMPOSITION_NONE = 0,
125 DECOMPOSITION_CANONICAL,
126 DECOMPOSITION_COMPAT,
127 DECOMPOSITION_CIRCLE,
128 DECOMPOSITION_FINAL,
129 DECOMPOSITION_FONT,
130 DECOMPOSITION_FRACTION,
131 DECOMPOSITION_INITIAL,
132 DECOMPOSITION_ISOLATED,
133 DECOMPOSITION_MEDIAL,
134 DECOMPOSITION_NARROW,
135 DECOMPOSITION_NOBREAK,
136 DECOMPOSITION_SMALL,
137 DECOMPOSITION_SQUARE,
138 DECOMPOSITION_SUB,
139 DECOMPOSITION_SUPER,
140 DECOMPOSITION_VERTICAL,
141 DECOMPOSITION_WIDE
142 };
143
144 /**
145 * Returns the packed data for java calls
146 * @param c The unicode character.
147 * @return The packed data for the character.
148 *
149 * Copied from java.lang.Character implementation:
150 * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
151 * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
152 *
153 * 31 types ---------
154 * 18 directionalities ---------
155 * 2 mirroreds -
156 * ----------- 56 toupper diffs
157 * ----------- 48 tolower diffs
158 * --- 4 totitlecase diffs
159 * ------------- 84 numeric values
160 * --------- 24 mirror char diffs
161 */
162 static uint32_t getPackedData(UChar32 c);
163
164 /**
165 * Get the Character type.
166 * @param c The unicode character.
167 * @return The character's type or CHARTYPE_UNASSIGNED if the character is invalid
168 * or has an unassigned class.
169 */
170 static CharType getType(UChar32 c);
171
172 /**
173 * Get the Character's decomposition type.
174 * @param c The unicode character.
175 * @return The character's decomposition type or DECOMPOSITION_NONE is there
176 * is no decomposition.
177 */
178 static DecompositionType getDecompositionType(UChar32 c);
179
180 /**
181 * Returns the digit value of a character or -1 if the character
182 * is not within the specified radix.
183 *
184 * The digit value is computed for integer characters and letters
185 * within the given radix. This function does not handle Roman Numerals,
186 * fractions, or any other characters that may represent numbers.
187 *
188 * @param c The unicode character
189 * @param radix The intended radix.
190 * @return The digit value or -1 if there is no digit value or if the value is outside the radix.
191 */
192 static int getDigitValue(UChar32 c, int radix = 10);
193
194 /**
195 * Return the numeric value of a character
196 *
197 * @param c The unicode character.
198 * @return The numeric value of the character. -1 if the character has no numeric value,
199 * -2 if the character has a numeric value that is not representable by an integer.
200 */
201 static int getNumericValue(UChar32 c);
202
203 /**
204 * Convert the character to lowercase
205 * @param c The unicode character.
206 * @return The lowercase character equivalent of c. If c does not have a lowercase equivalent,
207 * the original character is returned.
208 */
209 static UChar32 toLower(UChar32 c);
210
211 /**
212 * Convert the character to uppercase
213 * @param c The unicode character.
214 * @return The uppercase character equivalent of c. If c does not have an uppercase equivalent,
215 * the original character is returned.
216 */
217 static UChar32 toUpper(UChar32 c);
218
219 /**
220 * Get the directionality of the character.
221 * @param c The unicode character.
222 * @return The direction of the character or DIRECTIONALITY_UNDEFINED.
223 */
224 static Direction getDirectionality(UChar32 c);
225
226 /**
227 * Check if the character is a mirrored character. This means that the character
228 * has an equivalent character that is the mirror image of itself.
229 * @param c The unicode character.
230 * @return True iff c has a mirror equivalent.
231 */
232 static bool isMirrored(UChar32 c);
233
234 /**
235 * Return the mirror of the given character.
236 * @param c The unicode character.
237 * @return The mirror equivalent of c. If c does not have a mirror equivalent,
238 * the original character is returned.
239 * @see isMirrored
240 */
241 static UChar32 toMirror(UChar32 c);
242
243 /**
244 * Convert the character to title case.
245 * @param c The unicode character.
246 * @return The titlecase equivalent of c. If c does not have a titlecase equivalent,
247 * the original character is returned.
248 */
249 static UChar32 toTitle(UChar32 c);
250
251 };
252
253}
254
255#endif