blob: 919259e4d30e3f5854713b7c462eb321e084a117 [file] [log] [blame]
Narayan Kamatha5afcfc2015-01-29 20:06:46 +00001/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "utf.h"
18
Vladimir Markoe3bbc3f2015-11-25 11:10:20 +000019#include <map>
Narayan Kamathe16dad12015-02-13 11:49:22 +000020#include <vector>
21
Vladimir Markof1d973d2019-03-19 13:38:34 +000022#include <android-base/stringprintf.h>
23
David Sehre1123402018-02-01 02:46:18 -080024#include "gtest/gtest.h"
25#include "utf-inl.h"
26
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000027namespace art {
28
David Sehre1123402018-02-01 02:46:18 -080029class UtfTest : public testing::Test {};
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000030
31TEST_F(UtfTest, GetLeadingUtf16Char) {
32 EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
33}
34
35TEST_F(UtfTest, GetTrailingUtf16Char) {
36 EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
37 EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
38}
39
// Expects that `end` lies exactly `expected` bytes past `start`.
// The definition deliberately carries no trailing semicolon: the call site supplies it, so each
// use expands to a single statement and stays valid inside an unbraced if/else.
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
43
// NUL-terminated test input holding one UTF-8 sequence of each length, from one to four bytes.
static const uint8_t kAllSequences[] = {
  0x24,                    // One byte.
  0xc2, 0xa2,              // Two bytes.
  0xe2, 0x82, 0xac,        // Three bytes.
  0xf0, 0x9f, 0x8f, 0xa0,  // Four bytes.
  0x00                     // Terminator.
};
52
// NUL-terminated test input in which each half of a surrogate pair (code point U+10400) is
// encoded as its own three-byte sequence.
static const uint8_t kSurrogateEncoding[] = {
  0xed, 0xa0, 0x81,  // 0xd801, the leading half.
  0xed, 0xb0, 0x80,  // 0xdc00, the trailing half.
  0x00               // Terminator.
};
60
61TEST_F(UtfTest, GetUtf16FromUtf8) {
62 const char* const start = reinterpret_cast<const char*>(kAllSequences);
63 const char* ptr = start;
64 uint32_t pair = 0;
65
66 // Single byte sequence.
67 pair = GetUtf16FromUtf8(&ptr);
68 EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
69 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
70 EXPECT_ARRAY_POSITION(1, ptr, start);
71
Bruce Hoult1646d7a2015-10-28 15:06:12 +030072 // Two byte sequence.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000073 pair = GetUtf16FromUtf8(&ptr);
74 EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
75 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
76 EXPECT_ARRAY_POSITION(3, ptr, start);
77
Bruce Hoult1646d7a2015-10-28 15:06:12 +030078 // Three byte sequence.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000079 pair = GetUtf16FromUtf8(&ptr);
80 EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
81 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
82 EXPECT_ARRAY_POSITION(6, ptr, start);
83
84 // Four byte sequence
85 pair = GetUtf16FromUtf8(&ptr);
86 EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
87 EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
88 EXPECT_ARRAY_POSITION(10, ptr, start);
89
Bruce Hoult1646d7a2015-10-28 15:06:12 +030090 // Null terminator.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000091 pair = GetUtf16FromUtf8(&ptr);
92 EXPECT_EQ(0, GetLeadingUtf16Char(pair));
93 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
94 EXPECT_ARRAY_POSITION(11, ptr, start);
95}
96
97TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
98 const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
99 const char* ptr = start;
100 uint32_t pair = 0;
101
102 pair = GetUtf16FromUtf8(&ptr);
103 EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
104 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
105 EXPECT_ARRAY_POSITION(3, ptr, start);
106
107 pair = GetUtf16FromUtf8(&ptr);
108 EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
109 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
110 EXPECT_ARRAY_POSITION(6, ptr, start);
111}
112
113TEST_F(UtfTest, CountModifiedUtf8Chars) {
114 EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
115 EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
116}
117
Andreas Gampeca620d72016-11-08 08:09:33 -0800118static void AssertConversion(const std::vector<uint16_t>& input,
119 const std::vector<uint8_t>& expected) {
Chuck Liao1b9d4422021-07-12 01:25:23 +0000120 ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
Narayan Kamathe16dad12015-02-13 11:49:22 +0000121
122 std::vector<uint8_t> output(expected.size());
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300123 ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
124 &input[0], input.size());
Narayan Kamathe16dad12015-02-13 11:49:22 +0000125 EXPECT_EQ(expected, output);
126}
127
128TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
Chuck Liao1b9d4422021-07-12 01:25:23 +0000129 // Surrogate pairs will be converted into 4 byte sequences.
130 AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
Narayan Kamathe16dad12015-02-13 11:49:22 +0000131
132 // Three byte encodings that are below & above the leading surrogate
133 // range respectively.
134 AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
135 AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
136 // Two byte encoding.
137 AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
138
139 // Two byte special case : 0 must use an overlong encoding.
140 AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
141
142 // One byte encoding.
143 AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
144
145 AssertConversion({
Chuck Liao1b9d4422021-07-12 01:25:23 +0000146 0xd802, 0xdc02, // Surrogate pair.
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300147 0xdef0, 0xdcff, // Three byte encodings.
148 0x0101, 0x0000, // Two byte encodings.
149 'p' , 'p' // One byte encoding.
Narayan Kamathe16dad12015-02-13 11:49:22 +0000150 }, {
Chuck Liao1b9d4422021-07-12 01:25:23 +0000151 0xf0, 0x90, 0xa0, 0x82,
Narayan Kamathe16dad12015-02-13 11:49:22 +0000152 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
153 0xc4, 0x81, 0xc0, 0x80,
154 0x70, 0x70
155 });
156}
157
158TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
159 // Unpaired trailing surrogate at the end of input.
160 AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
161 // Unpaired (or incorrectly paired) surrogates in the middle of the input.
Vladimir Markoe3bbc3f2015-11-25 11:10:20 +0000162 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes {
163 {{ 'h' }, { 'h' }},
164 {{ 0 }, { 0xc0, 0x80 }},
165 {{ 0x81 }, { 0xc2, 0x81 }},
166 {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
167 };
168 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes {
169 {{ 'e' }, { 'e' }},
170 {{ 0 }, { 0xc0, 0x80 }},
171 {{ 0x7ff }, { 0xdf, 0xbf }},
172 {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
173 };
174 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests {
175 {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
176 {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
177 {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
178 {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
179 };
180 for (const auto& prefix : prefixes) {
181 const std::vector<uint16_t>& prefix_in = prefix.first;
182 const std::vector<uint8_t>& prefix_out = prefix.second;
183 for (const auto& test : tests) {
184 const std::vector<uint16_t>& test_in = test.first;
185 const std::vector<uint8_t>& test_out = test.second;
186 for (const auto& suffix : suffixes) {
187 const std::vector<uint16_t>& suffix_in = suffix.first;
188 const std::vector<uint8_t>& suffix_out = suffix.second;
189 std::vector<uint16_t> in = prefix_in;
190 in.insert(in.end(), test_in.begin(), test_in.end());
191 in.insert(in.end(), suffix_in.begin(), suffix_in.end());
192 std::vector<uint8_t> out = prefix_out;
193 out.insert(out.end(), test_out.begin(), test_out.end());
194 out.insert(out.end(), suffix_out.begin(), suffix_out.end());
195 AssertConversion(in, out);
196 }
197 }
198 }
Narayan Kamathe16dad12015-02-13 11:49:22 +0000199}
200
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300201// Old versions of functions, here to compare answers with optimized versions.
202
// Reference implementation: returns how many UTF-16 units the NUL-terminated string `utf8`
// decodes to. Only the lead byte of each sequence is inspected; continuation bytes are skipped
// without being read, so the input must be well formed.
size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t units = 0;
  for (;;) {
    const int lead = *utf8++;
    if (lead == '\0') {
      break;
    }
    ++units;

    // Number of continuation bytes that follow this lead byte.
    size_t continuation = 0;
    if ((lead & 0x80) != 0) {
      continuation = 1;  // At least a two-byte encoding.
      if ((lead & 0x20) != 0) {
        continuation = 2;  // At least a three-byte encoding.
        if ((lead & 0x10) != 0) {
          // Four-byte encoding: it converts into a surrogate pair, i.e. one extra unit.
          continuation = 3;
          ++units;
        }
      }
    }
    utf8 += continuation;
  }
  return units;
}
231
// Reference implementation: returns how many bytes the `char_count` UTF-16 units at `chars`
// occupy once converted. Zero takes two bytes, a properly paired surrogate takes four bytes for
// the whole pair, and any other surrogate takes three bytes on its own.
static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
  size_t total = 0;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = chars[i];
    if (unit >= 0x0001 && unit <= 0x007f) {
      total += 1;
      continue;
    }
    if (unit <= 0x07ff) {
      // Either zero or 0x0080-0x07ff.
      total += 2;
      continue;
    }

    // Everything from here on needs at least three bytes. Only a leading surrogate that is
    // immediately followed by a trailing surrogate is merged into a four-byte sequence; a
    // leading surrogate at the very end of the input, or one followed by anything else, is
    // counted as a three-byte sequence.
    const bool is_leading = unit >= 0xd800 && unit <= 0xdbff;
    if (is_leading && i + 1 < char_count) {
      const uint16_t next = chars[i + 1];
      if (next >= 0xdc00 && next <= 0xdfff) {
        total += 4;
        ++i;  // The trailing half has been accounted for.
        continue;
      }
    }
    total += 3;
  }
  return total;
}
265
// Reference implementation: converts the `char_count` UTF-16 units at `utf16_in` and writes the
// result to `utf8_out`. No terminator is written and the output is not bounds-checked; the
// caller must provide enough room.
static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
                                                 size_t char_count) {
  const uint16_t* const end = utf16_in + char_count;
  while (utf16_in != end) {
    const uint16_t unit = *utf16_in++;

    if (unit != 0 && unit <= 0x7f) {
      // One-byte encoding.
      *utf8_out++ = static_cast<char>(unit);
      continue;
    }

    if (unit <= 0x07ff) {
      // Two-byte encoding (zero or 0x0080-0x07ff).
      *utf8_out++ = static_cast<char>((unit >> 6) | 0xc0);
      *utf8_out++ = static_cast<char>((unit & 0x3f) | 0x80);
      continue;
    }

    // A leading surrogate is merged with the unit after it only when that unit exists and is
    // a trailing surrogate. Unpaired surrogates can occur as part of "normal" operation and
    // fall through to the three-byte encoding below.
    const bool is_leading = unit >= 0xd800 && unit <= 0xdbff;
    if (is_leading && utf16_in != end) {
      const uint16_t next = *utf16_in;
      if (next >= 0xdc00 && next <= 0xdfff) {
        ++utf16_in;
        const uint32_t code_point =
            0x10000u + (static_cast<uint32_t>(unit - 0xd800) << 10) + (next - 0xdc00);
        *utf8_out++ = static_cast<char>((code_point >> 18) | 0xf0);
        *utf8_out++ = static_cast<char>(((code_point >> 12) & 0x3f) | 0x80);
        *utf8_out++ = static_cast<char>(((code_point >> 6) & 0x3f) | 0x80);
        *utf8_out++ = static_cast<char>((code_point & 0x3f) | 0x80);
        continue;
      }
    }

    // Three-byte encoding.
    *utf8_out++ = static_cast<char>((unit >> 12) | 0xe0);
    *utf8_out++ = static_cast<char>(((unit >> 6) & 0x3f) | 0x80);
    *utf8_out++ = static_cast<char>((unit & 0x3f) | 0x80);
  }
}
308
309// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
310
// Splits `code_point` into a UTF-16 surrogate pair: `first` receives the leading half and
// `second` the trailing half. Callers in this file only pass code points above 0xffff.
static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
  const uint32_t upper_bits = code_point >> 10;
  const uint32_t lower_bits = code_point & 0x03ff;
  // 0xd7c0 == 0xd800 - (0x10000 >> 10), which folds the 0x10000 offset into the base.
  first = static_cast<uint16_t>(upper_bits + 0xd7c0);
  second = static_cast<uint16_t>(lower_bits + 0xdc00);
}
315
Chuck Liao1b9d4422021-07-12 01:25:23 +0000316static void testConversions(uint16_t *buf, int char_count) {
317 char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
318 uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
319 int byte_count_test, byte_count_reference;
320 int char_count_test, char_count_reference;
321
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300322 // Calculate the number of utf-8 bytes for the utf-16 chars.
Chuck Liao1b9d4422021-07-12 01:25:23 +0000323 byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
324 byte_count_test = CountUtf8Bytes(buf, char_count);
325 EXPECT_EQ(byte_count_reference, byte_count_test);
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300326
327 // Convert the utf-16 string to utf-8 bytes.
328 ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
329 ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
Chuck Liao1b9d4422021-07-12 01:25:23 +0000330 for (int i = 0; i < byte_count_test; ++i) {
331 EXPECT_EQ(bytes_reference[i], bytes_test[i]);
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300332 }
333
334 // Calculate the number of utf-16 chars from the utf-8 bytes.
335 bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
Chuck Liao1b9d4422021-07-12 01:25:23 +0000336 char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
337 char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
338 EXPECT_EQ(char_count, char_count_reference);
339 EXPECT_EQ(char_count, char_count_test);
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300340
341 // Convert the utf-8 bytes back to utf-16 chars.
342 // Does not need copied _reference version of the function because the original
343 // function with the old API is retained for debug/testing code.
344 ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
345 ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
Chuck Liao1b9d4422021-07-12 01:25:23 +0000346 for (int i = 0; i < char_count_test; ++i) {
347 EXPECT_EQ(buf[i], out_buf_reference[i]);
348 EXPECT_EQ(buf[i], out_buf_test[i]);
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300349 }
350}
351
352TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
353 for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
Andreas Gampe4464a3e2016-03-03 20:15:47 -0800354 uint16_t buf[4] = { 0 };
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300355 if (codePoint <= 0xffff) {
356 if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
357 // According to the Unicode standard, no character will ever
Roland Levillain91d65e02016-01-19 15:59:16 +0000358 // be assigned to these code points, and they cannot be encoded
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300359 // into either utf-16 or utf-8.
360 continue;
361 }
362 buf[0] = 'h';
363 buf[1] = codePoint;
364 buf[2] = 'e';
365 testConversions(buf, 2);
366 testConversions(buf, 3);
367 testConversions(buf + 1, 1);
368 testConversions(buf + 1, 2);
369 } else {
370 buf[0] = 'h';
371 codePointToSurrogatePair(codePoint, buf[1], buf[2]);
372 buf[3] = 'e';
373 testConversions(buf, 2);
374 testConversions(buf, 3);
375 testConversions(buf, 4);
376 testConversions(buf + 1, 1);
377 testConversions(buf + 1, 2);
378 testConversions(buf + 1, 3);
379 }
380 }
381}
382
Vladimir Markoca0f2dc2018-12-10 12:14:36 +0000383TEST_F(UtfTest, NonAscii) {
384 const char kNonAsciiCharacter = '\x80';
385 const char input[] = { kNonAsciiCharacter, '\0' };
386 uint32_t hash = ComputeModifiedUtf8Hash(input);
387 EXPECT_EQ(static_cast<uint8_t>(kNonAsciiCharacter), hash);
388}
389
Vladimir Markof1d973d2019-03-19 13:38:34 +0000390TEST_F(UtfTest, PrintableStringUtf8) {
391 // Note: This is UTF-8, not Modified-UTF-8.
392 const uint8_t kTestSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 };
393 const char* start = reinterpret_cast<const char*>(kTestSequence);
394 const char* ptr = start;
395 uint32_t pair = GetUtf16FromUtf8(&ptr);
396 ASSERT_EQ(*ptr, '\0');
397 uint16_t leading = GetLeadingUtf16Char(pair);
398 uint16_t trailing = GetTrailingUtf16Char(pair);
399 ASSERT_NE(0u, trailing);
400
401 std::string expected = android::base::StringPrintf("\"\\u%04x\\u%04x\"",
402 static_cast<unsigned>(leading),
403 static_cast<unsigned>(trailing));
404 std::string printable = PrintableString(start);
405 EXPECT_EQ(expected, printable);
406}
407
Narayan Kamatha5afcfc2015-01-29 20:06:46 +0000408} // namespace art