Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2015 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "utf.h" |
| 18 | |
Vladimir Marko | e3bbc3f | 2015-11-25 11:10:20 +0000 | [diff] [blame] | 19 | #include <map> |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 20 | #include <vector> |
| 21 | |
Vladimir Marko | f1d973d | 2019-03-19 13:38:34 +0000 | [diff] [blame] | 22 | #include <android-base/stringprintf.h> |
| 23 | |
David Sehr | e112340 | 2018-02-01 02:46:18 -0800 | [diff] [blame] | 24 | #include "gtest/gtest.h" |
| 25 | #include "utf-inl.h" |
| 26 | |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 27 | namespace art { |
| 28 | |
David Sehr | e112340 | 2018-02-01 02:46:18 -0800 | [diff] [blame] | 29 | class UtfTest : public testing::Test {}; |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 30 | |
| 31 | TEST_F(UtfTest, GetLeadingUtf16Char) { |
| 32 | EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff)); |
| 33 | } |
| 34 | |
| 35 | TEST_F(UtfTest, GetTrailingUtf16Char) { |
| 36 | EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee)); |
| 37 | EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa)); |
| 38 | } |
| 39 | |
// Expects that `end` lies exactly `expected` bytes past `start`.
// There is deliberately no trailing semicolon: the call site supplies it, so a
// use expands to exactly one statement and may be followed by `<< "message"`.
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
| 43 | |
// UTF-8 test input holding one sequence of each encoded length (one, two,
// three and four bytes), followed by a terminating zero byte.
static const uint8_t kAllSequences[] = {
    0x24,                    // One byte:    U+0024.
    0xc2, 0xa2,              // Two bytes:   U+00A2.
    0xe2, 0x82, 0xac,        // Three bytes: U+20AC.
    0xf0, 0x9f, 0x8f, 0xa0,  // Four bytes:  U+1F3E0.
    0x00,                    // Terminator.
};
| 52 | |
// The surrogate pair for code point U+10400, with each surrogate encoded as
// its own three byte UTF-8 sequence, followed by a terminating zero byte.
static const uint8_t kSurrogateEncoding[] = {
    0xed, 0xa0, 0x81,  // 0xd801, the leading surrogate.
    0xed, 0xb0, 0x80,  // 0xdc00, the trailing surrogate.
    0x00,              // Terminator.
};
| 60 | |
| 61 | TEST_F(UtfTest, GetUtf16FromUtf8) { |
| 62 | const char* const start = reinterpret_cast<const char*>(kAllSequences); |
| 63 | const char* ptr = start; |
| 64 | uint32_t pair = 0; |
| 65 | |
| 66 | // Single byte sequence. |
| 67 | pair = GetUtf16FromUtf8(&ptr); |
| 68 | EXPECT_EQ(0x24, GetLeadingUtf16Char(pair)); |
| 69 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 70 | EXPECT_ARRAY_POSITION(1, ptr, start); |
| 71 | |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 72 | // Two byte sequence. |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 73 | pair = GetUtf16FromUtf8(&ptr); |
| 74 | EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); |
| 75 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 76 | EXPECT_ARRAY_POSITION(3, ptr, start); |
| 77 | |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 78 | // Three byte sequence. |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 79 | pair = GetUtf16FromUtf8(&ptr); |
| 80 | EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); |
| 81 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 82 | EXPECT_ARRAY_POSITION(6, ptr, start); |
| 83 | |
| 84 | // Four byte sequence |
| 85 | pair = GetUtf16FromUtf8(&ptr); |
| 86 | EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair)); |
| 87 | EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); |
| 88 | EXPECT_ARRAY_POSITION(10, ptr, start); |
| 89 | |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 90 | // Null terminator. |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 91 | pair = GetUtf16FromUtf8(&ptr); |
| 92 | EXPECT_EQ(0, GetLeadingUtf16Char(pair)); |
| 93 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 94 | EXPECT_ARRAY_POSITION(11, ptr, start); |
| 95 | } |
| 96 | |
| 97 | TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) { |
| 98 | const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding); |
| 99 | const char* ptr = start; |
| 100 | uint32_t pair = 0; |
| 101 | |
| 102 | pair = GetUtf16FromUtf8(&ptr); |
| 103 | EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair)); |
| 104 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 105 | EXPECT_ARRAY_POSITION(3, ptr, start); |
| 106 | |
| 107 | pair = GetUtf16FromUtf8(&ptr); |
| 108 | EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair)); |
| 109 | EXPECT_EQ(0, GetTrailingUtf16Char(pair)); |
| 110 | EXPECT_ARRAY_POSITION(6, ptr, start); |
| 111 | } |
| 112 | |
| 113 | TEST_F(UtfTest, CountModifiedUtf8Chars) { |
| 114 | EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences))); |
| 115 | EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); |
| 116 | } |
| 117 | |
Andreas Gampe | ca620d7 | 2016-11-08 08:09:33 -0800 | [diff] [blame] | 118 | static void AssertConversion(const std::vector<uint16_t>& input, |
| 119 | const std::vector<uint8_t>& expected) { |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 120 | ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 121 | |
| 122 | std::vector<uint8_t> output(expected.size()); |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 123 | ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), |
| 124 | &input[0], input.size()); |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 125 | EXPECT_EQ(expected, output); |
| 126 | } |
| 127 | |
| 128 | TEST_F(UtfTest, CountAndConvertUtf8Bytes) { |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 129 | // Surrogate pairs will be converted into 4 byte sequences. |
| 130 | AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 }); |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 131 | |
| 132 | // Three byte encodings that are below & above the leading surrogate |
| 133 | // range respectively. |
| 134 | AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 }); |
| 135 | AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf }); |
| 136 | // Two byte encoding. |
| 137 | AssertConversion({ 0x0101 }, { 0xc4, 0x81 }); |
| 138 | |
| 139 | // Two byte special case : 0 must use an overlong encoding. |
| 140 | AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 }); |
| 141 | |
| 142 | // One byte encoding. |
| 143 | AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); |
| 144 | |
| 145 | AssertConversion({ |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 146 | 0xd802, 0xdc02, // Surrogate pair. |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 147 | 0xdef0, 0xdcff, // Three byte encodings. |
| 148 | 0x0101, 0x0000, // Two byte encodings. |
| 149 | 'p' , 'p' // One byte encoding. |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 150 | }, { |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 151 | 0xf0, 0x90, 0xa0, 0x82, |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 152 | 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, |
| 153 | 0xc4, 0x81, 0xc0, 0x80, |
| 154 | 0x70, 0x70 |
| 155 | }); |
| 156 | } |
| 157 | |
| 158 | TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { |
| 159 | // Unpaired trailing surrogate at the end of input. |
| 160 | AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 }); |
| 161 | // Unpaired (or incorrectly paired) surrogates in the middle of the input. |
Vladimir Marko | e3bbc3f | 2015-11-25 11:10:20 +0000 | [diff] [blame] | 162 | const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes { |
| 163 | {{ 'h' }, { 'h' }}, |
| 164 | {{ 0 }, { 0xc0, 0x80 }}, |
| 165 | {{ 0x81 }, { 0xc2, 0x81 }}, |
| 166 | {{ 0x801 }, { 0xe0, 0xa0, 0x81 }}, |
| 167 | }; |
| 168 | const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes { |
| 169 | {{ 'e' }, { 'e' }}, |
| 170 | {{ 0 }, { 0xc0, 0x80 }}, |
| 171 | {{ 0x7ff }, { 0xdf, 0xbf }}, |
| 172 | {{ 0xffff }, { 0xef, 0xbf, 0xbf }}, |
| 173 | }; |
| 174 | const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests { |
| 175 | {{ 0xd801 }, { 0xed, 0xa0, 0x81 }}, |
| 176 | {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }}, |
| 177 | {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }}, |
| 178 | {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }}, |
| 179 | }; |
| 180 | for (const auto& prefix : prefixes) { |
| 181 | const std::vector<uint16_t>& prefix_in = prefix.first; |
| 182 | const std::vector<uint8_t>& prefix_out = prefix.second; |
| 183 | for (const auto& test : tests) { |
| 184 | const std::vector<uint16_t>& test_in = test.first; |
| 185 | const std::vector<uint8_t>& test_out = test.second; |
| 186 | for (const auto& suffix : suffixes) { |
| 187 | const std::vector<uint16_t>& suffix_in = suffix.first; |
| 188 | const std::vector<uint8_t>& suffix_out = suffix.second; |
| 189 | std::vector<uint16_t> in = prefix_in; |
| 190 | in.insert(in.end(), test_in.begin(), test_in.end()); |
| 191 | in.insert(in.end(), suffix_in.begin(), suffix_in.end()); |
| 192 | std::vector<uint8_t> out = prefix_out; |
| 193 | out.insert(out.end(), test_out.begin(), test_out.end()); |
| 194 | out.insert(out.end(), suffix_out.begin(), suffix_out.end()); |
| 195 | AssertConversion(in, out); |
| 196 | } |
| 197 | } |
| 198 | } |
Narayan Kamath | e16dad1 | 2015-02-13 11:49:22 +0000 | [diff] [blame] | 199 | } |
| 200 | |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 201 | // Old versions of functions, here to compare answers with optimized versions. |
| 202 | |
// Reference implementation: returns the number of UTF-16 code units encoded by
// the NUL-terminated (modified) UTF-8 string `utf8`.
//
// The sequence length is taken from the lead byte alone. One to three byte
// sequences count as one unit; a four byte sequence counts as two units because
// it decodes to a surrogate pair.
//
// Continuation bytes are not validated, but a NUL where a continuation byte is
// expected is treated as the end of a truncated string: counting stops there
// instead of stepping over the terminator and reading past the buffer.
size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t len = 0;
  while (*utf8 != '\0') {
    const int lead = *utf8++;
    ++len;

    // Number of continuation bytes announced by the lead byte.
    int continuation_bytes;
    if ((lead & 0x80) == 0) {
      continuation_bytes = 0;  // one-byte encoding
    } else if ((lead & 0x20) == 0) {
      continuation_bytes = 1;  // two-byte encoding
    } else if ((lead & 0x10) == 0) {
      continuation_bytes = 2;  // three-byte encoding
    } else {
      continuation_bytes = 3;  // four-byte encoding
    }

    for (int i = 0; i < continuation_bytes; ++i) {
      if (*utf8 == '\0') {
        // Truncated sequence: the lead byte has been counted, nothing follows.
        return len;
      }
      ++utf8;
    }

    if (continuation_bytes == 3) {
      // A complete four-byte encoding is converted into a surrogate pair.
      ++len;
    }
  }
  return len;
}
| 231 | |
// Reference implementation: returns the number of bytes needed to encode the
// `char_count` UTF-16 units at `chars`.
//  - 0x0001..0x007f take one byte.
//  - 0x0000 and 0x0080..0x07ff take two bytes.
//  - A leading surrogate (0xd800..0xdbff) immediately followed by a trailing
//    surrogate (0xdc00..0xdfff) takes four bytes for the pair.
//  - Everything else, including unpaired surrogates, takes three bytes.
static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
  size_t total = 0;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = chars[i];
    if (unit != 0 && unit < 0x80) {
      total += 1;
      continue;
    }

    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    if (is_leading_surrogate && i + 1 < char_count) {
      const uint16_t next = chars[i + 1];
      if (next >= 0xdc00 && next <= 0xdfff) {
        // Properly paired: both units are covered by one four byte sequence.
        total += 4;
        ++i;
        continue;
      }
    }

    // An unpaired surrogate is above 0x7ff and so lands in the three byte case.
    total += (unit > 0x7ff) ? 3 : 2;
  }
  return total;
}
| 265 | |
// Reference implementation: encodes the `char_count` UTF-16 units at `utf16_in`
// into `utf8_out`. No terminator is written and the output size is not checked;
// the caller must supply a buffer of at least CountUtf8Bytes_reference() bytes.
static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
                                                 size_t char_count) {
  const uint16_t* const end = utf16_in + char_count;
  while (utf16_in != end) {
    const uint16_t unit = *utf16_in++;

    // One byte encoding. Zero is excluded: it takes the two byte form below.
    if (unit != 0 && unit < 0x80) {
      *utf8_out++ = static_cast<char>(unit);
      continue;
    }

    // A leading surrogate with nothing after it is unpaired by definition and
    // falls through to the three byte encoding. Note that unpaired surrogates
    // can occur as a part of "normal" operation.
    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    if (is_leading_surrogate && utf16_in != end) {
      const uint16_t next = *utf16_in;
      // Only a unit from the trailing range completes the pair; otherwise both
      // "halves" are emitted as separate three byte sequences.
      if (next >= 0xdc00 && next <= 0xdfff) {
        ++utf16_in;
        const uint32_t code_point = 0x10000 + ((unit - 0xd800) << 10) + (next - 0xdc00);
        *utf8_out++ = static_cast<char>((code_point >> 18) | 0xf0);
        *utf8_out++ = static_cast<char>(((code_point >> 12) & 0x3f) | 0x80);
        *utf8_out++ = static_cast<char>(((code_point >> 6) & 0x3f) | 0x80);
        *utf8_out++ = static_cast<char>((code_point & 0x3f) | 0x80);
        continue;
      }
    }

    if (unit > 0x07ff) {
      // Three byte encoding.
      *utf8_out++ = static_cast<char>((unit >> 12) | 0xe0);
      *utf8_out++ = static_cast<char>(((unit >> 6) & 0x3f) | 0x80);
      *utf8_out++ = static_cast<char>((unit & 0x3f) | 0x80);
    } else {
      // Two byte encoding: 0x0080..0x07ff, and the overlong form of zero.
      *utf8_out++ = static_cast<char>((unit >> 6) | 0xc0);
      *utf8_out++ = static_cast<char>((unit & 0x3f) | 0x80);
    }
  }
}
| 308 | |
| 309 | // Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again. |
| 310 | |
// Splits a supplementary code point into its UTF-16 surrogate pair: `first`
// receives the leading surrogate and `second` the trailing one.
static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
  // 0xd7c0 is 0xd800 - (0x10000 >> 10): it folds the 0x10000 offset into the base.
  const uint32_t high_bits = code_point >> 10;
  const uint32_t low_bits = code_point & 0x03ff;
  first = static_cast<uint16_t>(0xd7c0 + high_bits);
  second = static_cast<uint16_t>(0xdc00 + low_bits);
}
| 315 | |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 316 | static void testConversions(uint16_t *buf, int char_count) { |
| 317 | char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 }; |
| 318 | uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 }; |
| 319 | int byte_count_test, byte_count_reference; |
| 320 | int char_count_test, char_count_reference; |
| 321 | |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 322 | // Calculate the number of utf-8 bytes for the utf-16 chars. |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 323 | byte_count_reference = CountUtf8Bytes_reference(buf, char_count); |
| 324 | byte_count_test = CountUtf8Bytes(buf, char_count); |
| 325 | EXPECT_EQ(byte_count_reference, byte_count_test); |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 326 | |
| 327 | // Convert the utf-16 string to utf-8 bytes. |
| 328 | ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count); |
| 329 | ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count); |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 330 | for (int i = 0; i < byte_count_test; ++i) { |
| 331 | EXPECT_EQ(bytes_reference[i], bytes_test[i]); |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 332 | } |
| 333 | |
| 334 | // Calculate the number of utf-16 chars from the utf-8 bytes. |
| 335 | bytes_reference[byte_count_reference] = 0; // Reference function needs null termination. |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 336 | char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); |
| 337 | char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); |
| 338 | EXPECT_EQ(char_count, char_count_reference); |
| 339 | EXPECT_EQ(char_count, char_count_test); |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 340 | |
| 341 | // Convert the utf-8 bytes back to utf-16 chars. |
| 342 | // Does not need copied _reference version of the function because the original |
| 343 | // function with the old API is retained for debug/testing code. |
| 344 | ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference); |
| 345 | ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test); |
Chuck Liao | 1b9d442 | 2021-07-12 01:25:23 +0000 | [diff] [blame] | 346 | for (int i = 0; i < char_count_test; ++i) { |
| 347 | EXPECT_EQ(buf[i], out_buf_reference[i]); |
| 348 | EXPECT_EQ(buf[i], out_buf_test[i]); |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 349 | } |
| 350 | } |
| 351 | |
| 352 | TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) { |
| 353 | for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) { |
Andreas Gampe | 4464a3e | 2016-03-03 20:15:47 -0800 | [diff] [blame] | 354 | uint16_t buf[4] = { 0 }; |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 355 | if (codePoint <= 0xffff) { |
| 356 | if (codePoint >= 0xd800 && codePoint <= 0xdfff) { |
| 357 | // According to the Unicode standard, no character will ever |
Roland Levillain | 91d65e0 | 2016-01-19 15:59:16 +0000 | [diff] [blame] | 358 | // be assigned to these code points, and they cannot be encoded |
Bruce Hoult | 1646d7a | 2015-10-28 15:06:12 +0300 | [diff] [blame] | 359 | // into either utf-16 or utf-8. |
| 360 | continue; |
| 361 | } |
| 362 | buf[0] = 'h'; |
| 363 | buf[1] = codePoint; |
| 364 | buf[2] = 'e'; |
| 365 | testConversions(buf, 2); |
| 366 | testConversions(buf, 3); |
| 367 | testConversions(buf + 1, 1); |
| 368 | testConversions(buf + 1, 2); |
| 369 | } else { |
| 370 | buf[0] = 'h'; |
| 371 | codePointToSurrogatePair(codePoint, buf[1], buf[2]); |
| 372 | buf[3] = 'e'; |
| 373 | testConversions(buf, 2); |
| 374 | testConversions(buf, 3); |
| 375 | testConversions(buf, 4); |
| 376 | testConversions(buf + 1, 1); |
| 377 | testConversions(buf + 1, 2); |
| 378 | testConversions(buf + 1, 3); |
| 379 | } |
| 380 | } |
| 381 | } |
| 382 | |
Vladimir Marko | ca0f2dc | 2018-12-10 12:14:36 +0000 | [diff] [blame] | 383 | TEST_F(UtfTest, NonAscii) { |
| 384 | const char kNonAsciiCharacter = '\x80'; |
| 385 | const char input[] = { kNonAsciiCharacter, '\0' }; |
| 386 | uint32_t hash = ComputeModifiedUtf8Hash(input); |
| 387 | EXPECT_EQ(static_cast<uint8_t>(kNonAsciiCharacter), hash); |
| 388 | } |
| 389 | |
Vladimir Marko | f1d973d | 2019-03-19 13:38:34 +0000 | [diff] [blame] | 390 | TEST_F(UtfTest, PrintableStringUtf8) { |
| 391 | // Note: This is UTF-8, not Modified-UTF-8. |
| 392 | const uint8_t kTestSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 }; |
| 393 | const char* start = reinterpret_cast<const char*>(kTestSequence); |
| 394 | const char* ptr = start; |
| 395 | uint32_t pair = GetUtf16FromUtf8(&ptr); |
| 396 | ASSERT_EQ(*ptr, '\0'); |
| 397 | uint16_t leading = GetLeadingUtf16Char(pair); |
| 398 | uint16_t trailing = GetTrailingUtf16Char(pair); |
| 399 | ASSERT_NE(0u, trailing); |
| 400 | |
| 401 | std::string expected = android::base::StringPrintf("\"\\u%04x\\u%04x\"", |
| 402 | static_cast<unsigned>(leading), |
| 403 | static_cast<unsigned>(trailing)); |
| 404 | std::string printable = PrintableString(start); |
| 405 | EXPECT_EQ(expected, printable); |
| 406 | } |
| 407 | |
Narayan Kamath | a5afcfc | 2015-01-29 20:06:46 +0000 | [diff] [blame] | 408 | } // namespace art |