Blame - libdexfile/dex/utf_test.cc - SHIFTPHONES/android_art

blob: 919259e4d30e3f5854713b7c462eb321e084a117 [file] [log] [blame]

Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2015 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "utf.h"
				18
Vladimir Marko	e3bbc3f	2015-11-25 11:10:20 +0000	[diff] [blame]	19	#include <map>
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	20	#include <vector>
				21
Vladimir Marko	f1d973d	2019-03-19 13:38:34 +0000	[diff] [blame]	22	#include <android-base/stringprintf.h>
				23
David Sehr	e112340	2018-02-01 02:46:18 -0800	[diff] [blame]	24	#include "gtest/gtest.h"
				25	#include "utf-inl.h"
				26
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	27	namespace art {
				28
David Sehr	e112340	2018-02-01 02:46:18 -0800	[diff] [blame]	29	class UtfTest : public testing::Test {};
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	30
				31	TEST_F(UtfTest, GetLeadingUtf16Char) {
				32	EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
				33	}
				34
				35	TEST_F(UtfTest, GetTrailingUtf16Char) {
				36	EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
				37	EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
				38	}
				39
				// Expects that pointer `end` lies exactly `expected` bytes past pointer `start`.
				// The definition deliberately has no trailing semicolon: the call site supplies
				// it, so each use expands to a single statement.
				#define EXPECT_ARRAY_POSITION(expected, end, start) \
				  EXPECT_EQ(static_cast<uintptr_t>(expected), \
				            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
				43
				// Test input: a one-, two-, three- and four-byte UTF-8 sequence back to back,
				// followed by a zero terminator.
				static const uint8_t kAllSequences[] = {
				    0x24,                    // 1 byte
				    0xc2, 0xa2,              // 2 bytes
				    0xe2, 0x82, 0xac,        // 3 bytes
				    0xf0, 0x9f, 0x8f, 0xa0,  // 4 bytes
				    0x00                     // terminator
				};
				52
				// Test input: the surrogate pair for code point U+10400, with each half encoded
				// separately as a three-byte sequence, followed by a zero terminator.
				static const uint8_t kSurrogateEncoding[] = {
				    0xed, 0xa0, 0x81,  // 0xd801
				    0xed, 0xb0, 0x80,  // 0xdc00
				    0x00               // terminator
				};
				60
				61	TEST_F(UtfTest, GetUtf16FromUtf8) {
				62	const char* const start = reinterpret_cast<const char*>(kAllSequences);
				63	const char* ptr = start;
				64	uint32_t pair = 0;
				65
				66	// Single byte sequence.
				67	pair = GetUtf16FromUtf8(&ptr);
				68	EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
				69	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				70	EXPECT_ARRAY_POSITION(1, ptr, start);
				71
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	72	// Two byte sequence.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	73	pair = GetUtf16FromUtf8(&ptr);
				74	EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
				75	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				76	EXPECT_ARRAY_POSITION(3, ptr, start);
				77
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	78	// Three byte sequence.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	79	pair = GetUtf16FromUtf8(&ptr);
				80	EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
				81	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				82	EXPECT_ARRAY_POSITION(6, ptr, start);
				83
				84	// Four byte sequence
				85	pair = GetUtf16FromUtf8(&ptr);
				86	EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
				87	EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
				88	EXPECT_ARRAY_POSITION(10, ptr, start);
				89
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	90	// Null terminator.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	91	pair = GetUtf16FromUtf8(&ptr);
				92	EXPECT_EQ(0, GetLeadingUtf16Char(pair));
				93	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				94	EXPECT_ARRAY_POSITION(11, ptr, start);
				95	}
				96
				97	TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
				98	const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
				99	const char* ptr = start;
				100	uint32_t pair = 0;
				101
				102	pair = GetUtf16FromUtf8(&ptr);
				103	EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
				104	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				105	EXPECT_ARRAY_POSITION(3, ptr, start);
				106
				107	pair = GetUtf16FromUtf8(&ptr);
				108	EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
				109	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				110	EXPECT_ARRAY_POSITION(6, ptr, start);
				111	}
				112
				113	TEST_F(UtfTest, CountModifiedUtf8Chars) {
				114	EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
				115	EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
				116	}
				117
Andreas Gampe	ca620d7	2016-11-08 08:09:33 -0800	[diff] [blame]	118	static void AssertConversion(const std::vector<uint16_t>& input,
				119	const std::vector<uint8_t>& expected) {
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	120	ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	121
				122	std::vector<uint8_t> output(expected.size());
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	123	ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
				124	&input[0], input.size());
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	125	EXPECT_EQ(expected, output);
				126	}
				127
				128	TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	129	// Surrogate pairs will be converted into 4 byte sequences.
				130	AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	131
				132	// Three byte encodings that are below & above the leading surrogate
				133	// range respectively.
				134	AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
				135	AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
				136	// Two byte encoding.
				137	AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
				138
				139	// Two byte special case : 0 must use an overlong encoding.
				140	AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
				141
				142	// One byte encoding.
				143	AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
				144
				145	AssertConversion({
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	146	0xd802, 0xdc02, // Surrogate pair.
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	147	0xdef0, 0xdcff, // Three byte encodings.
				148	0x0101, 0x0000, // Two byte encodings.
				149	'p' , 'p' // One byte encoding.
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	150	}, {
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	151	0xf0, 0x90, 0xa0, 0x82,
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	152	0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
				153	0xc4, 0x81, 0xc0, 0x80,
				154	0x70, 0x70
				155	});
				156	}
				157
				158	TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
				159	// Unpaired trailing surrogate at the end of input.
				160	AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
				161	// Unpaired (or incorrectly paired) surrogates in the middle of the input.
Vladimir Marko	e3bbc3f	2015-11-25 11:10:20 +0000	[diff] [blame]	162	const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes {
				163	{{ 'h' }, { 'h' }},
				164	{{ 0 }, { 0xc0, 0x80 }},
				165	{{ 0x81 }, { 0xc2, 0x81 }},
				166	{{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
				167	};
				168	const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes {
				169	{{ 'e' }, { 'e' }},
				170	{{ 0 }, { 0xc0, 0x80 }},
				171	{{ 0x7ff }, { 0xdf, 0xbf }},
				172	{{ 0xffff }, { 0xef, 0xbf, 0xbf }},
				173	};
				174	const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests {
				175	{{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
				176	{{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
				177	{{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
				178	{{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
				179	};
				180	for (const auto& prefix : prefixes) {
				181	const std::vector<uint16_t>& prefix_in = prefix.first;
				182	const std::vector<uint8_t>& prefix_out = prefix.second;
				183	for (const auto& test : tests) {
				184	const std::vector<uint16_t>& test_in = test.first;
				185	const std::vector<uint8_t>& test_out = test.second;
				186	for (const auto& suffix : suffixes) {
				187	const std::vector<uint16_t>& suffix_in = suffix.first;
				188	const std::vector<uint8_t>& suffix_out = suffix.second;
				189	std::vector<uint16_t> in = prefix_in;
				190	in.insert(in.end(), test_in.begin(), test_in.end());
				191	in.insert(in.end(), suffix_in.begin(), suffix_in.end());
				192	std::vector<uint8_t> out = prefix_out;
				193	out.insert(out.end(), test_out.begin(), test_out.end());
				194	out.insert(out.end(), suffix_out.begin(), suffix_out.end());
				195	AssertConversion(in, out);
				196	}
				197	}
				198	}
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	199	}
				200
// Old versions of functions, here to compare answers with optimized versions.

// Counts the UTF-16 units that the null-terminated string `utf8` decodes to. Only the
// first byte of each sequence is inspected; the continuation bytes are skipped without
// being read, so the input is assumed to be well formed.
size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t units = 0;
  for (int lead = *utf8++; lead != '\0'; lead = *utf8++) {
    size_t continuation_bytes;
    size_t produced;
    if ((lead & 0x80) == 0) {
      // One-byte encoding.
      continuation_bytes = 0;
      produced = 1;
    } else if ((lead & 0x20) == 0) {
      // Two-byte encoding.
      continuation_bytes = 1;
      produced = 1;
    } else if ((lead & 0x10) == 0) {
      // Three-byte encoding.
      continuation_bytes = 2;
      produced = 1;
    } else {
      // Four-byte encoding: becomes a surrogate pair, i.e. two units.
      continuation_bytes = 3;
      produced = 2;
    }
    utf8 += continuation_bytes;
    units += produced;
  }
  return units;
}
				231
// Reference byte counter: returns how many bytes the `char_count` UTF-16 units at
// `chars` occupy once encoded. 1..0x7f take one byte; 0 and 0x80..0x7ff take two; a
// 0xd800..0xdbff unit immediately followed by a 0xdc00..0xdfff unit takes four for
// the pair; everything else takes three.
static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
  size_t total = 0;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = chars[i];
    if (unit >= 1 && unit <= 0x7f) {
      total += 1;
      continue;
    }
    if (unit <= 0x7ff) {
      // Either 0 (always written as two bytes) or 0x80..0x7ff.
      total += 2;
      continue;
    }

    const bool is_first_half = unit >= 0xd800 && unit <= 0xdbff;
    const bool has_next = i + 1 < char_count;
    if (is_first_half && has_next && chars[i + 1] >= 0xdc00 && chars[i + 1] <= 0xdfff) {
      // Properly paired surrogate: both units are consumed as one 4 byte sequence.
      total += 4;
      ++i;
      continue;
    }

    // Any other unit above 0x7ff, including an unpaired surrogate (in the middle or at
    // the very end of the input), is a 3 byte sequence.
    total += 3;
  }
  return total;
}
				265
				// Reference converter: encodes the `char_count` UTF-16 units at `utf16_in` and
				// writes the resulting bytes to `utf8_out`. No terminator is written and the
				// output size is not checked, so the caller must supply a large enough buffer.
				static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
				                                                 size_t char_count) {
				  while (char_count--) {
				    const uint16_t ch = *utf16_in++;
				    if (ch > 0 && ch <= 0x7f) {
				      *utf8_out++ = ch;
				    } else {
				      // Char_count == 0 here implies we've encountered an unpaired
				      // surrogate and we have no choice but to encode it as 3-byte UTF
				      // sequence. Note that unpaired surrogates can occur as a part of
				      // "normal" operation.
				      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
				        const uint16_t ch2 = *utf16_in;

				        // Check if the other half of the pair is within the expected
				        // range. If it isn't, we will have to emit both "halves" as
				        // separate 3 byte sequences.
				        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				          utf16_in++;
				          char_count--;
				          // Combine both halves into the code point they represent.
				          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
				          *utf8_out++ = (code_point >> 18) | 0xf0;
				          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
				          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
				          *utf8_out++ = (code_point & 0x3f) | 0x80;
				          continue;
				        }
				      }

				      if (ch > 0x07ff) {
				        // Three byte encoding.
				        *utf8_out++ = (ch >> 12) | 0xe0;
				        *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
				        *utf8_out++ = (ch & 0x3f) | 0x80;
				      } else /*(ch > 0x7f || ch == 0)*/ {
				        // Two byte encoding; this is also how a zero unit is written.
				        *utf8_out++ = (ch >> 6) | 0xc0;
				        *utf8_out++ = (ch & 0x3f) | 0x80;
				      }
				    }
				  }
				}
				308
				309	// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
				310
				// Splits `code_point` into two UTF-16 units: `first` receives the unit derived
				// from the bits above the low ten, `second` the unit derived from the low ten bits.
				static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
				  const uint32_t upper_bits = code_point >> 10;
				  const uint32_t lower_bits = code_point & 0x03ff;
				  first = static_cast<uint16_t>(upper_bits + 0xd7c0);
				  second = static_cast<uint16_t>(lower_bits + 0xdc00);
				}
				315
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	316	static void testConversions(uint16_t *buf, int char_count) {
				317	char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
				318	uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
				319	int byte_count_test, byte_count_reference;
				320	int char_count_test, char_count_reference;
				321
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	322	// Calculate the number of utf-8 bytes for the utf-16 chars.
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	323	byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
				324	byte_count_test = CountUtf8Bytes(buf, char_count);
				325	EXPECT_EQ(byte_count_reference, byte_count_test);
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	326
				327	// Convert the utf-16 string to utf-8 bytes.
				328	ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
				329	ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	330	for (int i = 0; i < byte_count_test; ++i) {
				331	EXPECT_EQ(bytes_reference[i], bytes_test[i]);
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	332	}
				333
				334	// Calculate the number of utf-16 chars from the utf-8 bytes.
				335	bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	336	char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
				337	char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
				338	EXPECT_EQ(char_count, char_count_reference);
				339	EXPECT_EQ(char_count, char_count_test);
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	340
				341	// Convert the utf-8 bytes back to utf-16 chars.
				342	// Does not need copied _reference version of the function because the original
				343	// function with the old API is retained for debug/testing code.
				344	ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
				345	ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	346	for (int i = 0; i < char_count_test; ++i) {
				347	EXPECT_EQ(buf[i], out_buf_reference[i]);
				348	EXPECT_EQ(buf[i], out_buf_test[i]);
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	349	}
				350	}
				351
				352	TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
				353	for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
Andreas Gampe	4464a3e	2016-03-03 20:15:47 -0800	[diff] [blame]	354	uint16_t buf[4] = { 0 };
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	355	if (codePoint <= 0xffff) {
				356	if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
				357	// According to the Unicode standard, no character will ever
Roland Levillain	91d65e0	2016-01-19 15:59:16 +0000	[diff] [blame]	358	// be assigned to these code points, and they cannot be encoded
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	359	// into either utf-16 or utf-8.
				360	continue;
				361	}
				362	buf[0] = 'h';
				363	buf[1] = codePoint;
				364	buf[2] = 'e';
				365	testConversions(buf, 2);
				366	testConversions(buf, 3);
				367	testConversions(buf + 1, 1);
				368	testConversions(buf + 1, 2);
				369	} else {
				370	buf[0] = 'h';
				371	codePointToSurrogatePair(codePoint, buf[1], buf[2]);
				372	buf[3] = 'e';
				373	testConversions(buf, 2);
				374	testConversions(buf, 3);
				375	testConversions(buf, 4);
				376	testConversions(buf + 1, 1);
				377	testConversions(buf + 1, 2);
				378	testConversions(buf + 1, 3);
				379	}
				380	}
				381	}
				382
Vladimir Marko	ca0f2dc	2018-12-10 12:14:36 +0000	[diff] [blame]	383	TEST_F(UtfTest, NonAscii) {
				384	const char kNonAsciiCharacter = '\x80';
				385	const char input[] = { kNonAsciiCharacter, '\0' };
				386	uint32_t hash = ComputeModifiedUtf8Hash(input);
				387	EXPECT_EQ(static_cast<uint8_t>(kNonAsciiCharacter), hash);
				388	}
				389
Vladimir Marko	f1d973d	2019-03-19 13:38:34 +0000	[diff] [blame]	390	TEST_F(UtfTest, PrintableStringUtf8) {
				391	// Note: This is UTF-8, not Modified-UTF-8.
				392	const uint8_t kTestSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 };
				393	const char* start = reinterpret_cast<const char*>(kTestSequence);
				394	const char* ptr = start;
				395	uint32_t pair = GetUtf16FromUtf8(&ptr);
				396	ASSERT_EQ(*ptr, '\0');
				397	uint16_t leading = GetLeadingUtf16Char(pair);
				398	uint16_t trailing = GetTrailingUtf16Char(pair);
				399	ASSERT_NE(0u, trailing);
				400
				401	std::string expected = android::base::StringPrintf("\"\\u%04x\\u%04x\"",
				402	static_cast<unsigned>(leading),
				403	static_cast<unsigned>(trailing));
				404	std::string printable = PrintableString(start);
				405	EXPECT_EQ(expected, printable);
				406	}
				407
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	408	} // namespace art