Fix broken checks in IsValidPartOfMemberNameUtf8Slow.
GetUtf16FromUtf8 returns a surrogate pair only if it encounters
a 4-byte UTF sequence. Three byte UTF sequences will only return the
first or second half of a pair so we need to check for that
explicitly.
bug: 20844537
(cherry picked from commit 3ba8671d60061359fd833f60f7a9dca14878cc0b)
Change-Id: I2e2a4f9f736cd11050a2b634b3bb27b75a0ee0ba
diff --git a/runtime/utf.h b/runtime/utf.h
index dd38afa..7f05248 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -87,9 +87,9 @@
/*
* Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string.
* single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16
- * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use
- * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value
- * of this function.
+ * character (possibly one half of a surrogate) whereas 4-byte UTF-8 sequences
+ * result in a surrogate pair. Use GetLeadingUtf16Char and GetTrailingUtf16Char
+ * to process the return value of this function.
*
* Advances "*utf8_data_in" to the start of the next character.
*
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 650214f..7986cdc 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -827,14 +827,21 @@
*/
const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
-
const uint16_t leading = GetLeadingUtf16Char(pair);
- const uint32_t trailing = GetTrailingUtf16Char(pair);
- if (trailing == 0) {
- // Perform follow-up tests based on the high 8 bits of the
- // lower surrogate.
- switch (leading >> 8) {
+ // We have a surrogate pair resulting from a valid 4 byte UTF sequence.
+ // No further checks are necessary because 4 byte sequences span code
+ // points [U+10000, U+1FFFFF], which are valid codepoints in a dex
+ // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of
+ // the surrogate halves are valid and well formed in this instance.
+ if (GetTrailingUtf16Char(pair) != 0) {
+ return true;
+ }
+
+
+ // We've encountered a one, two or three byte UTF-8 sequence. The
+ // three byte UTF-8 sequence could be one half of a surrogate pair.
+ switch (leading >> 8) {
case 0x00:
// It's only valid if it's above the ISO-8859-1 high space (0xa0).
return (leading > 0x00a0);
@@ -842,9 +849,14 @@
case 0xd9:
case 0xda:
case 0xdb:
- // It looks like a leading surrogate but we didn't find a trailing
- // surrogate if we're here.
- return false;
+ {
+ // We found a three byte sequence encoding one half of a surrogate.
+ // Look for the other half.
+ const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr);
+ const uint16_t trailing = GetLeadingUtf16Char(pair2);
+
+ return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff);
+ }
case 0xdc:
case 0xdd:
case 0xde:
@@ -855,21 +867,19 @@
case 0xff:
// It's in the range that has spaces, controls, and specials.
switch (leading & 0xfff8) {
- case 0x2000:
- case 0x2008:
- case 0x2028:
- case 0xfff0:
- case 0xfff8:
- return false;
+ case 0x2000:
+ case 0x2008:
+ case 0x2028:
+ case 0xfff0:
+ case 0xfff8:
+ return false;
}
- break;
- }
-
- return true;
+ return true;
+ default:
+ return true;
}
- // We have a surrogate pair. Check that trailing surrogate is well formed.
- return (trailing >= 0xdc00 && trailing <= 0xdfff);
+ UNREACHABLE();
}
/* Return whether the pointed-at modified-UTF-8 encoded character is
diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc
index 195de0c..869d305 100644
--- a/runtime/utils_test.cc
+++ b/runtime/utils_test.cc
@@ -521,4 +521,27 @@
EXPECT_GT(NanoTime() - start, MsToNs(1000));
}
+TEST_F(UtilsTest, IsValidDescriptor) {
+ std::vector<uint8_t> descriptor(
+ { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80, ';', 0x00 });
+ EXPECT_TRUE(IsValidDescriptor(reinterpret_cast<char*>(&descriptor[0])));
+
+ std::vector<uint8_t> unpaired_surrogate(
+ { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, ';', 0x00 });
+ EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate[0])));
+
+ std::vector<uint8_t> unpaired_surrogate_at_end(
+ { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, 0x00 });
+ EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate_at_end[0])));
+
+ std::vector<uint8_t> invalid_surrogate(
+ { 'L', 'a', '/', 'b', '$', 0xed, 0xb0, 0x80, ';', 0x00 });
+ EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&invalid_surrogate[0])));
+
+ std::vector<uint8_t> unpaired_surrogate_with_multibyte_sequence(
+ { 'L', 'a', '/', 'b', '$', 0xed, 0xb0, 0x80, 0xf0, 0x9f, 0x8f, 0xa0, ';', 0x00 });
+ EXPECT_FALSE(
+ IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate_with_multibyte_sequence[0])));
+}
+
} // namespace art