Flesh out AllocString
Change-Id: Ie8c1170e71374942eafdcb40775ca2df3cf7bbc7
diff --git a/src/class_linker.cc b/src/class_linker.cc
index 92bb6d4..0a0bb04 100644
--- a/src/class_linker.cc
+++ b/src/class_linker.cc
@@ -137,6 +137,14 @@
sizeof(Method)));
}
+String* ClassLinker::AllocStringFromModifiedUtf8(int32_t utf16_length,  // passed through unchanged to String::AllocFromModifiedUtf8
+ const char* utf8_data_in) {
+ return String::AllocFromModifiedUtf8(java_lang_String_,  // supplies the linker's own java_lang_String_ and char_array_class_ so callers need not pass them
+ char_array_class_,
+ utf16_length,
+ utf8_data_in);
+}
+
Class* ClassLinker::FindClass(const StringPiece& descriptor,
Object* class_loader,
const DexFile* dex_file) {
@@ -1497,10 +1505,9 @@
uint32_t string_idx) {
const DexFile* dex_file = FindDexFile(referring->GetDexCache());
const DexFile::StringId& string_id = dex_file->GetStringId(string_idx);
- const char* string_data = dex_file->GetStringData(string_id);
- String* new_string = String::AllocFromModifiedUtf8(java_lang_String_,
- char_array_class_,
- string_data);
+ int32_t utf16_length = dex_file->GetStringLength(string_id);
+ const char* utf8_data = dex_file->GetStringData(string_id);
+ String* new_string = AllocStringFromModifiedUtf8(utf16_length, utf8_data);
// TODO: intern the new string
referring->GetDexCache()->SetResolvedString(string_idx, new_string);
return new_string;
diff --git a/src/class_linker.h b/src/class_linker.h
index 738979b..46d4b1f 100644
--- a/src/class_linker.h
+++ b/src/class_linker.h
@@ -23,12 +23,15 @@
~ClassLinker() {}
+ // Alloc* convenience functions. They spare callers from passing in
+ // Class* values the ClassLinker already knows, such as
+ // object_array_class_ and java_lang_String_.
DexCache* AllocDexCache();
Class* AllocClass(DexCache* dex_cache);
StaticField* AllocStaticField();
InstanceField* AllocInstanceField();
Method* AllocMethod();
-
+ String* AllocStringFromModifiedUtf8(int32_t utf16_length, const char* utf8_data_in);  // defined in class_linker.cc; forwards to String::AllocFromModifiedUtf8
template <class T>
ObjectArray<T>* AllocObjectArray(size_t length) {
return ObjectArray<T>::Alloc(object_array_class_, length);
diff --git a/src/dex_file.h b/src/dex_file.h
index 63fefae..c8f8afa 100644
--- a/src/dex_file.h
+++ b/src/dex_file.h
@@ -377,7 +377,7 @@
const byte* ptr = base_ + string_id.string_data_off_;
// Skip the uleb128 length.
while (*(ptr++) > 0x7f) /* empty */ ;
- return (const char*) ptr;
+ return reinterpret_cast<const char*>(ptr);
}
// return the UTF-8 encoded string with the specified string_id index
diff --git a/src/object.h b/src/object.h
index 77eb164..0abf444 100644
--- a/src/object.h
+++ b/src/object.h
@@ -607,12 +607,12 @@
}
T* Get(uint32_t i) const {
- DCHECK_LT(i, GetLength());
+ CHECK_LT(i, GetLength());  // index must be below GetLength(); NOTE(review): presumably CHECK, unlike DCHECK, also fires in non-debug builds - confirm against the logging macros
Object* const * data = reinterpret_cast<Object* const *>(GetData());
return down_cast<T*>(data[i]);
}
void Set(uint32_t i, T* object) {
- DCHECK_LT(i, GetLength());
+ CHECK_LT(i, GetLength());  // same bounds check as Get(): rejects i >= GetLength() before the slot is written
T** data = reinterpret_cast<T**>(GetData());
data[i] = object;
}
@@ -992,24 +992,53 @@
length,
sizeof(uint16_t)));
}
+
+ uint16_t* GetChars() {  // writable view of the array payload (GetData()) as 16-bit chars
+ return reinterpret_cast<uint16_t*>(GetData());
+ }
+
+ const uint16_t* GetChars() const {  // read-only overload of GetChars() for const arrays
+ return reinterpret_cast<const uint16_t*>(GetData());
+ }
+
+ uint16_t GetChar(uint32_t i) const {  // returns the 16-bit char at index i
+ CHECK_LT(i, GetLength());  // i must be below GetLength()
+ return GetChars()[i];
+ }
+
+ void SetChar(uint32_t i, uint16_t ch) {  // stores ch at index i
+ CHECK_LT(i, GetLength());  // i must be below GetLength()
+ GetChars()[i] = ch;
+ }
+
private:
CharArray();
};
class String : public Object {
public:
- static String* Alloc(Class* java_lang_String) {
- return down_cast<String*>(Object::Alloc(java_lang_String));
+ static String* AllocFromUtf16(Class* java_lang_String,  // builds a String holding a copy of utf16_length chars read from utf16_data_in
+ Class* char_array,
+ int32_t utf16_length,
+ uint16_t* utf16_data_in) {  // NOTE(review): only read from in this body; could probably be const uint16_t* - confirm with callers
+ String* string = Alloc(java_lang_String, char_array, utf16_length);
+ uint16_t* utf16_data_out = string->array_->GetChars();
+ // TODO use a 16-bit wide memcpy variant (this loop copies; it is not a fill)
+ for (int i = 0; i < utf16_length; i++ ) {
+ utf16_data_out[i] = utf16_data_in[i];
+ }
+ string->hash_code_ = ComputeUtf16Hash(utf16_data_out, utf16_length);  // hash is computed from the copied chars and stored on the object
+ return string;
}
static String* AllocFromModifiedUtf8(Class* java_lang_String,
Class* char_array,
- const char* data) {
- String* string = Alloc(java_lang_String);
- uint32_t count = strlen(data); // TODO
- CharArray* array = CharArray::Alloc(char_array, count);
- string->array_ = array;
- string->count_ = count;
+ int32_t utf16_length,  // sizes the backing CharArray (via Alloc) and the range that gets hashed
+ const char* utf8_data_in) {  // NUL-terminated Modified UTF-8 input
+ String* string = Alloc(java_lang_String, char_array, utf16_length);
+ uint16_t* utf16_data_out = string->array_->GetChars();
+ ConvertModifiedUtf8ToUtf16(utf16_data_out, utf8_data_in);  // decodes up to the NUL terminator without consulting utf16_length, so the input must not decode to more than utf16_length chars
+ string->hash_code_ = ComputeUtf16Hash(utf16_data_out, utf16_length);  // hashes exactly utf16_length chars, whether or not the decoder wrote all of them
return string;
}
@@ -1022,6 +1051,65 @@
uint32_t count_;
+ static String* Alloc(Class* java_lang_String,  // allocates the String object plus a CharArray of utf16_length chars and links the two
+ Class* char_array,
+ int32_t utf16_length) {  // NOTE(review): signed, but stored into the unsigned count_ below; negative values are not rejected here
+ String* string = down_cast<String*>(Object::Alloc(java_lang_String));
+ CharArray* array = CharArray::Alloc(char_array, utf16_length);
+ string->array_ = array;
+ string->count_ = utf16_length;
+ return string;  // contents and hash_code_ are filled in by the AllocFrom* callers
+ }
+
+ // Convert a NUL-terminated Modified UTF-8 string to UTF-16, writing one uint16_t per decoded character.
+ // http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
+ static void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
+ while (*utf8_data_in != '\0') {  // stops only at the NUL terminator; no output bound is checked, so utf16_data_out must be large enough
+ *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);  // the helper advances utf8_data_in past the bytes it consumed
+ }
+ }
+
+ // Retrieve the next UTF-16 character from a UTF-8 string.
+ //
+ // Advances "*utf8_data_in" to the start of the next character.
+ //
+ // WARNING: If a string is corrupted by dropping a '\0' in the middle
+ // of a 3-byte sequence, you can end up overrunning the buffer with
+ // reads (and possibly with the writes if the length was computed and
+ // cached before the damage). For performance reasons, this function
+ // assumes that the string being parsed is known to be valid (e.g., by
+ // already being verified). Most strings we process here are coming
+ // out of dex files or other internal translations, so the only real
+ // risk comes from the JNI NewStringUTF call.
+ static uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
+ uint8_t one = *(*utf8_data_in)++;  // read the lead byte and step the caller's pointer past it
+ if ((one & 0x80) == 0) {  // high bit clear: the byte is the character itself
+ /* one-byte encoding */
+ return one;
+ }
+ /* two- or three-byte encoding */
+ uint8_t two = *(*utf8_data_in)++;  // consumed without checking for '\0' (see WARNING above)
+ if ((one & 0x20) == 0) {  // bit 0x20 clear in the lead byte selects the two-byte form
+ /* two-byte encoding */
+ return ((one & 0x1f) << 6) |  // 5 payload bits from the lead byte, then 6 from the second byte
+ (two & 0x3f);
+ }
+ /* three-byte encoding */
+ uint8_t three = *(*utf8_data_in)++;
+ return ((one & 0x0f) << 12) |  // 4 + 6 + 6 payload bits fill the 16-bit result
+ ((two & 0x3f) << 6) |
+ (three & 0x3f);
+ }
+
+ // The java/lang/String.computeHashCode() algorithm: hash = hash * 31 + ch for each of the string_length chars, starting from 0.
+ static uint32_t ComputeUtf16Hash(const uint16_t* string_data, size_t string_length) {
+ uint32_t hash = 0;  // unsigned 32-bit accumulator, so overflow wraps modulo 2^32
+ while (string_length--) {  // runs exactly string_length times; an empty string hashes to 0
+ hash = hash * 31 + *string_data++;
+ }
+ return hash;
+ }
+
private:
String();
};
diff --git a/src/object_test.cc b/src/object_test.cc
index dc74b8b..969378c 100644
--- a/src/object_test.cc
+++ b/src/object_test.cc
@@ -8,12 +8,35 @@
#include "object.h"
#include "scoped_ptr.h"
+#include <stdint.h>
#include <stdio.h>
#include "gtest/gtest.h"
namespace art {
-class ObjectTest : public RuntimeTest {};
+class ObjectTest : public RuntimeTest {
+ protected:
+  // Allocates a String from utf8_in through the class linker and checks its
+  // length, its decoded UTF-16 contents and its stored hash code.
+  //
+  //   length            - expected number of UTF-16 chars in the result.
+  //   utf8_in           - NUL-terminated Modified UTF-8 input.
+  //   utf16_expected_be - expected chars as byte pairs, high byte first
+  //                       (big-endian); 2 * length bytes are read.
+  //   hash_expected     - expected value of the string's hash_code_.
+  void AssertString(size_t length,
+                    const char* utf8_in,
+                    const char* utf16_expected_be,
+                    uint32_t hash_expected) {
+    String* string = class_linker_->AllocStringFromModifiedUtf8(length, utf8_in);
+    ASSERT_EQ(length, string->count_);
+    ASSERT_TRUE(string->array_ != NULL);
+    ASSERT_TRUE(string->array_->GetChars() != NULL);
+    for (size_t i = 0; i < length; i++) {
+      // Assemble each expected char on the fly rather than into a stack
+      // array: a uint16_t[length] array is a non-standard variable-length
+      // array in C++ and would be zero-sized for the empty-string case.
+      uint16_t expected = (((utf16_expected_be[i*2 + 0] & 0xff) << 8) |
+                           ((utf16_expected_be[i*2 + 1] & 0xff) << 0));
+      EXPECT_EQ(expected, string->array_->GetChar(i));
+    }
+    EXPECT_EQ(hash_expected, string->hash_code_);
+  }
+};
TEST_F(ObjectTest, IsInSamePackage) {
// Matches
@@ -42,4 +65,26 @@
EXPECT_TRUE(oa->Get(1) == oa);
}
+TEST_F(ObjectTest, String) {  // arguments to AssertString: length, UTF-8 input, expected UTF-16 bytes (high byte first), expected hash
+ // Test the empty string.
+ AssertString(0, "", "", 0);
+
+ // Test one-byte characters.
+ AssertString(1, " ", "\x00\x20", 0x20);
+ AssertString(1, "", "\x00\x00", 0);  // NOTE(review): empty input with length 1 - the converter writes nothing, so this relies on the freshly allocated array reading back as 0; confirm this is intended
+ AssertString(1, "\x7f", "\x00\x7f", 0x7f);  // 0x7f: largest value with the high bit clear
+ AssertString(2, "hi", "\x00\x68\x00\x69", (31 * 0x68) + 0x69);
+
+ // Test two-byte characters.
+ AssertString(1, "\xc2\x80", "\x00\x80", 0x80);  // 0x80: first value that no longer fits the one-byte form
+ AssertString(1, "\xd9\xa6", "\x06\x66", 0x0666);
+ AssertString(1, "\xdf\xbf", "\x07\xff", 0x07ff);  // 0x07ff: all 11 payload bits of the two-byte form set
+ AssertString(3, "h\xd9\xa6i", "\x00\x68\x06\x66\x00\x69", (31 * ((31 * 0x68) + 0x0666)) + 0x69);  // mixes one- and two-byte characters
+
+ // Test three-byte characters.
+ AssertString(1, "\xe0\xa0\x80", "\x08\x00", 0x0800);  // 0x0800: first value that exceeds 11 bits
+ AssertString(1, "\xe1\x88\xb4", "\x12\x34", 0x1234);
+ AssertString(1, "\xef\xbf\xbf", "\xff\xff", 0xffff);  // 0xffff: all 16 payload bits of the three-byte form set
+ AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);  // mixes one- and three-byte characters
+}
} // namespace art