Add ASCII vs UTF-16 string data analysis

Measure how much string data belongs to strings that are fully ASCII versus strings that contain UTF-16 characters.

Bug: 77721545
Test: test-art-host-gtest-dexanalyze_test
Change-Id: Ic59613596c1542c693bcf49cee379310414c44cd

commit: f275979ca2d2809d19210195c22526861554fac0 [log] [tgz]
author: Mathieu Chartier <mathieuc@google.com> Fri May 18 13:16:54 2018 -0700
committer: Mathieu Chartier <mathieuc@google.com> Fri May 18 13:18:05 2018 -0700
tree: f44938cb2e68496cdc32883ce5064efdd2e8461c
parent: fc2f8b6d93d05a689f634f6a70bd68376d8265f8 [diff] [blame]
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index bfeb4b9..adc5154 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc

@@ -26,6 +26,7 @@
 #include "dex/code_item_accessors-inl.h"
 #include "dex/dex_instruction-inl.h"
 #include "dex/standard_dex_file.h"
+#include "dex/utf-inl.h"
 
 namespace art {
 
@@ -48,8 +49,20 @@
   std::vector<std::string> strings;
   for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
     uint32_t length = 0;
-    const char* data =
-        dex_file.GetStringDataAndUtf16Length(dex_file.GetStringId(dex::StringIndex(i)), &length);
+    const char* data = dex_file.StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+    // Analyze if the string has any UTF16 chars.
+    bool have_wide_char = false;
+    const char* ptr = data;
+    for (size_t j = 0; j < length; ++j) {
+      have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+    }
+    if (have_wide_char) {
+      wide_string_bytes_ += 2 * length;
+    } else {
+      ascii_string_bytes_ += length;
+    }
+    string_data_bytes_ += ptr - data;
+
     strings.push_back(data);
   }
   // Note that the strings are probably already sorted.
@@ -88,6 +101,11 @@
 }
 
 void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
+  os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
+  os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
+  os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
+
+  // Prefix based strings.
   os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
   os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
   os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
commit	f275979ca2d2809d19210195c22526861554fac0	[log] [tgz]
author	Mathieu Chartier <mathieuc@google.com>	Fri May 18 13:16:54 2018 -0700
committer	Mathieu Chartier <mathieuc@google.com>	Fri May 18 13:18:05 2018 -0700
tree	f44938cb2e68496cdc32883ce5064efdd2e8461c
parent	fc2f8b6d93d05a689f634f6a70bd68376d8265f8 [diff] [blame]