/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dexanalyze_strings.h"

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <set>

#include "dex/class_accessor-inl.h"
#include "dex/code_item_accessors-inl.h"
#include "dex/dex_instruction-inl.h"

namespace art {
namespace dexanalyze {

void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
  std::set<std::string> unique_strings;
  for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
    for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
      uint32_t length = 0;
      const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
      // Analyze if the string has any UTF16 chars.
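      // Note: |length| is the string's UTF-16 code unit count, which is why wide strings are
      // charged two bytes per unit below while ASCII strings are charged one.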
      bool have_wide_char = false;
      const char* ptr = data;
      for (size_t j = 0; j < length; ++j) {
        have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
      }
      if (have_wide_char) {
        wide_string_bytes_ += 2 * length;
      } else {
        ascii_string_bytes_ += length;
      }
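      // The decode loop advanced |ptr| one code unit at a time, so ptr - data is the size in
      // bytes of this string's MUTF-8 payload (length header and trailing NUL excluded).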
      string_data_bytes_ += ptr - data;
      unique_strings.insert(data);
    }
  }
  // Unique strings only since we want to exclude savings from multidex duplication.
  std::vector<std::string> strings(unique_strings.begin(), unique_strings.end());
  unique_strings.clear();

  // Tunable parameters.
  static const size_t kMinPrefixLen = 1;
  static const size_t kMaxPrefixLen = 255;
  static const size_t kPrefixConstantCost = 4;
  static const size_t kPrefixIndexCost = 2;
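  // Interpretation of the costs above (matching how they are used below): kPrefixIndexCost is
  // charged once per string for referencing its prefix, kPrefixConstantCost once per prefix
  // dictionary entry. kMaxPrefixLen presumably keeps a prefix length within a single byte.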

  // Calculate total shared prefix.
  std::vector<size_t> shared_len;
  prefixes_.clear();
  for (size_t i = 0; i < strings.size(); ++i) {
    size_t best_len = 0;
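    // |strings| is sorted (it came from a std::set), so the longest prefix this string shares
    // with any other unique string is shared with one of its immediate neighbors.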
    if (i > 0) {
      best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
    }
    if (i < strings.size() - 1) {
      best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
    }
    best_len = std::min(best_len, kMaxPrefixLen);
    std::string prefix;
    if (best_len >= kMinPrefixLen) {
      prefix = strings[i].substr(0, best_len);
      ++prefixes_[prefix];
    }
    total_prefix_index_cost_ += kPrefixIndexCost;
  }
  // Optimize the result by moving long prefixes to shorter ones if it causes savings.
  while (true) {
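    // Keep making passes over the prefixes until a full pass produces no further savings.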
    bool have_savings = false;
    auto it = prefixes_.begin();
    std::vector<std::string> longest;
    for (const auto& pair : prefixes_) {
      longest.push_back(pair.first);
    }
    std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) {
      return a.length() > b.length();
    });
    // Do longest first since this provides the best results.
    for (const std::string& s : longest) {
      it = prefixes_.find(s);
      CHECK(it != prefixes_.end());
      const std::string& prefix = it->first;
      int64_t best_savings = 0u;
      int64_t best_len = -1;
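      // Consider folding this prefix into each shorter candidate that is either already in the
      // dictionary or empty (len == 0, i.e. dropping the prefix entirely).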
      for (int64_t len = prefix.length() - 1; len >= 0; --len) {
        auto found = prefixes_.find(prefix.substr(0, len));
        if (len != 0 && found == prefixes_.end()) {
          continue;
        }
        // Calculate savings from downgrading the prefix.
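        // Dropping this entry saves its table slot (kPrefixConstantCost) and its dictionary
        // characters (prefix.length()), but each of the it->second strings that used it now
        // shares only |len| characters, losing (prefix.length() - len) bytes apiece.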
        int64_t savings = kPrefixConstantCost + prefix.length() -
            (prefix.length() - len) * it->second;
        if (savings > best_savings) {
          best_savings = savings;
          best_len = len;
          break;
        }
      }
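      // If a cheaper shorter prefix was found, fold this entry's count into it, drop the longer
      // prefix, and record the savings.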
      if (best_len != -1) {
        prefixes_[prefix.substr(0, best_len)] += it->second;
        it = prefixes_.erase(it);
        optimization_savings_ += best_savings;
        have_savings = true;
      } else {
        ++it;
      }
    }
    if (!have_savings) {
      break;
    }
  }
  total_num_prefixes_ += prefixes_.size();
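  // Account for the final dictionary: total_prefix_dict_ is the character data stored once per
  // prefix, total_prefix_table_ the per-entry table overhead, and total_prefix_savings_ the
  // prefix bytes that would no longer need to be stored with each string.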
  for (const auto& pair : prefixes_) {
    // 4 bytes for an offset, one for length.
    total_prefix_dict_ += pair.first.length();
    total_prefix_table_ += kPrefixConstantCost;
    total_prefix_savings_ += pair.first.length() * pair.second;
  }
}

void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
  os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
  os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
  os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";

  // Prefix based strings.
  os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
  os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
  os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
  os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
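  // Net savings = prefix bytes removed from the strings, minus the dictionary, table, and
  // per-string index overheads introduced above.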
  int64_t net_savings = total_prefix_savings_;
  net_savings -= total_prefix_dict_;
  net_savings -= total_prefix_table_;
  net_savings -= total_prefix_index_cost_;
  os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
  os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n";
  os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
  if (verbose_level_ >= VerboseLevel::kEverything) {
    std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end());
    // Sort lexicographically.
    std::sort(pairs.begin(), pairs.end());
    for (const auto& pair : pairs) {
      os << pair.first << " : " << pair.second << "\n";
    }
  }
}

}  // namespace dexanalyze
}  // namespace art