/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 16 | |
| 17 | #include "dexanalyze_strings.h" |
| 18 | |
| 19 | #include <algorithm> |
| 20 | #include <iomanip> |
| 21 | #include <iostream> |
| 22 | |
| 23 | #include "dex/class_accessor-inl.h" |
| 24 | #include "dex/code_item_accessors-inl.h" |
| 25 | #include "dex/dex_instruction-inl.h" |
| 26 | |
| 27 | namespace art { |
| 28 | namespace dexanalyze { |
| 29 | |
| 30 | void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) { |
| 31 | std::set<std::string> unique_strings; |
| 32 | for (const std::unique_ptr<const DexFile>& dex_file : dex_files) { |
| 33 | for (size_t i = 0; i < dex_file->NumStringIds(); ++i) { |
| 34 | uint32_t length = 0; |
| 35 | const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length); |
| 36 | // Analyze if the string has any UTF16 chars. |
| 37 | bool have_wide_char = false; |
| 38 | const char* ptr = data; |
| 39 | for (size_t j = 0; j < length; ++j) { |
| 40 | have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100; |
| 41 | } |
| 42 | if (have_wide_char) { |
| 43 | wide_string_bytes_ += 2 * length; |
| 44 | } else { |
| 45 | ascii_string_bytes_ += length; |
| 46 | } |
| 47 | string_data_bytes_ += ptr - data; |
| 48 | unique_strings.insert(data); |
| 49 | } |
| 50 | } |
| 51 | // Unique strings only since we want to exclude savings from multidex duplication. |
| 52 | std::vector<std::string> strings(unique_strings.begin(), unique_strings.end()); |
| 53 | unique_strings.clear(); |
| 54 | |
| 55 | // Tunable parameters. |
| 56 | static const size_t kMinPrefixLen = 1; |
| 57 | static const size_t kMaxPrefixLen = 255; |
| 58 | static const size_t kPrefixConstantCost = 4; |
| 59 | static const size_t kPrefixIndexCost = 2; |
| 60 | |
| 61 | // Calculate total shared prefix. |
| 62 | std::vector<size_t> shared_len; |
| 63 | prefixes_.clear(); |
| 64 | for (size_t i = 0; i < strings.size(); ++i) { |
| 65 | size_t best_len = 0; |
| 66 | if (i > 0) { |
| 67 | best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1])); |
| 68 | } |
| 69 | if (i < strings.size() - 1) { |
| 70 | best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1])); |
| 71 | } |
| 72 | best_len = std::min(best_len, kMaxPrefixLen); |
| 73 | std::string prefix; |
| 74 | if (best_len >= kMinPrefixLen) { |
| 75 | prefix = strings[i].substr(0, best_len); |
| 76 | ++prefixes_[prefix]; |
| 77 | } |
| 78 | total_prefix_index_cost_ += kPrefixIndexCost; |
| 79 | } |
| 80 | // Optimize the result by moving long prefixes to shorter ones if it causes savings. |
| 81 | while (true) { |
| 82 | bool have_savings = false; |
| 83 | auto it = prefixes_.begin(); |
| 84 | std::vector<std::string> longest; |
| 85 | for (const auto& pair : prefixes_) { |
| 86 | longest.push_back(pair.first); |
| 87 | } |
| 88 | std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) { |
| 89 | return a.length() > b.length(); |
| 90 | }); |
| 91 | // Do longest first since this provides the best results. |
| 92 | for (const std::string& s : longest) { |
| 93 | it = prefixes_.find(s); |
| 94 | CHECK(it != prefixes_.end()); |
| 95 | const std::string& prefix = it->first; |
| 96 | int64_t best_savings = 0u; |
| 97 | int64_t best_len = -1; |
| 98 | for (int64_t len = prefix.length() - 1; len >= 0; --len) { |
| 99 | auto found = prefixes_.find(prefix.substr(0, len)); |
| 100 | if (len != 0 && found == prefixes_.end()) { |
| 101 | continue; |
| 102 | } |
| 103 | // Calculate savings from downgrading the prefix. |
| 104 | int64_t savings = kPrefixConstantCost + prefix.length() - |
| 105 | (prefix.length() - len) * it->second; |
| 106 | if (savings > best_savings) { |
| 107 | best_savings = savings; |
| 108 | best_len = len; |
| 109 | break; |
| 110 | } |
| 111 | } |
| 112 | if (best_len != -1) { |
| 113 | prefixes_[prefix.substr(0, best_len)] += it->second; |
| 114 | it = prefixes_.erase(it); |
| 115 | optimization_savings_ += best_savings; |
| 116 | have_savings = true; |
| 117 | } else { |
| 118 | ++it; |
| 119 | } |
| 120 | } |
| 121 | if (!have_savings) { |
| 122 | break; |
| 123 | } |
| 124 | } |
| 125 | total_num_prefixes_ += prefixes_.size(); |
| 126 | for (const auto& pair : prefixes_) { |
| 127 | // 4 bytes for an offset, one for length. |
| 128 | total_prefix_dict_ += pair.first.length(); |
| 129 | total_prefix_table_ += kPrefixConstantCost; |
| 130 | total_prefix_savings_ += pair.first.length() * pair.second; |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const { |
| 135 | os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n"; |
| 136 | os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n"; |
| 137 | os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n"; |
| 138 | |
| 139 | // Prefix based strings. |
| 140 | os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n"; |
| 141 | os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n"; |
| 142 | os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n"; |
| 143 | os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n"; |
| 144 | int64_t net_savings = total_prefix_savings_; |
| 145 | net_savings -= total_prefix_dict_; |
| 146 | net_savings -= total_prefix_table_; |
| 147 | net_savings -= total_prefix_index_cost_; |
| 148 | os << "Prefix dictionary elements " << total_num_prefixes_ << "\n"; |
| 149 | os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n"; |
| 150 | os << "Prefix net savings " << Percent(net_savings, total_size) << "\n"; |
| 151 | if (verbose_level_ >= VerboseLevel::kEverything) { |
| 152 | std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end()); |
| 153 | // Sort lexicographically. |
| 154 | std::sort(pairs.begin(), pairs.end()); |
| 155 | for (const auto& pair : pairs) { |
| 156 | os << pair.first << " : " << pair.second << "\n"; |
| 157 | } |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | } // namespace dexanalyze |
| 162 | } // namespace art |