Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2018 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "dexanalyze_strings.h" |
| 18 | |
| 19 | #include <algorithm> |
| 20 | #include <iomanip> |
| 21 | #include <iostream> |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 22 | #include <queue> |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 23 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 24 | #include "base/time_utils.h" |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 25 | #include "dex/class_accessor-inl.h" |
| 26 | #include "dex/code_item_accessors-inl.h" |
| 27 | #include "dex/dex_instruction-inl.h" |
| 28 | |
| 29 | namespace art { |
| 30 | namespace dexanalyze { |
| 31 | |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 32 | // Tunable parameters. |
| 33 | static const size_t kMinPrefixLen = 1; |
| 34 | static const size_t kMaxPrefixLen = 255; |
| 35 | static const size_t kPrefixConstantCost = 4; |
| 36 | static const size_t kPrefixIndexCost = 2; |
| 37 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 38 | class PrefixDictionary { |
| 39 | public: |
| 40 | // Add prefix data and return the offset to the start of the added data. |
| 41 | size_t AddPrefixData(const uint8_t* data, size_t len) { |
| 42 | const size_t offset = prefix_data_.size(); |
| 43 | prefix_data_.insert(prefix_data_.end(), data, data + len); |
| 44 | return offset; |
| 45 | } |
| 46 | |
| 47 | static constexpr size_t kLengthBits = 8; |
| 48 | static constexpr size_t kLengthMask = (1u << kLengthBits) - 1; |
| 49 | |
| 50 | // Return the prefix offset and length. |
| 51 | ALWAYS_INLINE void GetOffset(uint32_t prefix_index, uint32_t* offset, uint32_t* length) const { |
| 52 | CHECK_LT(prefix_index, offsets_.size()); |
| 53 | const uint32_t data = offsets_[prefix_index]; |
| 54 | *length = data & kLengthMask; |
| 55 | *offset = data >> kLengthBits; |
| 56 | } |
| 57 | |
| 58 | uint32_t AddOffset(uint32_t offset, uint32_t length) { |
| 59 | CHECK_LE(length, kLengthMask); |
| 60 | offsets_.push_back((offset << kLengthBits) | length); |
| 61 | return offsets_.size() - 1; |
| 62 | } |
| 63 | |
| 64 | public: |
| 65 | std::vector<uint32_t> offsets_; |
| 66 | std::vector<uint8_t> prefix_data_; |
| 67 | }; |
| 68 | |
| 69 | class PrefixStrings { |
| 70 | public: |
| 71 | class Builder { |
| 72 | public: |
| 73 | explicit Builder(PrefixStrings* output) : output_(output) {} |
| 74 | void Build(const std::vector<std::string>& strings); |
| 75 | |
| 76 | private: |
| 77 | PrefixStrings* const output_; |
| 78 | }; |
| 79 | |
| 80 | // Return the string index that was added. |
| 81 | size_t AddString(uint16_t prefix, const std::string& str) { |
| 82 | const size_t string_offset = chars_.size(); |
| 83 | chars_.push_back(static_cast<uint8_t>(prefix >> 8)); |
| 84 | chars_.push_back(static_cast<uint8_t>(prefix >> 0)); |
| 85 | EncodeUnsignedLeb128(&chars_, str.length()); |
| 86 | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]); |
| 87 | chars_.insert(chars_.end(), ptr, ptr + str.length()); |
| 88 | string_offsets_.push_back(string_offset); |
| 89 | return string_offsets_.size() - 1; |
| 90 | } |
| 91 | |
| 92 | std::string GetString(uint32_t string_idx) const { |
| 93 | const size_t offset = string_offsets_[string_idx]; |
| 94 | const uint8_t* suffix_data = &chars_[offset]; |
| 95 | uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) + |
| 96 | suffix_data[1]; |
| 97 | suffix_data += 2; |
| 98 | uint32_t prefix_offset; |
| 99 | uint32_t prefix_len; |
| 100 | dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len); |
| 101 | const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset]; |
| 102 | std::string ret(prefix_data, prefix_data + prefix_len); |
| 103 | uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data); |
| 104 | ret.insert(ret.end(), suffix_data, suffix_data + suffix_len); |
| 105 | return ret; |
| 106 | } |
| 107 | |
| 108 | ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const { |
| 109 | const size_t offset = string_offsets_[string_idx]; |
| 110 | const uint8_t* suffix_data = &chars_[offset]; |
| 111 | uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) + |
| 112 | suffix_data[1]; |
| 113 | suffix_data += 2; |
| 114 | uint32_t prefix_offset; |
| 115 | uint32_t prefix_len; |
| 116 | dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len); |
| 117 | uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data); |
| 118 | if (prefix_len + suffix_len != len) { |
| 119 | return false; |
| 120 | } |
| 121 | const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset]; |
Mathieu Chartier | e5afbf3 | 2018-09-12 17:51:54 -0700 | [diff] [blame] | 122 | if ((true)) { |
| 123 | return memcmp(prefix_data, data, prefix_len) == 0u && |
| 124 | memcmp(suffix_data, data + prefix_len, len - prefix_len) == 0u; |
| 125 | } else { |
| 126 | len -= prefix_len; |
| 127 | while (prefix_len != 0u) { |
| 128 | if (*prefix_data++ != *data++) { |
| 129 | return false; |
| 130 | } |
| 131 | --prefix_len; |
| 132 | } |
| 133 | while (len != 0u) { |
| 134 | if (*suffix_data++ != *data++) { |
| 135 | return false; |
| 136 | } |
| 137 | --len; |
| 138 | } |
| 139 | return true; |
| 140 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 141 | } |
| 142 | |
| 143 | public: |
| 144 | PrefixDictionary dictionary_; |
| 145 | std::vector<uint8_t> chars_; |
| 146 | std::vector<uint32_t> string_offsets_; |
| 147 | }; |
| 148 | |
| 149 | // Normal non prefix strings. |
| 150 | class NormalStrings { |
| 151 | public: |
| 152 | // Return the string index that was added. |
| 153 | size_t AddString(const std::string& str) { |
| 154 | const size_t string_offset = chars_.size(); |
| 155 | EncodeUnsignedLeb128(&chars_, str.length()); |
| 156 | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]); |
| 157 | chars_.insert(chars_.end(), ptr, ptr + str.length()); |
| 158 | string_offsets_.push_back(string_offset); |
| 159 | return string_offsets_.size() - 1; |
| 160 | } |
| 161 | |
| 162 | std::string GetString(uint32_t string_idx) const { |
| 163 | const size_t offset = string_offsets_[string_idx]; |
| 164 | const uint8_t* data = &chars_[offset]; |
| 165 | uint32_t len = DecodeUnsignedLeb128(&data); |
| 166 | return std::string(data, data + len); |
| 167 | } |
| 168 | |
| 169 | ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const { |
| 170 | const size_t offset = string_offsets_[string_idx]; |
| 171 | const uint8_t* str_data = &chars_[offset]; |
| 172 | uint32_t str_len = DecodeUnsignedLeb128(&str_data); |
| 173 | if (str_len != len) { |
| 174 | return false; |
| 175 | } |
| 176 | return memcmp(data, str_data, len) == 0u; |
| 177 | } |
| 178 | |
| 179 | public: |
| 180 | std::vector<uint8_t> chars_; |
| 181 | std::vector<uint32_t> string_offsets_; |
| 182 | }; |
| 183 | |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 184 | // Node value = (distance from root) * (occurrences - 1). |
| 185 | class MatchTrie { |
| 186 | public: |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 187 | MatchTrie* Add(const std::string& str) { |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 188 | MatchTrie* node = this; |
| 189 | size_t depth = 0u; |
| 190 | for (uint8_t c : str) { |
| 191 | ++depth; |
| 192 | if (node->nodes_[c] == nullptr) { |
| 193 | MatchTrie* new_node = new MatchTrie(); |
| 194 | node->nodes_[c].reset(new_node); |
| 195 | new_node->parent_ = node; |
| 196 | new_node->depth_ = depth; |
| 197 | new_node->incoming_ = c; |
| 198 | node = new_node; |
| 199 | } else { |
| 200 | node = node->nodes_[c].get(); |
| 201 | } |
| 202 | ++node->count_; |
| 203 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 204 | return node; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 205 | } |
| 206 | |
| 207 | // Returns the length of the longest prefix and if it's a leaf node. |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 208 | MatchTrie* LongestPrefix(const std::string& str) { |
| 209 | MatchTrie* node = this; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 210 | for (uint8_t c : str) { |
| 211 | if (node->nodes_[c] == nullptr) { |
| 212 | break; |
| 213 | } |
| 214 | node = node->nodes_[c].get(); |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 215 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 216 | return node; |
| 217 | } |
| 218 | |
| 219 | bool IsLeaf() const { |
| 220 | for (const std::unique_ptr<MatchTrie>& cur_node : nodes_) { |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 221 | if (cur_node != nullptr) { |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 222 | return false; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 223 | } |
| 224 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 225 | return true; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 226 | } |
| 227 | |
| 228 | int32_t Savings() const { |
| 229 | int32_t cost = kPrefixConstantCost; |
| 230 | int32_t first_used = 0u; |
| 231 | if (chosen_suffix_count_ == 0u) { |
| 232 | cost += depth_; |
| 233 | } |
| 234 | uint32_t extra_savings = 0u; |
| 235 | for (MatchTrie* cur = parent_; cur != nullptr; cur = cur->parent_) { |
| 236 | if (cur->chosen_) { |
| 237 | first_used = cur->depth_; |
| 238 | if (cur->chosen_suffix_count_ == 0u) { |
| 239 | // First suffix for the chosen parent, remove the cost of the dictionary entry. |
| 240 | extra_savings += first_used; |
| 241 | } |
| 242 | break; |
| 243 | } |
| 244 | } |
| 245 | return count_ * (depth_ - first_used) - cost + extra_savings; |
| 246 | } |
| 247 | |
| 248 | template <typename T, typename... Args, template <typename...> class Queue> |
| 249 | T PopRealTop(Queue<T, Args...>& queue) { |
| 250 | auto pair = queue.top(); |
| 251 | queue.pop(); |
| 252 | // Keep updating values until one sticks. |
| 253 | while (pair.second->Savings() != pair.first) { |
| 254 | pair.first = pair.second->Savings(); |
| 255 | queue.push(pair); |
| 256 | pair = queue.top(); |
| 257 | queue.pop(); |
| 258 | } |
| 259 | return pair; |
| 260 | } |
| 261 | |
| 262 | std::vector<std::string> ExtractPrefixes(size_t max) { |
| 263 | std::vector<std::string> ret; |
| 264 | // Make priority queue and adaptively update it. Each node value is the savings from picking |
| 265 | // it. Insert all of the interesting nodes in the queue (children != 1). |
| 266 | std::priority_queue<std::pair<int32_t, MatchTrie*>> queue; |
| 267 | // Add all of the nodes to the queue. |
| 268 | std::vector<MatchTrie*> work(1, this); |
| 269 | while (!work.empty()) { |
| 270 | MatchTrie* elem = work.back(); |
| 271 | work.pop_back(); |
| 272 | size_t num_childs = 0u; |
| 273 | for (const std::unique_ptr<MatchTrie>& child : elem->nodes_) { |
| 274 | if (child != nullptr) { |
| 275 | work.push_back(child.get()); |
| 276 | ++num_childs; |
| 277 | } |
| 278 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 279 | if (num_childs > 1u || elem->value_ != 0u) { |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 280 | queue.emplace(elem->Savings(), elem); |
| 281 | } |
| 282 | } |
| 283 | std::priority_queue<std::pair<int32_t, MatchTrie*>> prefixes; |
| 284 | // The savings can only ever go down for a given node, never up. |
| 285 | while (max != 0u && !queue.empty()) { |
| 286 | std::pair<int32_t, MatchTrie*> pair = PopRealTop(queue); |
| 287 | if (pair.second != this && pair.first > 0) { |
| 288 | // Pick this node. |
| 289 | uint32_t count = pair.second->count_; |
| 290 | pair.second->chosen_ = true; |
| 291 | for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) { |
| 292 | if (cur->chosen_) { |
| 293 | break; |
| 294 | } |
| 295 | cur->count_ -= count; |
| 296 | } |
| 297 | for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) { |
| 298 | ++cur->chosen_suffix_count_; |
| 299 | } |
| 300 | prefixes.emplace(pair.first, pair.second); |
| 301 | --max; |
| 302 | } else { |
| 303 | // Negative or no EV, just delete the node. |
| 304 | } |
| 305 | } |
| 306 | while (!prefixes.empty()) { |
| 307 | std::pair<int32_t, MatchTrie*> pair = PopRealTop(prefixes); |
| 308 | if (pair.first <= 0) { |
| 309 | continue; |
| 310 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 311 | ret.push_back(pair.second->GetString()); |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 312 | } |
| 313 | return ret; |
| 314 | } |
| 315 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 316 | std::string GetString() const { |
| 317 | std::vector<uint8_t> chars; |
| 318 | for (const MatchTrie* cur = this; cur->parent_ != nullptr; cur = cur->parent_) { |
| 319 | chars.push_back(cur->incoming_); |
| 320 | } |
| 321 | return std::string(chars.rbegin(), chars.rend()); |
| 322 | } |
| 323 | |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 324 | std::unique_ptr<MatchTrie> nodes_[256]; |
| 325 | MatchTrie* parent_ = nullptr; |
| 326 | uint32_t count_ = 0u; |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 327 | uint32_t depth_ = 0u; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 328 | int32_t savings_ = 0u; |
| 329 | uint8_t incoming_ = 0u; |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 330 | // Value of the current node, non zero if the node is chosen. |
| 331 | uint32_t value_ = 0u; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 332 | // If the current node is chosen to be a used prefix. |
| 333 | bool chosen_ = false; |
| 334 | // If the current node is a prefix of a longer chosen prefix. |
| 335 | uint32_t chosen_suffix_count_ = 0u; |
| 336 | }; |
| 337 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 338 | void PrefixStrings::Builder::Build(const std::vector<std::string>& strings) { |
| 339 | std::unique_ptr<MatchTrie> prefixe_trie(new MatchTrie()); |
| 340 | for (size_t i = 0; i < strings.size(); ++i) { |
| 341 | size_t len = 0u; |
| 342 | if (i > 0u) { |
| 343 | CHECK_GT(strings[i], strings[i - 1]); |
| 344 | len = std::max(len, PrefixLen(strings[i], strings[i - 1])); |
| 345 | } |
| 346 | if (i < strings.size() - 1) { |
| 347 | len = std::max(len, PrefixLen(strings[i], strings[i + 1])); |
| 348 | } |
| 349 | len = std::min(len, kMaxPrefixLen); |
| 350 | if (len >= kMinPrefixLen) { |
| 351 | prefixe_trie->Add(strings[i].substr(0, len))->value_ = 1u; |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | // Build prefixes. |
| 356 | { |
| 357 | static constexpr size_t kPrefixBits = 15; |
| 358 | std::vector<std::string> prefixes(prefixe_trie->ExtractPrefixes(1 << kPrefixBits)); |
| 359 | // Add longest prefixes first so that subprefixes can share data. |
| 360 | std::sort(prefixes.begin(), prefixes.end(), [](const std::string& a, const std::string& b) { |
| 361 | return a.length() > b.length(); |
| 362 | }); |
| 363 | prefixe_trie.reset(); |
| 364 | prefixe_trie.reset(new MatchTrie()); |
| 365 | uint32_t prefix_idx = 0u; |
| 366 | CHECK_EQ(output_->dictionary_.AddOffset(0u, 0u), prefix_idx++); |
| 367 | for (const std::string& str : prefixes) { |
| 368 | uint32_t prefix_offset = 0u; |
| 369 | MatchTrie* node = prefixe_trie->LongestPrefix(str); |
| 370 | if (node != nullptr && node->depth_ == str.length() && node->value_ != 0u) { |
| 371 | CHECK_EQ(node->GetString(), str); |
| 372 | uint32_t existing_len = 0u; |
| 373 | output_->dictionary_.GetOffset(node->value_, &prefix_offset, &existing_len); |
| 374 | // Make sure to register the current node. |
| 375 | prefixe_trie->Add(str)->value_ = prefix_idx; |
| 376 | } else { |
| 377 | auto add_str = [&](const std::string& s) { |
| 378 | node = prefixe_trie->Add(s); |
| 379 | node->value_ = prefix_idx; |
| 380 | while (node != nullptr) { |
| 381 | node->value_ = prefix_idx; |
| 382 | node = node->parent_; |
| 383 | } |
| 384 | }; |
| 385 | static constexpr size_t kNumSubstrings = 1u; |
| 386 | // Increasing kNumSubstrings provides savings since it enables common substrings and not |
| 387 | // only prefixes to share data. The problem is that it's slow. |
| 388 | for (size_t i = 0; i < std::min(str.length(), kNumSubstrings); ++i) { |
| 389 | add_str(str.substr(i)); |
| 390 | } |
| 391 | prefix_offset = output_->dictionary_.AddPrefixData( |
| 392 | reinterpret_cast<const uint8_t*>(&str[0]), |
| 393 | str.length()); |
| 394 | } |
| 395 | // TODO: Validiate the prefix offset. |
| 396 | CHECK_EQ(output_->dictionary_.AddOffset(prefix_offset, str.length()), prefix_idx); |
| 397 | ++prefix_idx; |
| 398 | } |
| 399 | } |
| 400 | |
| 401 | // Add strings to the dictionary. |
| 402 | for (const std::string& str : strings) { |
| 403 | MatchTrie* node = prefixe_trie->LongestPrefix(str); |
| 404 | uint32_t prefix_idx = 0u; |
| 405 | uint32_t best_length = 0u; |
| 406 | while (node != nullptr) { |
| 407 | uint32_t offset = 0u; |
| 408 | uint32_t length = 0u; |
| 409 | output_->dictionary_.GetOffset(node->value_, &offset, &length); |
| 410 | if (node->depth_ == length) { |
| 411 | prefix_idx = node->value_; |
| 412 | best_length = node->depth_; |
| 413 | break; |
| 414 | // Actually the prefix we want. |
| 415 | } |
| 416 | node = node->parent_; |
| 417 | } |
| 418 | output_->AddString(prefix_idx, str.substr(best_length)); |
| 419 | } |
| 420 | } |
| 421 | |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 422 | void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) { |
| 423 | std::set<std::string> unique_strings; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 424 | // Accumulate the strings. |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 425 | for (const std::unique_ptr<const DexFile>& dex_file : dex_files) { |
| 426 | for (size_t i = 0; i < dex_file->NumStringIds(); ++i) { |
| 427 | uint32_t length = 0; |
| 428 | const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length); |
| 429 | // Analyze if the string has any UTF16 chars. |
| 430 | bool have_wide_char = false; |
| 431 | const char* ptr = data; |
| 432 | for (size_t j = 0; j < length; ++j) { |
| 433 | have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100; |
| 434 | } |
| 435 | if (have_wide_char) { |
| 436 | wide_string_bytes_ += 2 * length; |
| 437 | } else { |
| 438 | ascii_string_bytes_ += length; |
| 439 | } |
| 440 | string_data_bytes_ += ptr - data; |
| 441 | unique_strings.insert(data); |
| 442 | } |
| 443 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 444 | // Unique strings only since we want to exclude savings from multi-dex duplication. |
| 445 | ProcessStrings(std::vector<std::string>(unique_strings.begin(), unique_strings.end())); |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 446 | } |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 447 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 448 | void AnalyzeStrings::ProcessStrings(const std::vector<std::string>& strings) { |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 449 | // Calculate total shared prefix. |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 450 | size_t prefix_index_cost_ = 0u; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 451 | for (size_t i = 0; i < strings.size(); ++i) { |
| 452 | size_t best_len = 0; |
| 453 | if (i > 0) { |
| 454 | best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1])); |
| 455 | } |
| 456 | if (i < strings.size() - 1) { |
| 457 | best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1])); |
| 458 | } |
| 459 | best_len = std::min(best_len, kMaxPrefixLen); |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 460 | if (best_len >= kMinPrefixLen) { |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 461 | total_shared_prefix_bytes_ += best_len; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 462 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 463 | prefix_index_cost_ += kPrefixIndexCost; |
| 464 | if (strings[i].length() < 64) { |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 465 | ++short_strings_; |
| 466 | } else { |
| 467 | ++long_strings_; |
| 468 | } |
| 469 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 470 | total_prefix_index_cost_ += prefix_index_cost_; |
| 471 | |
| 472 | PrefixStrings prefix_strings; |
| 473 | { |
| 474 | PrefixStrings::Builder prefix_builder(&prefix_strings); |
| 475 | prefix_builder.Build(strings); |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 476 | } |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 477 | Benchmark(prefix_strings, strings, &prefix_timings_); |
| 478 | const size_t num_prefixes = prefix_strings.dictionary_.offsets_.size(); |
| 479 | total_num_prefixes_ += num_prefixes; |
| 480 | total_prefix_table_ += num_prefixes * sizeof(prefix_strings.dictionary_.offsets_[0]); |
| 481 | total_prefix_dict_ += prefix_strings.dictionary_.prefix_data_.size(); |
| 482 | |
| 483 | { |
| 484 | NormalStrings normal_strings; |
| 485 | for (const std::string& s : strings) { |
| 486 | normal_strings.AddString(s); |
| 487 | } |
| 488 | const uint64_t unique_string_data_bytes = normal_strings.chars_.size(); |
| 489 | total_unique_string_data_bytes_ += unique_string_data_bytes; |
| 490 | total_prefix_savings_ += unique_string_data_bytes - prefix_strings.chars_.size() + |
| 491 | prefix_index_cost_; |
| 492 | Benchmark(normal_strings, strings, &normal_timings_); |
| 493 | } |
| 494 | } |
| 495 | |
| 496 | template <typename Strings> |
| 497 | void AnalyzeStrings::Benchmark(const Strings& strings, |
| 498 | const std::vector<std::string>& reference, |
| 499 | StringTimings* timings) { |
| 500 | const size_t kIterations = 100; |
| 501 | timings->num_comparisons_ += reference.size() * kIterations; |
| 502 | |
| 503 | uint64_t start = NanoTime(); |
| 504 | for (size_t j = 0; j < kIterations; ++j) { |
| 505 | for (size_t i = 0; i < reference.size(); ++i) { |
| 506 | CHECK(strings.Equal( |
| 507 | i, |
| 508 | reinterpret_cast<const uint8_t*>(&reference[i][0]), |
| 509 | reference[i].length())) |
| 510 | << i << ": " << strings.GetString(i) << " vs " << reference[i]; |
| 511 | } |
| 512 | } |
| 513 | timings->time_equal_comparisons_ += NanoTime() - start; |
| 514 | |
| 515 | start = NanoTime(); |
| 516 | for (size_t j = 0; j < kIterations; ++j) { |
| 517 | size_t count = 0u; |
| 518 | for (size_t i = 0; i < reference.size(); ++i) { |
| 519 | count += strings.Equal( |
| 520 | reference.size() - 1 - i, |
| 521 | reinterpret_cast<const uint8_t*>(&reference[i][0]), |
| 522 | reference[i].length()); |
| 523 | } |
| 524 | CHECK_LT(count, 2u); |
| 525 | } |
| 526 | timings->time_non_equal_comparisons_ += NanoTime() - start; |
| 527 | } |
| 528 | |
| 529 | template void AnalyzeStrings::Benchmark(const PrefixStrings&, |
| 530 | const std::vector<std::string>&, |
| 531 | StringTimings* timings); |
| 532 | template void AnalyzeStrings::Benchmark(const NormalStrings&, |
| 533 | const std::vector<std::string>&, |
| 534 | StringTimings* timings); |
| 535 | |
| 536 | void StringTimings::Dump(std::ostream& os) const { |
| 537 | const double comparisons = static_cast<double>(num_comparisons_); |
| 538 | os << "Compare equal " << static_cast<double>(time_equal_comparisons_) / comparisons << "\n"; |
| 539 | os << "Compare not equal " << static_cast<double>(time_non_equal_comparisons_) / comparisons << "\n"; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 540 | } |
| 541 | |
| 542 | void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const { |
| 543 | os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n"; |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 544 | os << "Total unique string data bytes " |
| 545 | << Percent(total_unique_string_data_bytes_, total_size) << "\n"; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 546 | os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n"; |
| 547 | os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n"; |
| 548 | |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 549 | os << "Prefix string timings\n"; |
| 550 | prefix_timings_.Dump(os); |
| 551 | os << "Normal string timings\n"; |
| 552 | normal_timings_.Dump(os); |
| 553 | |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 554 | // Prefix based strings. |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 555 | os << "Total shared prefix bytes " << Percent(total_shared_prefix_bytes_, total_size) << "\n"; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 556 | os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n"; |
| 557 | os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n"; |
| 558 | os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n"; |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 559 | int64_t net_savings = total_prefix_savings_; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 560 | net_savings -= total_prefix_dict_; |
| 561 | net_savings -= total_prefix_table_; |
| 562 | net_savings -= total_prefix_index_cost_; |
| 563 | os << "Prefix dictionary elements " << total_num_prefixes_ << "\n"; |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 564 | os << "Prefix base savings " << Percent(total_prefix_savings_, total_size) << "\n"; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 565 | os << "Prefix net savings " << Percent(net_savings, total_size) << "\n"; |
Mathieu Chartier | 49ca869 | 2018-08-24 09:34:49 -0700 | [diff] [blame] | 566 | os << "Strings using prefix " |
| 567 | << Percent(strings_used_prefixed_, total_prefix_index_cost_ / kPrefixIndexCost) << "\n"; |
| 568 | os << "Short strings " << Percent(short_strings_, short_strings_ + long_strings_) << "\n"; |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 569 | if (verbose_level_ >= VerboseLevel::kEverything) { |
Mathieu Chartier | 5eae4c5 | 2018-08-30 15:52:15 -0700 | [diff] [blame] | 570 | std::vector<std::pair<std::string, size_t>> pairs; // (prefixes_.begin(), prefixes_.end()); |
Mathieu Chartier | 0ddf626 | 2018-08-23 17:37:46 -0700 | [diff] [blame] | 571 | // Sort lexicographically. |
| 572 | std::sort(pairs.begin(), pairs.end()); |
| 573 | for (const auto& pair : pairs) { |
| 574 | os << pair.first << " : " << pair.second << "\n"; |
| 575 | } |
| 576 | } |
| 577 | } |
| 578 | |
| 579 | } // namespace dexanalyze |
| 580 | } // namespace art |