Merge "Ensure we have the correct thread when allocating obsolete methods."
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index b661e00..c27f8db 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -75,8 +75,11 @@
 	$(call dexpreopt-remove-classes.dex,$@)
 
 ART_TEST_GTEST_VerifierDeps_SRC := $(abspath $(wildcard $(LOCAL_PATH)/VerifierDeps/*.smali))
+ART_TEST_GTEST_VerifierDepsMulti_SRC := $(abspath $(wildcard $(LOCAL_PATH)/VerifierDepsMulti/*.smali))
 ART_TEST_HOST_GTEST_VerifierDeps_DEX := $(dir $(ART_TEST_HOST_GTEST_Main_DEX))$(subst Main,VerifierDeps,$(basename $(notdir $(ART_TEST_HOST_GTEST_Main_DEX))))$(suffix $(ART_TEST_HOST_GTEST_Main_DEX))
 ART_TEST_TARGET_GTEST_VerifierDeps_DEX := $(dir $(ART_TEST_TARGET_GTEST_Main_DEX))$(subst Main,VerifierDeps,$(basename $(notdir $(ART_TEST_TARGET_GTEST_Main_DEX))))$(suffix $(ART_TEST_TARGET_GTEST_Main_DEX))
+ART_TEST_HOST_GTEST_VerifierDepsMulti_DEX := $(dir $(ART_TEST_HOST_GTEST_Main_DEX))$(subst Main,VerifierDepsMulti,$(basename $(notdir $(ART_TEST_HOST_GTEST_Main_DEX))))$(suffix $(ART_TEST_HOST_GTEST_Main_DEX))
+ART_TEST_TARGET_GTEST_VerifierDepsMulti_DEX := $(dir $(ART_TEST_TARGET_GTEST_Main_DEX))$(subst Main,VerifierDepsMulti,$(basename $(notdir $(ART_TEST_TARGET_GTEST_Main_DEX))))$(suffix $(ART_TEST_TARGET_GTEST_Main_DEX))
 
 $(ART_TEST_HOST_GTEST_VerifierDeps_DEX): $(ART_TEST_GTEST_VerifierDeps_SRC) $(HOST_OUT_EXECUTABLES)/smali
 	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
@@ -84,6 +87,12 @@
 $(ART_TEST_TARGET_GTEST_VerifierDeps_DEX): $(ART_TEST_GTEST_VerifierDeps_SRC) $(HOST_OUT_EXECUTABLES)/smali
 	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
 
+$(ART_TEST_HOST_GTEST_VerifierDepsMulti_DEX): $(ART_TEST_GTEST_VerifierDepsMulti_SRC) $(HOST_OUT_EXECUTABLES)/smali
+	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+
+$(ART_TEST_TARGET_GTEST_VerifierDepsMulti_DEX): $(ART_TEST_GTEST_VerifierDepsMulti_SRC) $(HOST_OUT_EXECUTABLES)/smali
+	 $(HOST_OUT_EXECUTABLES)/smali --output=$@ $(filter %.smali,$^)
+
 # Dex file dependencies for each gtest.
 ART_GTEST_dex2oat_environment_tests_DEX_DEPS := Main MainStripped MultiDex MultiDexModifiedSecondary Nested
 
@@ -115,7 +124,7 @@
 ART_GTEST_transaction_test_DEX_DEPS := Transaction
 ART_GTEST_type_lookup_table_test_DEX_DEPS := Lookup
 ART_GTEST_unstarted_runtime_test_DEX_DEPS := Nested
-ART_GTEST_verifier_deps_test_DEX_DEPS := VerifierDeps MultiDex
+ART_GTEST_verifier_deps_test_DEX_DEPS := VerifierDeps VerifierDepsMulti MultiDex
 ART_GTEST_dex_to_dex_decompiler_test_DEX_DEPS := VerifierDeps DexToDexDecompiler
 
 # The elf writer test has dependencies on core.oat.
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 1ee2a21..c59e36b 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -417,6 +417,7 @@
 
     shared_libs: [
         "libartd-compiler",
+        "libartd-simulator",
         "libvixld-arm",
         "libvixld-arm64",
 
diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h
index e2a0942..00e2d62 100644
--- a/compiler/compiled_method.h
+++ b/compiler/compiled_method.h
@@ -170,7 +170,6 @@
   // choose to squeeze the Type into fewer than 8 bits, we'll have to declare
   // patch_type_ as an uintN_t and do explicit static_cast<>s.
   enum class Type : uint8_t {
-    kRecordPosition,   // Just record patch position for patchoat.
     kMethod,
     kCall,
     kCallRelative,     // NOTE: Actual patching is instruction_set-dependent.
@@ -183,10 +182,6 @@
     kDexCacheArray,    // NOTE: Actual patching is instruction_set-dependent.
   };
 
-  static LinkerPatch RecordPosition(size_t literal_offset) {
-    return LinkerPatch(literal_offset, Type::kRecordPosition, /* target_dex_file */ nullptr);
-  }
-
   static LinkerPatch MethodPatch(size_t literal_offset,
                                  const DexFile* target_dex_file,
                                  uint32_t target_method_idx) {
diff --git a/compiler/dex/dex_to_dex_decompiler.cc b/compiler/dex/dex_to_dex_decompiler.cc
index 5360103..85d5784 100644
--- a/compiler/dex/dex_to_dex_decompiler.cc
+++ b/compiler/dex/dex_to_dex_decompiler.cc
@@ -185,7 +185,7 @@
   }
 
   if (quickened_info_ptr_ != quickened_info_end_) {
-    LOG(ERROR) << "Failed to use all values in quickening info."
+    LOG(FATAL) << "Failed to use all values in quickening info."
                << " Actual: " << std::hex << quickened_info_ptr_
                << " Expected: " << quickened_info_end_;
     return false;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index a5e4cb0..9950987 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -480,7 +480,9 @@
       DCHECK(!it.HasNext());
     }
   }
-  DCHECK_EQ(quickening_info_ptr, quickening_info_end) << "Failed to use all quickening info";
+  if (quickening_info_ptr != quickening_info_end) {
+    LOG(FATAL) << "Failed to use all quickening info";
+  }
 }
 
 void CompilerDriver::CompileAll(jobject class_loader,
@@ -535,9 +537,8 @@
   if (klass->IsVerified()) {
     // Class is verified so we can enable DEX-to-DEX compilation for performance.
     return max_level;
-  } else if (klass->IsCompileTimeVerified()) {
+  } else if (klass->ShouldVerifyAtRuntime()) {
     // Class verification has soft-failed. Anyway, ensure at least correctness.
-    DCHECK_EQ(klass->GetStatus(), mirror::Class::kStatusRetryVerificationAtRuntime);
     return optimizer::DexToDexCompilationLevel::kRequired;
   } else {
     // Class verification has failed: do not run DEX-to-DEX compilation.
@@ -964,7 +965,7 @@
       if (cls == nullptr) {
         soa.Self()->ClearException();
       } else if (&cls->GetDexFile() == dex_file) {
-        DCHECK(cls->IsErroneous() || cls->IsVerified() || cls->IsCompileTimeVerified())
+        DCHECK(cls->IsErroneous() || cls->IsVerified() || cls->ShouldVerifyAtRuntime())
             << cls->PrettyClass()
             << " " << cls->GetStatus();
       }
@@ -2160,6 +2161,14 @@
         LOG(ERROR) << "Verification failed on class " << PrettyDescriptor(descriptor)
                    << " because: " << error_msg;
         manager_->GetCompiler()->SetHadHardVerifierFailure();
+      } else {
+        // Force a soft failure for the VerifierDeps. This is a sanity measure, as
+        // the vdex file already records that the class hasn't been resolved. It avoids
+        // trying to do future verification optimizations when processing the vdex file.
+        DCHECK(failure_kind == verifier::MethodVerifier::kSoftFailure ||
+               failure_kind == verifier::MethodVerifier::kNoFailure)
+            << failure_kind;
+        failure_kind = verifier::MethodVerifier::kSoftFailure;
       }
     } else if (!SkipClass(jclass_loader, dex_file, klass.Get())) {
       CHECK(klass->IsResolved()) << klass->PrettyClass();
@@ -2172,7 +2181,7 @@
         manager_->GetCompiler()->SetHadHardVerifierFailure();
       }
 
-      CHECK(klass->IsCompileTimeVerified() || klass->IsErroneous())
+      CHECK(klass->ShouldVerifyAtRuntime() || klass->IsVerified() || klass->IsErroneous())
           << klass->PrettyDescriptor() << ": state=" << klass->GetStatus();
 
       // It is *very* problematic if there are verification errors in the boot classpath. For example,
@@ -2186,6 +2195,13 @@
           DCHECK(klass->IsVerified()) << "Boot classpath class " << klass->PrettyClass()
               << " failed to fully verify: state= " << klass->GetStatus();
         }
+        if (klass->IsVerified()) {
+          DCHECK_EQ(failure_kind, verifier::MethodVerifier::kNoFailure);
+        } else if (klass->ShouldVerifyAtRuntime()) {
+          DCHECK_EQ(failure_kind, verifier::MethodVerifier::kSoftFailure);
+        } else {
+          DCHECK_EQ(failure_kind, verifier::MethodVerifier::kHardFailure);
+        }
       }
     } else {
       // Make the skip a soft failure, essentially being considered as verify at runtime.
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 1e5c43d..cbde587 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -355,6 +355,10 @@
     return current_dex_to_dex_methods_;
   }
 
+  const ProfileCompilationInfo* GetProfileCompilationInfo() const {
+    return profile_compilation_info_;
+  }
+
  private:
   // Can `referrer_class` access the resolved `member`?
   // Dispatch call to mirror::Class::CanAccessResolvedField or
diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc
index 562f97b..35aa1ee 100644
--- a/compiler/driver/compiler_driver_test.cc
+++ b/compiler/driver/compiler_driver_test.cc
@@ -133,9 +133,10 @@
         << " " << dex.GetMethodDeclaringClassDescriptor(dex.GetMethodId(i)) << " "
         << dex.GetMethodName(dex.GetMethodId(i));
   }
-  EXPECT_EQ(dex.NumFieldIds(), dex_cache->NumResolvedFields());
+  EXPECT_TRUE(dex_cache->StaticArtFieldSize() == dex_cache->NumResolvedFields()
+      || dex.NumFieldIds() == dex_cache->NumResolvedFields());
   for (size_t i = 0; i < dex_cache->NumResolvedFields(); i++) {
-    ArtField* field = cl->GetResolvedField(i, dex_cache);
+    ArtField* field = dex_cache->GetResolvedField(i, cl->GetImagePointerSize());
     EXPECT_TRUE(field != nullptr) << "field_idx=" << i
                                << " " << dex.GetFieldDeclaringClassDescriptor(dex.GetFieldId(i))
                                << " " << dex.GetFieldName(dex.GetFieldId(i));
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index c222f90..34ad1c5 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -32,7 +32,6 @@
       no_inline_from_(nullptr),
       boot_image_(false),
       app_image_(false),
-      include_patch_information_(kDefaultIncludePatchInformation),
       top_k_profile_threshold_(kDefaultTopKProfileThreshold),
       debuggable_(false),
       generate_debug_info_(kDefaultGenerateDebugInfo),
@@ -66,7 +65,6 @@
                                  size_t inline_depth_limit,
                                  size_t inline_max_code_units,
                                  const std::vector<const DexFile*>* no_inline_from,
-                                 bool include_patch_information,
                                  double top_k_profile_threshold,
                                  bool debuggable,
                                  bool generate_debug_info,
@@ -93,7 +91,6 @@
       no_inline_from_(no_inline_from),
       boot_image_(false),
       app_image_(false),
-      include_patch_information_(include_patch_information),
       top_k_profile_threshold_(top_k_profile_threshold),
       debuggable_(debuggable),
       generate_debug_info_(generate_debug_info),
@@ -206,10 +203,6 @@
     debuggable_ = true;
   } else if (option.starts_with("--top-k-profile-threshold=")) {
     ParseDouble(option.data(), '=', 0.0, 100.0, &top_k_profile_threshold_, Usage);
-  } else if (option == "--include-patch-information") {
-    include_patch_information_ = true;
-  } else if (option == "--no-include-patch-information") {
-    include_patch_information_ = false;
   } else if (option == "--abort-on-hard-verifier-error") {
     abort_on_hard_verifier_failure_ = true;
   } else if (option.starts_with("--dump-init-failures=")) {
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 6894cd5..2e3e55f 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -46,7 +46,6 @@
   static constexpr double kDefaultTopKProfileThreshold = 90.0;
   static const bool kDefaultGenerateDebugInfo = false;
   static const bool kDefaultGenerateMiniDebugInfo = false;
-  static const bool kDefaultIncludePatchInformation = false;
   static const size_t kDefaultInlineDepthLimit = 3;
   static const size_t kDefaultInlineMaxCodeUnits = 32;
   static constexpr size_t kUnsetInlineDepthLimit = -1;
@@ -68,7 +67,6 @@
                   size_t inline_depth_limit,
                   size_t inline_max_code_units,
                   const std::vector<const DexFile*>* no_inline_from,
-                  bool include_patch_information,
                   double top_k_profile_threshold,
                   bool debuggable,
                   bool generate_debug_info,
@@ -213,10 +211,6 @@
     return implicit_suspend_checks_;
   }
 
-  bool GetIncludePatchInformation() const {
-    return include_patch_information_;
-  }
-
   bool IsBootImage() const {
     return boot_image_;
   }
@@ -305,7 +299,6 @@
 
   bool boot_image_;
   bool app_image_;
-  bool include_patch_information_;
   // When using a profile file only the top K% of the profiled samples will be compiled.
   double top_k_profile_threshold_;
   bool debuggable_;
diff --git a/compiler/elf_writer.h b/compiler/elf_writer.h
index d55f745..7baae52 100644
--- a/compiler/elf_writer.h
+++ b/compiler/elf_writer.h
@@ -63,7 +63,6 @@
   virtual void EndText(OutputStream* text) = 0;
   virtual void WriteDynamicSection() = 0;
   virtual void WriteDebugInfo(const ArrayRef<const debug::MethodDebugInfo>& method_infos) = 0;
-  virtual void WritePatchLocations(const ArrayRef<const uintptr_t>& patch_locations) = 0;
   virtual bool End() = 0;
 
   // Get the ELF writer's stream. This stream can be used for writing data directly
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 0d6575c..28c35e9 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -105,7 +105,6 @@
   void EndText(OutputStream* text) OVERRIDE;
   void WriteDynamicSection() OVERRIDE;
   void WriteDebugInfo(const ArrayRef<const debug::MethodDebugInfo>& method_infos) OVERRIDE;
-  void WritePatchLocations(const ArrayRef<const uintptr_t>& patch_locations) OVERRIDE;
   bool End() OVERRIDE;
 
   virtual OutputStream* GetStream() OVERRIDE;
@@ -268,17 +267,6 @@
 }
 
 template <typename ElfTypes>
-void ElfWriterQuick<ElfTypes>::WritePatchLocations(
-    const ArrayRef<const uintptr_t>& patch_locations) {
-  // Add relocation section for .text.
-  if (compiler_options_->GetIncludePatchInformation()) {
-    // Note that ElfWriter::Fixup will be called regardless and therefore
-    // we need to include oat_patches for debug sections unconditionally.
-    builder_->WritePatches(".text.oat_patches", patch_locations);
-  }
-}
-
-template <typename ElfTypes>
 bool ElfWriterQuick<ElfTypes>::End() {
   builder_->End();
   if (compiler_options_->GetGenerateBuildId()) {
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index b0225a3..89e8a67 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -318,7 +318,6 @@
 
         elf_writer->WriteDynamicSection();
         elf_writer->WriteDebugInfo(oat_writer->GetMethodDebugInfo());
-        elf_writer->WritePatchLocations(oat_writer->GetAbsolutePatchLocations());
 
         bool success = elf_writer->End();
         ASSERT_TRUE(success);
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 117d113..a4a1fd3 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -238,10 +238,11 @@
       case ImageHeader::kStorageModeLZ4: {
         const size_t compressed_max_size = LZ4_compressBound(image_data_size);
         compressed_data.reset(new char[compressed_max_size]);
-        data_size = LZ4_compress(
+        data_size = LZ4_compress_default(
             reinterpret_cast<char*>(image_info.image_->Begin()) + sizeof(ImageHeader),
             &compressed_data[0],
-            image_data_size);
+            image_data_size,
+            compressed_max_size);
 
         break;
       }
@@ -967,11 +968,12 @@
             << Class::PrettyClass(declaring_class) << " not in class linker table";
       }
     }
-    ArtField** resolved_fields = dex_cache->GetResolvedFields();
+    mirror::FieldDexCacheType* resolved_fields = dex_cache->GetResolvedFields();
     for (size_t i = 0; i < dex_cache->NumResolvedFields(); i++) {
-      ArtField* field = mirror::DexCache::GetElementPtrSize(resolved_fields, i, target_ptr_size_);
+      auto pair = mirror::DexCache::GetNativePairPtrSize(resolved_fields, i, target_ptr_size_);
+      ArtField* field = pair.object;
       if (field != nullptr && !KeepClass(field->GetDeclaringClass().Ptr())) {
-        dex_cache->SetResolvedField(i, nullptr, target_ptr_size_);
+        dex_cache->ClearResolvedField(pair.index, target_ptr_size_);
       }
     }
     // Clean the dex field. It might have been populated during the initialization phase, but
@@ -1595,7 +1597,7 @@
           break;
         }
         case kBinDexCacheArray:
-          bin_offset = RoundUp(bin_offset, DexCacheArraysLayout::Alignment());
+          bin_offset = RoundUp(bin_offset, DexCacheArraysLayout::Alignment(target_ptr_size_));
           break;
         case kBinImTable:
         case kBinIMTConflictTable: {
@@ -2235,16 +2237,17 @@
       mirror::DexCache::SetElementPtrSize(copy_methods, i, copy, target_ptr_size_);
     }
   }
-  ArtField** orig_fields = orig_dex_cache->GetResolvedFields();
+  mirror::FieldDexCacheType* orig_fields = orig_dex_cache->GetResolvedFields();
   if (orig_fields != nullptr) {
     copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::ResolvedFieldsOffset(),
                                                NativeLocationInImage(orig_fields),
                                                PointerSize::k64);
-    ArtField** copy_fields = NativeCopyLocation(orig_fields, orig_dex_cache);
+    mirror::FieldDexCacheType* copy_fields = NativeCopyLocation(orig_fields, orig_dex_cache);
     for (size_t i = 0, num = orig_dex_cache->NumResolvedFields(); i != num; ++i) {
-      ArtField* orig = mirror::DexCache::GetElementPtrSize(orig_fields, i, target_ptr_size_);
-      ArtField* copy = NativeLocationInImage(orig);
-      mirror::DexCache::SetElementPtrSize(copy_fields, i, copy, target_ptr_size_);
+      mirror::FieldDexCachePair orig =
+          mirror::DexCache::GetNativePairPtrSize(orig_fields, i, target_ptr_size_);
+      mirror::FieldDexCachePair copy(NativeLocationInImage(orig.object), orig.index);
+      mirror::DexCache::SetNativePairPtrSize(copy_fields, i, copy, target_ptr_size_);
     }
   }
   mirror::MethodTypeDexCacheType* orig_method_types = orig_dex_cache->GetResolvedMethodTypes();
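
Aside (not part of this patch): the compression hunk above switches from the deprecated LZ4_compress() to LZ4_compress_default(), which takes an explicit destination capacity and returns the number of bytes written (0 on failure). A minimal usage sketch independent of the ImageWriter code; the helper name and buffer handling are illustrative only:

#include <lz4.h>

#include <vector>

// Compress src_size bytes from src into out; returns the compressed size,
// or 0 if LZ4 failed to fit the output into the worst-case-sized buffer.
static int CompressBlock(const char* src, int src_size, std::vector<char>* out) {
  const int max_dst_size = LZ4_compressBound(src_size);
  out->resize(max_dst_size);
  const int written = LZ4_compress_default(src, out->data(), src_size, max_dst_size);
  out->resize(written > 0 ? written : 0);
  return written;
}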
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index cbd831a..3ae7974 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -100,7 +100,6 @@
       CompilerOptions::kDefaultInlineDepthLimit,
       CompilerOptions::kDefaultInlineMaxCodeUnits,
       /* no_inline_from */ nullptr,
-      /* include_patch_information */ false,
       CompilerOptions::kDefaultTopKProfileThreshold,
       Runtime::Current()->IsJavaDebuggable(),
       CompilerOptions::kDefaultGenerateDebugInfo,
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index e2233e4..97b1374 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -250,7 +250,6 @@
 
     elf_writer->WriteDynamicSection();
     elf_writer->WriteDebugInfo(oat_writer.GetMethodDebugInfo());
-    elf_writer->WritePatchLocations(oat_writer.GetAbsolutePatchLocations());
 
     if (!elf_writer->End()) {
       return false;
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 8ab44d2..afcdf5e 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -1225,7 +1225,7 @@
                 break;
               }
               default: {
-                DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kRecordPosition);
+                DCHECK(false) << "Unexpected linker patch type: " << patch.GetType();
                 break;
               }
             }
@@ -2270,28 +2270,11 @@
                              /* verify */ true,
                              /* verify_checksum */ true,
                              &error_msg);
-  } else if (oat_dex_file->source_.IsRawFile()) {
+  } else {
+    CHECK(oat_dex_file->source_.IsRawFile())
+        << static_cast<size_t>(oat_dex_file->source_.GetType());
     File* raw_file = oat_dex_file->source_.GetRawFile();
     dex_file = DexFile::OpenDex(raw_file->Fd(), location, /* verify_checksum */ true, &error_msg);
-  } else {
-    // The source data is a vdex file.
-    CHECK(oat_dex_file->source_.IsRawData())
-        << static_cast<size_t>(oat_dex_file->source_.GetType());
-    const uint8_t* raw_dex_file = oat_dex_file->source_.GetRawData();
-    // Note: The raw data has already been checked to contain the header
-    // and all the data that the header specifies as the file size.
-    DCHECK(raw_dex_file != nullptr);
-    DCHECK(ValidateDexFileHeader(raw_dex_file, oat_dex_file->GetLocation()));
-    const UnalignedDexFileHeader* header = AsUnalignedDexFileHeader(raw_dex_file);
-    // Since the source may have had its layout changed, don't verify the checksum.
-    dex_file = DexFile::Open(raw_dex_file,
-                             header->file_size_,
-                             location,
-                             oat_dex_file->dex_file_location_checksum_,
-                             nullptr,
-                             /* verify */ true,
-                             /* verify_checksum */ false,
-                             &error_msg);
   }
   if (dex_file == nullptr) {
     LOG(ERROR) << "Failed to open dex file for layout: " << error_msg;
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index db84166..5113714 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -225,10 +225,6 @@
     return oat_data_offset_;
   }
 
-  ArrayRef<const uintptr_t> GetAbsolutePatchLocations() const {
-    return ArrayRef<const uintptr_t>(absolute_patch_locations_);
-  }
-
   ~OatWriter();
 
   void AddMethodDebugInfos(const std::vector<debug::MethodDebugInfo>& infos) {
diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc
index 5d58207..cb6e14b 100644
--- a/compiler/optimizing/bounds_check_elimination_test.cc
+++ b/compiler/optimizing/bounds_check_elimination_test.cc
@@ -43,7 +43,7 @@
   void RunBCE() {
     graph_->BuildDominatorTree();
 
-    InstructionSimplifier(graph_).Run();
+    InstructionSimplifier(graph_, /* codegen */ nullptr).Run();
 
     SideEffectsAnalysis side_effects(graph_);
     side_effects.Run();
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 7b84ef8..e34f116 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -636,56 +636,25 @@
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM);
 };
 
-// Slow path marking an object reference `ref` during a read
-// barrier. The field `obj.field` in the object `obj` holding this
-// reference does not get updated by this slow path after marking (see
-// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that).
+// Abstract base class for read barrier slow paths marking a reference
+// `ref`.
 //
-// This means that after the execution of this slow path, `ref` will
-// always be up-to-date, but `obj.field` may not; i.e., after the
-// flip, `ref` will be a to-space reference, but `obj.field` will
-// probably still be a from-space reference (unless it gets updated by
-// another thread, or if another thread installed another object
-// reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is for the GcRoot read barrier.
-class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM {
- public:
-  ReadBarrierMarkSlowPathARM(HInstruction* instruction,
-                             Location ref,
-                             Location entrypoint = Location::NoLocation())
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class ReadBarrierMarkSlowPathBaseARM : public SlowPathCodeARM {
+ protected:
+  ReadBarrierMarkSlowPathBaseARM(HInstruction* instruction, Location ref, Location entrypoint)
       : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
-  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM"; }
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM"; }
 
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
+  // Generate assembly code calling the read barrier marking runtime
+  // entry point (ReadBarrierMarkRegX).
+  void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) {
     Register ref_reg = ref_.AsRegister<Register>();
-    DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg;
-    DCHECK(instruction_->IsInstanceFieldGet() ||
-           instruction_->IsStaticFieldGet() ||
-           instruction_->IsArrayGet() ||
-           instruction_->IsArraySet() ||
-           instruction_->IsLoadClass() ||
-           instruction_->IsLoadString() ||
-           instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
-           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking slow path: "
-        << instruction_->DebugName();
-    // The read barrier instrumentation of object ArrayGet
-    // instructions does not support the HIntermediateAddress
-    // instruction.
-    DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
 
-    __ Bind(GetEntryLabel());
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -715,116 +684,331 @@
       arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
       __ blx(entrypoint_.AsRegister<Register>());
     } else {
+      // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
           CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
       // This runtime call does not require a stack map.
       arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
+  }
+
+  // The location (register) of the marked object reference.
+  const Location ref_;
+
+  // The location of the entrypoint if it is already loaded.
+  const Location entrypoint_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM);
+};
+
+// Slow path marking an object reference `ref` during a read
+// barrier. The field `obj.field` in the object `obj` holding this
+// reference does not get updated by this slow path after marking.
+//
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// If `entrypoint` is a valid location it is assumed to already be
+// holding the entrypoint. The case where the entrypoint is passed in
+// is when the decision to mark is based on whether the GC is marking.
+class ReadBarrierMarkSlowPathARM : public ReadBarrierMarkSlowPathBaseARM {
+ public:
+  ReadBarrierMarkSlowPathARM(HInstruction* instruction,
+                             Location ref,
+                             Location entrypoint = Location::NoLocation())
+      : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint) {
+    DCHECK(kEmitCompilerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM"; }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    if (kIsDebugBuild) {
+      Register ref_reg = ref_.AsRegister<Register>();
+      DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg;
+    }
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString())
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+
+    __ Bind(GetEntryLabel());
+    GenerateReadBarrierMarkRuntimeCall(codegen);
     __ b(GetExitLabel());
   }
 
  private:
-  // The location (register) of the marked object reference.
-  const Location ref_;
-
-  // The location of the entrypoint if already loaded.
-  const Location entrypoint_;
-
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM);
 };
 
-// Slow path marking an object reference `ref` during a read barrier,
-// and if needed, atomically updating the field `obj.field` in the
-// object `obj` holding this reference after marking (contrary to
-// ReadBarrierMarkSlowPathARM above, which never tries to update
-// `obj.field`).
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). The field `obj.field` in the object `obj` holding
+// this reference does not get updated by this slow path after marking
+// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM
+// below for that).
 //
-// This means that after the execution of this slow path, both `ref`
-// and `obj.field` will be up-to-date; i.e., after the flip, both will
-// hold the same to-space reference (unless another thread installed
-// another object reference (different from `ref`) in `obj.field`).
-class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM {
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierSlowPathARM : public ReadBarrierMarkSlowPathBaseARM {
  public:
-  ReadBarrierMarkAndUpdateFieldSlowPathARM(HInstruction* instruction,
-                                           Location ref,
-                                           Register obj,
-                                           Location field_offset,
-                                           Register temp1,
-                                           Register temp2)
-      : SlowPathCodeARM(instruction),
-        ref_(ref),
+  LoadReferenceWithBakerReadBarrierSlowPathARM(HInstruction* instruction,
+                                               Location ref,
+                                               Register obj,
+                                               uint32_t offset,
+                                               Location index,
+                                               ScaleFactor scale_factor,
+                                               bool needs_null_check,
+                                               Register temp,
+                                               Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint),
         obj_(obj),
-        field_offset_(field_offset),
-        temp1_(temp1),
-        temp2_(temp2) {
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        temp_(temp) {
     DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
   }
 
-  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkAndUpdateFieldSlowPathARM"; }
+  const char* GetDescription() const OVERRIDE {
+    return "LoadReferenceWithBakerReadBarrierSlowPathARM";
+  }
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
     Register ref_reg = ref_.AsRegister<Register>();
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg;
-    // This slow path is only used by the UnsafeCASObject intrinsic.
+    DCHECK_NE(ref_reg, temp_);
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsArraySet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+    // The read barrier instrumentation of object ArrayGet
+    // instructions does not support the HIntermediateAddress
+    // instruction.
+    DCHECK(!(instruction_->IsArrayGet() &&
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
+
+    __ Bind(GetEntryLabel());
+
+    // When using MaybeGenerateReadBarrierSlow, the read barrier call is
+    // inserted after the original load. However, in fast path based
+    // Baker's read barriers, we need to perform the load of
+    // mirror::Object::monitor_ *before* the original reference load.
+    // This load-load ordering is required by the read barrier.
+    // The fast path/slow path (for Baker's algorithm) should look like:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //   }
+    //
+    // Note: the original implementation in ReadBarrier::Barrier is
+    // slightly more complex as it performs additional checks that we do
+    // not do here for performance reasons.
+
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    __ LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset);
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word including the rb_state,
+    // which shall prevent load-load reordering without using
+    // a memory barrier (which would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp`.
+    __ add(obj_, obj_, ShifterOperand(temp_, LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    arm_codegen->GenerateRawReferenceLoad(
+        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    // if (rb_state == ReadBarrier::GrayState())
+    //   ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with LSRS
+    // which can be a 16-bit instruction unlike the TST immediate.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1);
+    __ b(GetExitLabel(), CC);  // Carry flag is the last bit shifted out by LSRS.
+    GenerateReadBarrierMarkRuntimeCall(codegen);
+
+    __ b(GetExitLabel());
+  }
+
+ private:
+  // The register containing the object holding the marked object reference field.
+  Register obj_;
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  ScaleFactor scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // A temporary register used to hold the lock word of `obj_`.
+  Register temp_;
+
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM);
+};
+
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). If needed, this slow path also atomically updates
+// the field `obj.field` in the object `obj` holding this reference
+// after marking (contrary to
+// LoadReferenceWithBakerReadBarrierSlowPathARM above, which never
+// tries to update `obj.field`).
+//
+// This means that after the execution of this slow path, both `ref`
+// and `obj.field` will be up-to-date; i.e., after the flip, both will
+// hold the same to-space reference (unless another thread installed
+// another object reference (different from `ref`) in `obj.field`).
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM
+    : public ReadBarrierMarkSlowPathBaseARM {
+ public:
+  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(HInstruction* instruction,
+                                                             Location ref,
+                                                             Register obj,
+                                                             uint32_t offset,
+                                                             Location index,
+                                                             ScaleFactor scale_factor,
+                                                             bool needs_null_check,
+                                                             Register temp1,
+                                                             Register temp2,
+                                                             Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint),
+        obj_(obj),
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        temp1_(temp1),
+        temp2_(temp2) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM";
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    Register ref_reg = ref_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg;
+    DCHECK_NE(ref_reg, temp1_);
+
+    // This slow path is only used by the UnsafeCASObject intrinsic at the moment.
     DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking and field updating slow path: "
         << instruction_->DebugName();
     DCHECK(instruction_->GetLocations()->Intrinsified());
     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject);
-    DCHECK(field_offset_.IsRegisterPair()) << field_offset_;
+    DCHECK_EQ(offset_, 0u);
+    DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1);
+    // The location of the offset of the marked reference field within `obj_`.
+    Location field_offset = index_;
+    DCHECK(field_offset.IsRegisterPair()) << field_offset;
 
     __ Bind(GetEntryLabel());
 
-    // Save the old reference.
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    __ LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset);
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word including the rb_state,
+    // which shall prevent load-load reordering without using
+    // a memory barrier (which would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp1`.
+    __ add(obj_, obj_, ShifterOperand(temp1_, LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    arm_codegen->GenerateRawReferenceLoad(
+        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    // if (rb_state == ReadBarrier::GrayState())
+    //   ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with LSRS
+    // which can be a 16-bit instruction unlike the TST immediate.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1);
+    __ b(GetExitLabel(), CC);  // Carry flag is the last bit shifted out by LSRS.
+
+    // Save the old value of the reference before marking it.
     // Note that we cannot use IP to save the old reference, as IP is
     // used internally by the ReadBarrierMarkRegX entry point, and we
     // need the old reference after the call to that entry point.
     DCHECK_NE(temp1_, IP);
     __ Mov(temp1_, ref_reg);
 
-    // No need to save live registers; it's taken care of by the
-    // entrypoint. Also, there is no need to update the stack mask,
-    // as this runtime call will not trigger a garbage collection.
-    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
-    DCHECK_NE(ref_reg, SP);
-    DCHECK_NE(ref_reg, LR);
-    DCHECK_NE(ref_reg, PC);
-    // IP is used internally by the ReadBarrierMarkRegX entry point
-    // as a temporary, it cannot be the entry point's input/output.
-    DCHECK_NE(ref_reg, IP);
-    DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg;
-    // "Compact" slow path, saving two moves.
-    //
-    // Instead of using the standard runtime calling convention (input
-    // and output in R0):
-    //
-    //   R0 <- ref
-    //   R0 <- ReadBarrierMark(R0)
-    //   ref <- R0
-    //
-    // we just use rX (the register containing `ref`) as input and output
-    // of a dedicated entrypoint:
-    //
-    //   rX <- ReadBarrierMarkRegX(rX)
-    //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
-    // This runtime call does not require a stack map.
-    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    GenerateReadBarrierMarkRuntimeCall(codegen);
 
     // If the new reference is different from the old reference,
-    // update the field in the holder (`*(obj_ + field_offset_)`).
+    // update the field in the holder (`*(obj_ + field_offset)`).
     //
     // Note that this field could also hold a different object, if
     // another thread had concurrently changed it. In that case, the
     // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set
     // (CAS) operation below would abort the CAS, leaving the field
     // as-is.
-    Label done;
     __ cmp(temp1_, ShifterOperand(ref_reg));
-    __ b(&done, EQ);
+    __ b(GetExitLabel(), EQ);
 
     // Update the holder's field atomically.  This may fail if
     // mutator updates before us, but it's OK.  This is achieved
@@ -837,7 +1021,7 @@
     // The UnsafeCASObject intrinsic uses a register pair as field
     // offset ("long offset"), of which only the low part contains
     // data.
-    Register offset = field_offset_.AsRegisterPairLow<Register>();
+    Register offset = field_offset.AsRegisterPairLow<Register>();
     Register expected = temp1_;
     Register value = ref_reg;
     Register tmp_ptr = IP;       // Pointer to actual memory.
@@ -887,22 +1071,27 @@
       }
     }
 
-    __ Bind(&done);
     __ b(GetExitLabel());
   }
 
  private:
-  // The location (register) of the marked object reference.
-  const Location ref_;
   // The register containing the object holding the marked object reference field.
   const Register obj_;
-  // The location of the offset of the marked reference field within `obj_`.
-  Location field_offset_;
-
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  ScaleFactor scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // A temporary register used to hold the lock word of `obj_`; and
+  // also to hold the original reference value, when the reference is
+  // marked.
   const Register temp1_;
+  // A temporary register used in the implementation of the CAS, to
+  // update the object's reference field.
   const Register temp2_;
 
-  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM);
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM);
 };
 
 // Slow path generating a read barrier for a heap reference.
@@ -1375,10 +1564,332 @@
   }
 }
 
+static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARM* codegen) {
+  Primitive::Type type = instruction->InputAt(0)->GetType();
+  Location lhs_loc = instruction->GetLocations()->InAt(0);
+  Location rhs_loc = instruction->GetLocations()->InAt(1);
+  if (rhs_loc.IsConstant()) {
+    // 0.0 is the only immediate that can be encoded directly in
+    // a VCMP instruction.
+    //
+    // Both the JLS (section 15.20.1) and the JVMS (section 6.5)
+    // specify that in a floating-point comparison, positive zero
+    // and negative zero are considered equal, so we can use the
+    // literal 0.0 for both cases here.
+    //
+    // Note however that some methods (Float.equals, Float.compare,
+    //     Float.compareTo, Double.equals, Double.compare,
+    // Double.compareTo, Math.max, Math.min, StrictMath.max,
+    // StrictMath.min) consider 0.0 to be (strictly) greater than
+    // -0.0. So if we ever translate calls to these methods into a
+    // HCompare instruction, we must handle the -0.0 case with
+    // care here.
+    DCHECK(rhs_loc.GetConstant()->IsArithmeticZero());
+    if (type == Primitive::kPrimFloat) {
+      __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>());
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()));
+    }
+  } else {
+    if (type == Primitive::kPrimFloat) {
+      __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>());
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>()));
+    }
+  }
+}
+
+static Condition GenerateLongTestConstant(HCondition* condition,
+                                          bool invert,
+                                          CodeGeneratorARM* codegen) {
+  DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = condition->GetLocations();
+  IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+  Condition ret = EQ;
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+
+  DCHECK(right.IsConstant());
+
+  const Register left_high = left.AsRegisterPairHigh<Register>();
+  const Register left_low = left.AsRegisterPairLow<Register>();
+  int64_t value = right.GetConstant()->AsLongConstant()->GetValue();
+
+  switch (cond) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondB:
+    case kCondBE:
+    case kCondA:
+    case kCondAE:
+      __ CmpConstant(left_high, High32Bits(value));
+      __ it(EQ);
+      __ cmp(left_low, ShifterOperand(Low32Bits(value)), EQ);
+      ret = ARMUnsignedCondition(cond);
+      break;
+    case kCondLE:
+    case kCondGT:
+      // Trivially true or false.
+      if (value == std::numeric_limits<int64_t>::max()) {
+        __ cmp(left_low, ShifterOperand(left_low));
+        ret = cond == kCondLE ? EQ : NE;
+        break;
+      }
+
+      if (cond == kCondLE) {
+        cond = kCondLT;
+      } else {
+        DCHECK_EQ(cond, kCondGT);
+        cond = kCondGE;
+      }
+
+      value++;
+      FALLTHROUGH_INTENDED;
+    case kCondGE:
+    case kCondLT:
+      __ CmpConstant(left_low, Low32Bits(value));
+      __ sbcs(IP, left_high, ShifterOperand(High32Bits(value)));
+      ret = ARMCondition(cond);
+      break;
+    default:
+      LOG(FATAL) << "Unreachable";
+      UNREACHABLE();
+  }
+
+  return ret;
+}
+
+static Condition GenerateLongTest(HCondition* condition,
+                                  bool invert,
+                                  CodeGeneratorARM* codegen) {
+  DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = condition->GetLocations();
+  IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+  Condition ret = EQ;
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+
+  DCHECK(right.IsRegisterPair());
+
+  switch (cond) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondB:
+    case kCondBE:
+    case kCondA:
+    case kCondAE:
+      __ cmp(left.AsRegisterPairHigh<Register>(),
+             ShifterOperand(right.AsRegisterPairHigh<Register>()));
+      __ it(EQ);
+      __ cmp(left.AsRegisterPairLow<Register>(),
+             ShifterOperand(right.AsRegisterPairLow<Register>()),
+             EQ);
+      ret = ARMUnsignedCondition(cond);
+      break;
+    case kCondLE:
+    case kCondGT:
+      if (cond == kCondLE) {
+        cond = kCondGE;
+      } else {
+        DCHECK_EQ(cond, kCondGT);
+        cond = kCondLT;
+      }
+
+      std::swap(left, right);
+      FALLTHROUGH_INTENDED;
+    case kCondGE:
+    case kCondLT:
+      __ cmp(left.AsRegisterPairLow<Register>(),
+             ShifterOperand(right.AsRegisterPairLow<Register>()));
+      __ sbcs(IP,
+              left.AsRegisterPairHigh<Register>(),
+              ShifterOperand(right.AsRegisterPairHigh<Register>()));
+      ret = ARMCondition(cond);
+      break;
+    default:
+      LOG(FATAL) << "Unreachable";
+      UNREACHABLE();
+  }
+
+  return ret;
+}
+
+static Condition GenerateTest(HInstruction* instruction,
+                              Location loc,
+                              bool invert,
+                              CodeGeneratorARM* codegen) {
+  DCHECK(!instruction->IsConstant());
+
+  Condition ret = invert ? EQ : NE;
+
+  if (IsBooleanValueOrMaterializedCondition(instruction)) {
+    __ CmpConstant(loc.AsRegister<Register>(), 0);
+  } else {
+    HCondition* const condition = instruction->AsCondition();
+    const LocationSummary* const locations = condition->GetLocations();
+    const Primitive::Type type = condition->GetLeft()->GetType();
+    const IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+    const Location right = locations->InAt(1);
+
+    if (type == Primitive::kPrimLong) {
+      ret = condition->GetLocations()->InAt(1).IsConstant()
+          ? GenerateLongTestConstant(condition, invert, codegen)
+          : GenerateLongTest(condition, invert, codegen);
+    } else if (Primitive::IsFloatingPointType(type)) {
+      GenerateVcmp(condition, codegen);
+      __ vmstat();
+      ret = ARMFPCondition(cond, condition->IsGtBias());
+    } else {
+      DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+
+      const Register left = locations->InAt(0).AsRegister<Register>();
+
+      if (right.IsRegister()) {
+        __ cmp(left, ShifterOperand(right.AsRegister<Register>()));
+      } else {
+        DCHECK(right.IsConstant());
+        __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
+      }
+
+      ret = ARMCondition(cond);
+    }
+  }
+
+  return ret;
+}
+
+static bool CanGenerateTest(HInstruction* condition, ArmAssembler* assembler) {
+  if (!IsBooleanValueOrMaterializedCondition(condition)) {
+    const HCondition* const cond = condition->AsCondition();
+
+    if (cond->GetLeft()->GetType() == Primitive::kPrimLong) {
+      const LocationSummary* const locations = cond->GetLocations();
+      const IfCondition c = cond->GetCondition();
+
+      if (locations->InAt(1).IsConstant()) {
+        const int64_t value = locations->InAt(1).GetConstant()->AsLongConstant()->GetValue();
+        ShifterOperand so;
+
+        if (c < kCondLT || c > kCondGE) {
+          // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
+          // we check that the least significant half of the first input to be compared
+          // is in a low register (the other half is read outside an IT block), and
+          // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP
+          // encoding can be used.
+          if (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) ||
+              !IsUint<8>(Low32Bits(value))) {
+            return false;
+          }
+        } else if (c == kCondLE || c == kCondGT) {
+          if (value < std::numeric_limits<int64_t>::max() &&
+              !assembler->ShifterOperandCanHold(kNoRegister,
+                                                kNoRegister,
+                                                SBC,
+                                                High32Bits(value + 1),
+                                                kCcSet,
+                                                &so)) {
+            return false;
+          }
+        } else if (!assembler->ShifterOperandCanHold(kNoRegister,
+                                                     kNoRegister,
+                                                     SBC,
+                                                     High32Bits(value),
+                                                     kCcSet,
+                                                     &so)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) {
+  const Primitive::Type type = constant->GetType();
+  bool ret = false;
+
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+
+  if (type == Primitive::kPrimLong) {
+    const uint64_t value = constant->AsLongConstant()->GetValueAsUint64();
+
+    ret = IsUint<8>(Low32Bits(value)) && IsUint<8>(High32Bits(value));
+  } else {
+    ret = IsUint<8>(CodeGenerator::GetInt32ValueOf(constant));
+  }
+
+  return ret;
+}
+
+static Location Arm8BitEncodableConstantOrRegister(HInstruction* constant) {
+  DCHECK(!Primitive::IsFloatingPointType(constant->GetType()));
+
+  if (constant->IsConstant() && CanEncodeConstantAs8BitImmediate(constant->AsConstant())) {
+    return Location::ConstantLocation(constant->AsConstant());
+  }
+
+  return Location::RequiresRegister();
+}
+
+static bool CanGenerateConditionalMove(const Location& out, const Location& src) {
+  // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
+  // we check that we are not dealing with floating-point output (there is no
+  // 16-bit VMOV encoding).
+  if (!out.IsRegister() && !out.IsRegisterPair()) {
+    return false;
+  }
+
+  // For constants, we also check that the output is in one or two low
+  // registers, and that the constant (both halves of it, for a long) fits in
+  // an 8-bit unsigned integer, so that a 16-bit MOV encoding can be used.
+  if (src.IsConstant()) {
+    if (!CanEncodeConstantAs8BitImmediate(src.GetConstant())) {
+      return false;
+    }
+
+    if (out.IsRegister()) {
+      if (!ArmAssembler::IsLowRegister(out.AsRegister<Register>())) {
+        return false;
+      }
+    } else {
+      DCHECK(out.IsRegisterPair());
+
+      if (!ArmAssembler::IsLowRegister(out.AsRegisterPairHigh<Register>())) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
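
For reference, the 8-bit-immediate requirement above can be sketched in isolation: a long constant qualifies only when each 32-bit half fits an unsigned 8-bit value, so that a 16-bit Thumb MOV can materialize it inside an IT block. The helper names below are illustrative, not the ART ones:

#include <cassert>
#include <cstdint>

// Both halves must be expressible as an 8-bit unsigned immediate so that a
// 16-bit MOV can load each half of the register pair inside an IT block.
static bool FitsUint8(uint32_t v) { return v <= 0xffu; }

static bool LongFitsTwo8BitImmediates(uint64_t value) {
  const uint32_t low = static_cast<uint32_t>(value);
  const uint32_t high = static_cast<uint32_t>(value >> 32);
  return FitsUint8(low) && FitsUint8(high);
}

int main() {
  assert(LongFitsTwo8BitImmediates(0));
  assert(LongFitsTwo8BitImmediates((uint64_t{0x7f} << 32) | 0xff));
  assert(!LongFitsTwo8BitImmediates(0x100));              // low half too wide
  assert(!LongFitsTwo8BitImmediates(uint64_t{1} << 40));  // high half too wide
  return 0;
}
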
+
 #undef __
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<ArmAssembler*>(GetAssembler())->  // NOLINT
 
+Label* CodeGeneratorARM::GetFinalLabel(HInstruction* instruction, Label* final_label) {
+  DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck());
+
+  const HBasicBlock* const block = instruction->GetBlock();
+  const HLoopInformation* const info = block->GetLoopInformation();
+  HInstruction* const next = instruction->GetNext();
+
+  // Avoid a branch to a branch.
+  if (next->IsGoto() && (info == nullptr ||
+                         !info->IsBackEdge(*block) ||
+                         !info->HasSuspendCheck())) {
+    final_label = GetLabelOf(next->AsGoto()->GetSuccessor());
+  }
+
+  return final_label;
+}
+
 void CodeGeneratorARM::DumpCoreRegister(std::ostream& stream, int reg) const {
   stream << Register(reg);
 }
@@ -1437,8 +1948,6 @@
                                graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -1905,44 +2414,6 @@
 void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) {
 }
 
-void InstructionCodeGeneratorARM::GenerateVcmp(HInstruction* instruction) {
-  Primitive::Type type = instruction->InputAt(0)->GetType();
-  Location lhs_loc = instruction->GetLocations()->InAt(0);
-  Location rhs_loc = instruction->GetLocations()->InAt(1);
-  if (rhs_loc.IsConstant()) {
-    // 0.0 is the only immediate that can be encoded directly in
-    // a VCMP instruction.
-    //
-    // Both the JLS (section 15.20.1) and the JVMS (section 6.5)
-    // specify that in a floating-point comparison, positive zero
-    // and negative zero are considered equal, so we can use the
-    // literal 0.0 for both cases here.
-    //
-    // Note however that some methods (Float.equal, Float.compare,
-    // Float.compareTo, Double.equal, Double.compare,
-    // Double.compareTo, Math.max, Math.min, StrictMath.max,
-    // StrictMath.min) consider 0.0 to be (strictly) greater than
-    // -0.0. So if we ever translate calls to these methods into a
-    // HCompare instruction, we must handle the -0.0 case with
-    // care here.
-    DCHECK(rhs_loc.GetConstant()->IsArithmeticZero());
-    if (type == Primitive::kPrimFloat) {
-      __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>());
-    } else {
-      DCHECK_EQ(type, Primitive::kPrimDouble);
-      __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()));
-    }
-  } else {
-    if (type == Primitive::kPrimFloat) {
-      __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>());
-    } else {
-      DCHECK_EQ(type, Primitive::kPrimDouble);
-      __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()),
-               FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>()));
-    }
-  }
-}
-
 void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond,
                                                   Label* true_label,
                                                   Label* false_label ATTRIBUTE_UNUSED) {
@@ -2050,7 +2521,7 @@
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      GenerateVcmp(condition);
+      GenerateVcmp(condition, codegen_);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     default:
@@ -2120,20 +2591,38 @@
       return;
     }
 
+    Label* non_fallthrough_target;
+    Condition arm_cond;
     LocationSummary* locations = cond->GetLocations();
     DCHECK(locations->InAt(0).IsRegister());
     Register left = locations->InAt(0).AsRegister<Register>();
     Location right = locations->InAt(1);
-    if (right.IsRegister()) {
-      __ cmp(left, ShifterOperand(right.AsRegister<Register>()));
-    } else {
-      DCHECK(right.IsConstant());
-      __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
-    }
+
     if (true_target == nullptr) {
-      __ b(false_target, ARMCondition(condition->GetOppositeCondition()));
+      arm_cond = ARMCondition(condition->GetOppositeCondition());
+      non_fallthrough_target = false_target;
     } else {
-      __ b(true_target, ARMCondition(condition->GetCondition()));
+      arm_cond = ARMCondition(condition->GetCondition());
+      non_fallthrough_target = true_target;
+    }
+
+    if (right.IsConstant() && (arm_cond == NE || arm_cond == EQ) &&
+        CodeGenerator::GetInt32ValueOf(right.GetConstant()) == 0) {
+      if (arm_cond == EQ) {
+        __ CompareAndBranchIfZero(left, non_fallthrough_target);
+      } else {
+        DCHECK_EQ(arm_cond, NE);
+        __ CompareAndBranchIfNonZero(left, non_fallthrough_target);
+      }
+    } else {
+      if (right.IsRegister()) {
+        __ cmp(left, ShifterOperand(right.AsRegister<Register>()));
+      } else {
+        DCHECK(right.IsConstant());
+        __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
+      }
+
+      __ b(non_fallthrough_target, arm_cond);
     }
   }
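
The hunk above adds a small peephole: when the right-hand side is the constant 0 and the (possibly negated) condition is plain equality or inequality, the CMP/conditional-branch pair is replaced with the assembler's compare-and-branch helpers. A rough sketch of that selection, under the assumption that only EQ/NE against zero qualify, as in the code above:

#include <cassert>
#include <string>

enum class Cond { kEq, kNe, kOther };

// Which branch form would be emitted for `left <cond> right`: a 16-bit
// compare-and-branch when testing (in)equality against the constant 0,
// otherwise the generic CMP followed by a conditional branch.
static std::string PickBranchForm(bool rhs_is_constant, int rhs_value, Cond cond) {
  if (rhs_is_constant && rhs_value == 0) {
    if (cond == Cond::kEq) return "CompareAndBranchIfZero";
    if (cond == Cond::kNe) return "CompareAndBranchIfNonZero";
  }
  return "cmp + b.<cond>";
}

int main() {
  assert(PickBranchForm(true, 0, Cond::kEq) == "CompareAndBranchIfZero");
  assert(PickBranchForm(true, 0, Cond::kNe) == "CompareAndBranchIfNonZero");
  assert(PickBranchForm(true, 7, Cond::kEq) == "cmp + b.<cond>");
  assert(PickBranchForm(false, 0, Cond::kOther) == "cmp + b.<cond>");
  return 0;
}
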
 
@@ -2193,28 +2682,140 @@
 
 void LocationsBuilderARM::VisitSelect(HSelect* select) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(select);
-  if (Primitive::IsFloatingPointType(select->GetType())) {
+  const bool is_floating_point = Primitive::IsFloatingPointType(select->GetType());
+
+  if (is_floating_point) {
     locations->SetInAt(0, Location::RequiresFpuRegister());
-    locations->SetInAt(1, Location::RequiresFpuRegister());
+    locations->SetInAt(1, Location::FpuRegisterOrConstant(select->GetTrueValue()));
   } else {
     locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
+    locations->SetInAt(1, Arm8BitEncodableConstantOrRegister(select->GetTrueValue()));
   }
+
   if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) {
-    locations->SetInAt(2, Location::RequiresRegister());
+    locations->SetInAt(2, Location::RegisterOrConstant(select->GetCondition()));
+    // The code generator handles overlap with the values, but not with the condition.
+    locations->SetOut(Location::SameAsFirstInput());
+  } else if (is_floating_point) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    if (!locations->InAt(1).IsConstant()) {
+      locations->SetInAt(0, Arm8BitEncodableConstantOrRegister(select->GetFalseValue()));
+    }
+
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
   }
-  locations->SetOut(Location::SameAsFirstInput());
 }
 
 void InstructionCodeGeneratorARM::VisitSelect(HSelect* select) {
-  LocationSummary* locations = select->GetLocations();
-  Label false_target;
-  GenerateTestAndBranch(select,
-                        /* condition_input_index */ 2,
-                        /* true_target */ nullptr,
-                        &false_target);
-  codegen_->MoveLocation(locations->Out(), locations->InAt(1), select->GetType());
-  __ Bind(&false_target);
+  HInstruction* const condition = select->GetCondition();
+  const LocationSummary* const locations = select->GetLocations();
+  const Primitive::Type type = select->GetType();
+  const Location first = locations->InAt(0);
+  const Location out = locations->Out();
+  const Location second = locations->InAt(1);
+  Location src;
+
+  if (condition->IsIntConstant()) {
+    if (condition->AsIntConstant()->IsFalse()) {
+      src = first;
+    } else {
+      src = second;
+    }
+
+    codegen_->MoveLocation(out, src, type);
+    return;
+  }
+
+  if (!Primitive::IsFloatingPointType(type) &&
+      CanGenerateTest(condition, codegen_->GetAssembler())) {
+    bool invert = false;
+
+    if (out.Equals(second)) {
+      src = first;
+      invert = true;
+    } else if (out.Equals(first)) {
+      src = second;
+    } else if (second.IsConstant()) {
+      DCHECK(CanEncodeConstantAs8BitImmediate(second.GetConstant()));
+      src = second;
+    } else if (first.IsConstant()) {
+      DCHECK(CanEncodeConstantAs8BitImmediate(first.GetConstant()));
+      src = first;
+      invert = true;
+    } else {
+      src = second;
+    }
+
+    if (CanGenerateConditionalMove(out, src)) {
+      if (!out.Equals(first) && !out.Equals(second)) {
+        codegen_->MoveLocation(out, src.Equals(first) ? second : first, type);
+      }
+
+      const Condition cond = GenerateTest(condition, locations->InAt(2), invert, codegen_);
+
+      if (out.IsRegister()) {
+        ShifterOperand operand;
+
+        if (src.IsConstant()) {
+          operand = ShifterOperand(CodeGenerator::GetInt32ValueOf(src.GetConstant()));
+        } else {
+          DCHECK(src.IsRegister());
+          operand = ShifterOperand(src.AsRegister<Register>());
+        }
+
+        __ it(cond);
+        __ mov(out.AsRegister<Register>(), operand, cond);
+      } else {
+        DCHECK(out.IsRegisterPair());
+
+        ShifterOperand operand_high;
+        ShifterOperand operand_low;
+
+        if (src.IsConstant()) {
+          const int64_t value = src.GetConstant()->AsLongConstant()->GetValue();
+
+          operand_high = ShifterOperand(High32Bits(value));
+          operand_low = ShifterOperand(Low32Bits(value));
+        } else {
+          DCHECK(src.IsRegisterPair());
+          operand_high = ShifterOperand(src.AsRegisterPairHigh<Register>());
+          operand_low = ShifterOperand(src.AsRegisterPairLow<Register>());
+        }
+
+        __ it(cond);
+        __ mov(out.AsRegisterPairLow<Register>(), operand_low, cond);
+        __ it(cond);
+        __ mov(out.AsRegisterPairHigh<Register>(), operand_high, cond);
+      }
+
+      return;
+    }
+  }
+
+  Label* false_target = nullptr;
+  Label* true_target = nullptr;
+  Label select_end;
+  Label* target = codegen_->GetFinalLabel(select, &select_end);
+
+  if (out.Equals(second)) {
+    true_target = target;
+    src = first;
+  } else {
+    false_target = target;
+    src = second;
+
+    if (!out.Equals(first)) {
+      codegen_->MoveLocation(out, first, type);
+    }
+  }
+
+  GenerateTestAndBranch(select, 2, true_target, false_target);
+  codegen_->MoveLocation(out, src, type);
+
+  if (select_end.IsLinked()) {
+    __ Bind(&select_end);
+  }
 }
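
VisitSelect above tries to stay within the conditional-move fast path: whichever input already occupies the output register stays put, the other input becomes the conditional-move source, and when the output happens to hold the "true" value the condition is inverted instead of emitting an extra move. A toy model of that choice for the register-only case (illustrative types, not ART's):

#include <cassert>

struct SelectPlan {
  bool move_first;   // true: conditionally move the "false" (first) input
  bool invert_cond;  // true: the condition has to be inverted
};

static SelectPlan PlanSelect(bool out_equals_first, bool out_equals_second) {
  if (out_equals_second) {
    // The output already holds the "true" value: conditionally overwrite it
    // with the "false" (first) value, which requires inverting the condition.
    return {true, true};
  }
  // Otherwise the output holds (or is first given) the "false" value, and the
  // conditional move copies the "true" (second) value when the condition holds.
  (void)out_equals_first;  // out == first needs no extra move beforehand
  return {false, false};
}

int main() {
  const SelectPlan a = PlanSelect(/*out_equals_first=*/false, /*out_equals_second=*/true);
  assert(a.move_first && a.invert_cond);
  const SelectPlan b = PlanSelect(/*out_equals_first=*/true, /*out_equals_second=*/false);
  assert(!b.move_first && !b.invert_cond);
  return 0;
}
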
 
 void LocationsBuilderARM::VisitNativeDebugInfo(HNativeDebugInfo* info) {
@@ -2293,7 +2894,7 @@
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      GenerateVcmp(cond);
+      GenerateVcmp(cond, codegen_);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
   }
@@ -4347,7 +4948,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       __ LoadImmediate(out, 0);
-      GenerateVcmp(compare);
+      GenerateVcmp(compare, codegen_);
       __ vmstat();  // transfer FP status register to ARM APSR.
       less_cond = ARMFPCondition(kCondLT, compare->IsGtBias());
       break;
@@ -4703,17 +5304,29 @@
     return true;
   }
   Opcode neg_opcode = kNoOperand;
+  uint32_t neg_value = 0;
   switch (opcode) {
-    case AND: neg_opcode = BIC; value = ~value; break;
-    case ORR: neg_opcode = ORN; value = ~value; break;
-    case ADD: neg_opcode = SUB; value = -value; break;
-    case ADC: neg_opcode = SBC; value = ~value; break;
-    case SUB: neg_opcode = ADD; value = -value; break;
-    case SBC: neg_opcode = ADC; value = ~value; break;
+    case AND: neg_opcode = BIC; neg_value = ~value; break;
+    case ORR: neg_opcode = ORN; neg_value = ~value; break;
+    case ADD: neg_opcode = SUB; neg_value = -value; break;
+    case ADC: neg_opcode = SBC; neg_value = ~value; break;
+    case SUB: neg_opcode = ADD; neg_value = -value; break;
+    case SBC: neg_opcode = ADC; neg_value = ~value; break;
+    case MOV: neg_opcode = MVN; neg_value = ~value; break;
     default:
       return false;
   }
-  return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, value, set_cc, &so);
+
+  if (assembler->ShifterOperandCanHold(kNoRegister,
+                                       kNoRegister,
+                                       neg_opcode,
+                                       neg_value,
+                                       set_cc,
+                                       &so)) {
+    return true;
+  }
+
+  return opcode == AND && IsPowerOfTwo(value + 1);
 }
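
The fallback above exploits instruction dualities: an immediate that the original opcode cannot encode may still be encodable after negating the opcode and complementing or negating the value, and an AND whose immediate has the form 2^n - 1 is accepted unconditionally because it can later be emitted as a UBFX of the low n bits. The underlying identities, checked in plain C++ (the actual encodability question is hardware-specific and not modeled here):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t a = 0xCAFEBABEu;
  const uint32_t imm = 0x00FFFF00u;

  // AND <-> BIC duality: a & imm equals "bit clear" of the complemented
  // immediate, a & ~(~imm).
  assert((a & imm) == (a & ~(~imm)));

  // ADD <-> SUB duality (modulo 2^32): a + imm equals a - (-imm).
  assert(static_cast<uint32_t>(a + imm) == static_cast<uint32_t>(a - (0u - imm)));

  // ORR <-> ORN duality: a | imm equals a | ~(~imm).
  assert((a | imm) == (a | ~(~imm)));

  // AND with a mask of the form 2^n - 1 extracts the low n bits, which is
  // exactly what UBFX(out, in, 0, n) computes.
  const uint32_t mask = (1u << 12) - 1u;  // mask + 1 is a power of two
  assert((a & mask) == (a % (1u << 12)));
  return 0;
}
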
 
 void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
@@ -5615,21 +6228,59 @@
   caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   LocationSummary* locations = codegen_->CreateThrowingSlowPathLocations(instruction, caller_saves);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+
+  HInstruction* index = instruction->InputAt(0);
+  HInstruction* length = instruction->InputAt(1);
+  // If both index and length are constants, we can check the bounds statically.
+  // But if at least one of them is not encodable, ArmEncodableConstantOrRegister
+  // would create a Location::RequiresRegister(), which we want to avoid. Instead,
+  // we create constant locations for both inputs.
+  bool both_const = index->IsConstant() && length->IsConstant();
+  locations->SetInAt(0, both_const
+      ? Location::ConstantLocation(index->AsConstant())
+      : ArmEncodableConstantOrRegister(index, CMP));
+  locations->SetInAt(1, both_const
+      ? Location::ConstantLocation(length->AsConstant())
+      : ArmEncodableConstantOrRegister(length, CMP));
 }
 
 void InstructionCodeGeneratorARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  SlowPathCodeARM* slow_path =
-      new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction);
-  codegen_->AddSlowPath(slow_path);
+  Location index_loc = locations->InAt(0);
+  Location length_loc = locations->InAt(1);
 
-  Register index = locations->InAt(0).AsRegister<Register>();
-  Register length = locations->InAt(1).AsRegister<Register>();
+  if (length_loc.IsConstant()) {
+    int32_t length = helpers::Int32ConstantFrom(length_loc);
+    if (index_loc.IsConstant()) {
+      // BCE will remove the bounds check if we are guaranteed to pass.
+      int32_t index = helpers::Int32ConstantFrom(index_loc);
+      if (index < 0 || index >= length) {
+        SlowPathCodeARM* slow_path =
+            new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction);
+        codegen_->AddSlowPath(slow_path);
+        __ b(slow_path->GetEntryLabel());
+      } else {
+        // Some optimization after BCE may have generated this, and we should
+        // not generate a bounds check when the index is known to be in range.
+      }
+      return;
+    }
 
-  __ cmp(index, ShifterOperand(length));
-  __ b(slow_path->GetEntryLabel(), HS);
+    SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction);
+    __ cmp(index_loc.AsRegister<Register>(), ShifterOperand(length));
+    codegen_->AddSlowPath(slow_path);
+    __ b(slow_path->GetEntryLabel(), HS);
+  } else {
+    SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction);
+    if (index_loc.IsConstant()) {
+      int32_t index = helpers::Int32ConstantFrom(index_loc);
+      __ cmp(length_loc.AsRegister<Register>(), ShifterOperand(index));
+    } else {
+      __ cmp(length_loc.AsRegister<Register>(), ShifterOperand(index_loc.AsRegister<Register>()));
+    }
+    codegen_->AddSlowPath(slow_path);
+    __ b(slow_path->GetEntryLabel(), LS);
+  }
 }
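
In VisitBoundsCheck above, a constant/constant pair is resolved at compile time: an out-of-range index becomes an unconditional branch to the slow path, an in-range index emits nothing, and when only one operand is constant the compare is simply flipped (branching on LS instead of HS). A minimal sketch of the constant-constant decision, with illustrative names:

#include <cassert>
#include <cstdint>

enum class BoundsCheckCode {
  kAlwaysThrow,  // emit an unconditional branch to the slow path
  kElided,       // emit nothing; the access is statically in range
};

static BoundsCheckCode StaticBoundsCheck(int32_t index, int32_t length) {
  return (index < 0 || index >= length) ? BoundsCheckCode::kAlwaysThrow
                                        : BoundsCheckCode::kElided;
}

int main() {
  assert(StaticBoundsCheck(0, 4) == BoundsCheckCode::kElided);
  assert(StaticBoundsCheck(3, 4) == BoundsCheckCode::kElided);
  assert(StaticBoundsCheck(4, 4) == BoundsCheckCode::kAlwaysThrow);
  assert(StaticBoundsCheck(-1, 4) == BoundsCheckCode::kAlwaysThrow);
  return 0;
}
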
 
 void CodeGeneratorARM::MarkGCCard(Register temp,
@@ -6969,9 +7620,11 @@
   ShifterOperand so;
   if (__ ShifterOperandCanHold(kNoRegister, kNoRegister, AND, value, &so)) {
     __ and_(out, first, so);
-  } else {
-    DCHECK(__ ShifterOperandCanHold(kNoRegister, kNoRegister, BIC, ~value, &so));
+  } else if (__ ShifterOperandCanHold(kNoRegister, kNoRegister, BIC, ~value, &so)) {
     __ bic(out, first, ShifterOperand(~value));
+  } else {
+    DCHECK(IsPowerOfTwo(value + 1));
+    __ ubfx(out, first, 0, WhichPowerOf2(value + 1));
   }
 }
 
@@ -7185,14 +7838,35 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used:
+      // Baker's read barriers are used.
       //
-      //   root = obj.field;
+      // Note that we do not actually check the value of
+      // `GetIsGcMarking()` to decide whether to mark the loaded GC
+      // root or not.  Instead, we load into `temp` the read barrier
+      // mark entry point corresponding to register `root`. If `temp`
+      // is null, it means that `GetIsGcMarking()` is false, and vice
+      // versa.
+      //
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   if (temp != null) {
-      //     root = temp(root)
+      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+      //     // Slow path.
+      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
       //   }
 
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      Location temp = Location::RegisterLocation(LR);
+      SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
+          instruction, root, /* entrypoint */ temp);
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
       static_assert(
@@ -7203,21 +7877,6 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path marking the GC root `root`.
-      Location temp = Location::RegisterLocation(LR);
-      SlowPathCodeARM* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
-              instruction,
-              root,
-              /*entrypoint*/ temp);
-      codegen_->AddSlowPath(slow_path);
-
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
       // The entrypoint is null when the GC is not marking; this prevents one load compared to
       // checking GetIsGcMarking.
       __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
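
The sequence above makes the per-register mark entrypoint double as the "is the GC marking?" flag: a single load from the thread both fetches the slow-path routine and answers the question, and a null value selects the fast path. A rough standalone sketch of that dispatch, using a plain function pointer as a stand-in for pReadBarrierMarkReg (names and the "mark" behaviour below are purely illustrative):

#include <cassert>
#include <cstdint>

using Ref = uintptr_t;
using MarkEntrypoint = Ref (*)(Ref);

// Stand-in for the thread-local entrypoint slot: null while the GC is not
// marking, set to the marking routine while it is.
static MarkEntrypoint g_mark_entrypoint = nullptr;

static Ref MarkToSpace(Ref ref) { return ref + 1; }  // toy "mark" for the demo

static Ref LoadGcRoot(Ref root) {
  MarkEntrypoint entrypoint = g_mark_entrypoint;  // one load from the thread
  if (entrypoint != nullptr) {                    // <=> GetIsGcMarking()
    root = entrypoint(root);                      // slow path: mark the root
  }
  return root;                                    // fast path: use as loaded
}

int main() {
  assert(LoadGcRoot(100) == 100);  // GC not marking: reference untouched
  g_mark_entrypoint = MarkToSpace;
  assert(LoadGcRoot(100) == 101);  // GC marking: routed through the mark routine
  return 0;
}
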
@@ -7288,51 +7947,101 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // In slow path based read barriers, the read barrier call is
-  // inserted after the original load. However, in fast path based
-  // Baker's read barriers, we need to perform the load of
-  // mirror::Object::monitor_ *before* the original reference load.
-  // This load-load ordering is required by the read barrier.
-  // The fast path/slow path (for Baker's algorithm) should look like:
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to mark the reference.
+  // Then, in the slow path, check the gray bit in the lock word of
+  // the reference's holder (`obj`) to decide whether to mark `ref` or
+  // not.
   //
-  //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //   HeapReference<Object> ref = *src;  // Original reference load.
-  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //   if (is_gray) {
-  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp3` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp3` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //   if (temp3 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //     }
+  //   } else {
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
-  //
-  // Note: the original implementation in ReadBarrier::Barrier is
-  // slightly more complex as it performs additional checks that we do
-  // not do here for performance reasons.
 
-  Register ref_reg = ref.AsRegister<Register>();
   Register temp_reg = temp.AsRegister<Register>();
-  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset);
-  if (needs_null_check) {
-    MaybeRecordImplicitNullCheck(instruction);
+  // Slow path marking the object `ref` when the GC is marking. The
+  // entrypoint will already be loaded in `temp3`.
+  Location temp3 = Location::RegisterLocation(LR);
+  SlowPathCodeARM* slow_path;
+  if (always_update_field) {
+    DCHECK(temp2 != nullptr);
+    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM only
+    // supports addresses of the form `obj + field_offset`, where `obj`
+    // is a register and `field_offset` is a register pair (of which
+    // only the lower half is used). Thus `offset` and `scale_factor`
+    // above are expected to be zero in this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
+    Location field_offset = index;
+    slow_path =
+        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(
+            instruction,
+            ref,
+            obj,
+            offset,
+            /* index */ field_offset,
+            scale_factor,
+            needs_null_check,
+            temp_reg,
+            *temp2,
+            /* entrypoint */ temp3);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM(
+        instruction,
+        ref,
+        obj,
+        offset,
+        index,
+        scale_factor,
+        needs_null_check,
+        temp_reg,
+        /* entrypoint */ temp3);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
+  AddSlowPath(slow_path);
 
-  // Introduce a dependency on the lock_word including the rb_state,
-  // which shall prevent load-load reordering without using
-  // a memory barrier (which would be more expensive).
-  // `obj` is unchanged by this operation, but its value now depends
-  // on `temp_reg`.
-  __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32));
+  // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset);
+  // The entrypoint is null when the GC is not marking; this prevents one load compared to
+  // checking GetIsGcMarking.
+  __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel());
+  // Fast path: just load the reference.
+  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  __ Bind(slow_path->GetExitLabel());
+}
 
-  // The actual reference load.
+void CodeGeneratorARM::GenerateRawReferenceLoad(HInstruction* instruction,
+                                                Location ref,
+                                                Register obj,
+                                                uint32_t offset,
+                                                Location index,
+                                                ScaleFactor scale_factor,
+                                                bool needs_null_check) {
+  Register ref_reg = ref.AsRegister<Register>();
+
   if (index.IsValid()) {
     // Load types involving an "index": ArrayGet,
     // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
     // intrinsics.
-    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+    // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
     if (index.IsConstant()) {
       size_t computed_offset =
           (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset;
@@ -7349,41 +8058,16 @@
       __ LoadFromOffset(kLoadWord, ref_reg, IP, offset);
     }
   } else {
-    // /* HeapReference<Object> */ ref = *(obj + offset)
+    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
     __ LoadFromOffset(kLoadWord, ref_reg, obj, offset);
   }
 
+  if (needs_null_check) {
+    MaybeRecordImplicitNullCheck(instruction);
+  }
+
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
-
-  // Slow path marking the object `ref` when it is gray.
-  SlowPathCodeARM* slow_path;
-  if (always_update_field) {
-    DCHECK(temp2 != nullptr);
-    // ReadBarrierMarkAndUpdateFieldSlowPathARM only supports address
-    // of the form `obj + field_offset`, where `obj` is a register and
-    // `field_offset` is a register pair (of which only the lower half
-    // is used). Thus `offset` and `scale_factor` above are expected
-    // to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM(
-        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref);
-  }
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::GrayState())
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with LSRS
-  // which can be a 16-bit instruction unlike the TST immediate.
-  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
-  __ b(slow_path->GetEntryLabel(), CS);  // Carry flag is the last bit shifted out by LSRS.
-  __ Bind(slow_path->GetExitLabel());
 }
 
 void CodeGeneratorARM::GenerateReadBarrierSlow(HInstruction* instruction,
@@ -7626,9 +8310,7 @@
 }
 
 Literal* CodeGeneratorARM::DeduplicateBootImageAddressLiteral(uint32_t address) {
-  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
-  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
-  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
 Literal* CodeGeneratorARM::DeduplicateJitStringLiteral(const DexFile& dex_file,
@@ -7679,8 +8361,7 @@
       /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
       boot_image_type_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
-      boot_image_address_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -7714,13 +8395,6 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
-  for (const auto& entry : boot_image_address_patches_) {
-    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = literal->GetLabel()->Position();
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index df2dbc7..5b15902 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -299,7 +299,6 @@
   void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
                                     Label* false_target);
-  void GenerateVcmp(HInstruction* instruction);
   void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label);
   void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
@@ -422,6 +421,8 @@
     return CommonGetLabelOf<Label>(block_labels_, block);
   }
 
+  Label* GetFinalLabel(HInstruction* instruction, Label* final_label);
+
   void Initialize() OVERRIDE {
     block_labels_ = CommonInitializeLabels<Label>();
   }
@@ -520,9 +521,6 @@
                                              Location index,
                                              Location temp,
                                              bool needs_null_check);
-  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
-  // and GenerateArrayLoadWithBakerReadBarrier.
-
   // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
   // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
   //
@@ -545,6 +543,15 @@
                                                  bool always_update_field = false,
                                                  Register* temp2 = nullptr);
 
+  // Generate a heap reference load (with no read barrier).
+  void GenerateRawReferenceLoad(HInstruction* instruction,
+                                Location ref,
+                                Register obj,
+                                uint32_t offset,
+                                Location index,
+                                ScaleFactor scale_factor,
+                                bool needs_null_check);
+
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
@@ -642,8 +649,6 @@
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
-  // Deduplication map for patchable boot image addresses.
-  Uint32ToLiteralMap boot_image_address_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 18c95b3..97b61ed 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -633,58 +633,23 @@
   }
 }
 
-// Slow path marking an object reference `ref` during a read
-// barrier. The field `obj.field` in the object `obj` holding this
-// reference does not get updated by this slow path after marking (see
-// ReadBarrierMarkAndUpdateFieldSlowPathARM64 below for that).
+// Abstract base class for read barrier slow paths marking a reference
+// `ref`.
 //
-// This means that after the execution of this slow path, `ref` will
-// always be up-to-date, but `obj.field` may not; i.e., after the
-// flip, `ref` will be a to-space reference, but `obj.field` will
-// probably still be a from-space reference (unless it gets updated by
-// another thread, or if another thread installed another object
-// reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is for the GcRoot read barrier.
-class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 {
- public:
-  ReadBarrierMarkSlowPathARM64(HInstruction* instruction,
-                               Location ref,
-                               Location entrypoint = Location::NoLocation())
-      : SlowPathCodeARM64(instruction),
-        ref_(ref),
-        entrypoint_(entrypoint) {
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 {
+ protected:
+  ReadBarrierMarkSlowPathBaseARM64(HInstruction* instruction, Location ref, Location entrypoint)
+      : SlowPathCodeARM64(instruction), ref_(ref), entrypoint_(entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
-  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM64"; }
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM64"; }
 
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
-    DCHECK(locations->CanCall());
-    DCHECK(ref_.IsRegister()) << ref_;
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
-    DCHECK(instruction_->IsInstanceFieldGet() ||
-           instruction_->IsStaticFieldGet() ||
-           instruction_->IsArrayGet() ||
-           instruction_->IsArraySet() ||
-           instruction_->IsLoadClass() ||
-           instruction_->IsLoadString() ||
-           instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
-           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking slow path: "
-        << instruction_->DebugName();
-    // The read barrier instrumentation of object ArrayGet
-    // instructions does not support the HIntermediateAddress
-    // instruction.
-    DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
-
-    __ Bind(GetEntryLabel());
+  // Generate assembly code calling the read barrier marking runtime
+  // entry point (ReadBarrierMarkRegX).
+  void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) {
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -720,46 +685,261 @@
       // This runtime call does not require a stack map.
       arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
-    __ B(GetExitLabel());
   }
 
- private:
   // The location (register) of the marked object reference.
   const Location ref_;
 
   // The location of the entrypoint if it is already loaded.
   const Location entrypoint_;
 
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM64);
+};
+
+// Slow path marking an object reference `ref` during a read
+// barrier. The field `obj.field` in the object `obj` holding this
+// reference does not get updated by this slow path after marking.
+//
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// If `entrypoint` is a valid location, it is assumed to already be
+// holding the entrypoint. The case where the entrypoint is passed in
+// is when the decision to mark is based on whether the GC is marking.
+class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 {
+ public:
+  ReadBarrierMarkSlowPathARM64(HInstruction* instruction,
+                               Location ref,
+                               Location entrypoint = Location::NoLocation())
+      : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint) {
+    DCHECK(kEmitCompilerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM64"; }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(ref_.IsRegister()) << ref_;
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString())
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+
+    __ Bind(GetEntryLabel());
+    GenerateReadBarrierMarkRuntimeCall(codegen);
+    __ B(GetExitLabel());
+  }
+
+ private:
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64);
 };
 
-// Slow path marking an object reference `ref` during a read barrier,
-// and if needed, atomically updating the field `obj.field` in the
-// object `obj` holding this reference after marking (contrary to
-// ReadBarrierMarkSlowPathARM64 above, which never tries to update
-// `obj.field`).
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). The field `obj.field` in the object `obj` holding
+// this reference does not get updated by this slow path after marking
+// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
+// below for that).
+//
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 {
+ public:
+  LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction,
+                                                 Location ref,
+                                                 Register obj,
+                                                 uint32_t offset,
+                                                 Location index,
+                                                 size_t scale_factor,
+                                                 bool needs_null_check,
+                                                 bool use_load_acquire,
+                                                 Register temp,
+                                                 Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint),
+        obj_(obj),
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        use_load_acquire_(use_load_acquire),
+        temp_(temp) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "LoadReferenceWithBakerReadBarrierSlowPathARM64";
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(ref_.IsRegister()) << ref_;
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
+    DCHECK(obj_.IsW());
+    DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg());
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsArraySet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+    // The read barrier instrumentation of object ArrayGet
+    // instructions does not support the HIntermediateAddress
+    // instruction.
+    DCHECK(!(instruction_->IsArrayGet() &&
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
+
+    // Temporary register `temp_`, used to store the lock word, must
+    // not be IP0 nor IP1, as we may use them to emit the reference
+    // load (in the call to GenerateRawReferenceLoad below), and we
+    // need the lock word to still be in `temp_` after the reference
+    // load.
+    DCHECK_NE(LocationFrom(temp_).reg(), IP0);
+    DCHECK_NE(LocationFrom(temp_).reg(), IP1);
+
+    __ Bind(GetEntryLabel());
+
+    // When using MaybeGenerateReadBarrierSlow, the read barrier call is
+    // inserted after the original load. However, in fast path based
+    // Baker's read barriers, we need to perform the load of
+    // mirror::Object::monitor_ *before* the original reference load.
+    // This load-load ordering is required by the read barrier.
+    // The fast path/slow path (for Baker's algorithm) should look like:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //   }
+    //
+    // Note: the original implementation in ReadBarrier::Barrier is
+    // slightly more complex as it performs additional checks that we do
+    // not do here for performance reasons.
+
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    __ Ldr(temp_, HeapOperand(obj_, monitor_offset));
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word, including rb_state, to
+    // prevent load-load reordering without using a memory barrier (which
+    // would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp_`.
+    __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
+    arm64_codegen->GenerateRawReferenceLoad(instruction_,
+                                            ref_,
+                                            obj_,
+                                            offset_,
+                                            index_,
+                                            scale_factor_,
+                                            /* needs_null_check */ false,
+                                            use_load_acquire_);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    //   if (rb_state == ReadBarrier::GrayState())
+    //     ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel());
+    GenerateReadBarrierMarkRuntimeCall(codegen);
+
+    __ B(GetExitLabel());
+  }
+
+ private:
+  // The register containing the object holding the marked object reference field.
+  Register obj_;
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  size_t scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // Should this reference load use Load-Acquire semantics?
+  bool use_load_acquire_;
+  // A temporary register used to hold the lock word of `obj_`.
+  Register temp_;
+
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM64);
+};
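
Two pieces of the slow path above can be sanity-checked on their own: the Add(obj, obj, Operand(temp, LSR, 32)) step adds the (zero) top half of the zero-extended 32-bit lock word, so it creates an address dependency without changing obj; and because WhiteState is 0 and GrayState is 1, the gray test collapses to a single lock-word bit, hence the TBZ. A small sketch under those assumptions (the bit position below is illustrative):

#include <cassert>
#include <cstdint>

// Assumed constants, mirroring the static_asserts in the slow path above.
constexpr uint32_t kWhiteState = 0;
constexpr uint32_t kGrayState = 1;
constexpr uint32_t kReadBarrierStateShift = 28;  // illustrative bit position

int main() {
  // Address dependency: (lock_word >> 32) of a zero-extended 32-bit lock word
  // is zero, so the address is unchanged, but the computation now depends on
  // the lock word load, which is what orders the two loads.
  const uint64_t obj_addr = 0x12345670u;
  const uint64_t lock_word = 0x9abcdef0u;  // zero-extended to 64 bits
  assert(obj_addr + (lock_word >> 32) == obj_addr);

  // Gray check: with white == 0 and gray == 1, the read barrier state is fully
  // determined by one bit, so a single TBZ/TBNZ on that bit suffices.
  const uint32_t gray_word = kGrayState << kReadBarrierStateShift;
  const uint32_t white_word = kWhiteState << kReadBarrierStateShift;
  assert(((gray_word >> kReadBarrierStateShift) & 1u) == 1u);
  assert(((white_word >> kReadBarrierStateShift) & 1u) == 0u);
  return 0;
}
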
+
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). If needed, this slow path also atomically updates
+// the field `obj.field` in the object `obj` holding this reference
+// after marking (contrary to
+// LoadReferenceWithBakerReadBarrierSlowPathARM64 above, which never
+// tries to update `obj.field`).
 //
 // This means that after the execution of this slow path, both `ref`
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
 // hold the same to-space reference (unless another thread installed
 // another object reference (different from `ref`) in `obj.field`).
-class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 {
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
+    : public ReadBarrierMarkSlowPathBaseARM64 {
  public:
-  ReadBarrierMarkAndUpdateFieldSlowPathARM64(HInstruction* instruction,
-                                             Location ref,
-                                             Register obj,
-                                             Location field_offset,
-                                             Register temp)
-      : SlowPathCodeARM64(instruction),
-        ref_(ref),
+  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(HInstruction* instruction,
+                                                               Location ref,
+                                                               Register obj,
+                                                               uint32_t offset,
+                                                               Location index,
+                                                               size_t scale_factor,
+                                                               bool needs_null_check,
+                                                               bool use_load_acquire,
+                                                               Register temp,
+                                                               Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint),
         obj_(obj),
-        field_offset_(field_offset),
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        use_load_acquire_(use_load_acquire),
         temp_(temp) {
     DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
   }
 
   const char* GetDescription() const OVERRIDE {
-    return "ReadBarrierMarkAndUpdateFieldSlowPathARM64";
+    return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64";
   }
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -768,64 +948,90 @@
     DCHECK(locations->CanCall());
     DCHECK(ref_.IsRegister()) << ref_;
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
-    // This slow path is only used by the UnsafeCASObject intrinsic.
+    DCHECK(obj_.IsW());
+    DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg());
+
+    // This slow path is only used by the UnsafeCASObject intrinsic at the moment.
     DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking and field updating slow path: "
         << instruction_->DebugName();
     DCHECK(instruction_->GetLocations()->Intrinsified());
     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject);
-    DCHECK(field_offset_.IsRegister()) << field_offset_;
+    DCHECK_EQ(offset_, 0u);
+    DCHECK_EQ(scale_factor_, 0u);
+    DCHECK_EQ(use_load_acquire_, false);
+    // The location of the offset of the marked reference field within `obj_`.
+    Location field_offset = index_;
+    DCHECK(field_offset.IsRegister()) << field_offset;
+
+    // Temporary register `temp_`, used to store the lock word, must
+    // not be IP0 nor IP1, as we may use them to emit the reference
+    // load (in the call to GenerateRawReferenceLoad below), and we
+    // need the lock word to still be in `temp_` after the reference
+    // load.
+    DCHECK_NE(LocationFrom(temp_).reg(), IP0);
+    DCHECK_NE(LocationFrom(temp_).reg(), IP1);
 
     __ Bind(GetEntryLabel());
 
-    // Save the old reference.
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    __ Ldr(temp_, HeapOperand(obj_, monitor_offset));
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word, including rb_state, to
+    // prevent load-load reordering without using a memory barrier (which
+    // would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp_`.
+    __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
+    arm64_codegen->GenerateRawReferenceLoad(instruction_,
+                                            ref_,
+                                            obj_,
+                                            offset_,
+                                            index_,
+                                            scale_factor_,
+                                            /* needs_null_check */ false,
+                                            use_load_acquire_);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    //   if (rb_state == ReadBarrier::GrayState())
+    //     ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel());
+
+    // Save the old value of the reference before marking it.
     // Note that we cannot use IP to save the old reference, as IP is
     // used internally by the ReadBarrierMarkRegX entry point, and we
     // need the old reference after the call to that entry point.
     DCHECK_NE(LocationFrom(temp_).reg(), IP0);
     __ Mov(temp_.W(), ref_reg);
 
-    // No need to save live registers; it's taken care of by the
-    // entrypoint. Also, there is no need to update the stack mask,
-    // as this runtime call will not trigger a garbage collection.
-    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
-    DCHECK_NE(ref_.reg(), LR);
-    DCHECK_NE(ref_.reg(), WSP);
-    DCHECK_NE(ref_.reg(), WZR);
-    // IP0 is used internally by the ReadBarrierMarkRegX entry point
-    // as a temporary, it cannot be the entry point's input/output.
-    DCHECK_NE(ref_.reg(), IP0);
-    DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg();
-    // "Compact" slow path, saving two moves.
-    //
-    // Instead of using the standard runtime calling convention (input
-    // and output in W0):
-    //
-    //   W0 <- ref
-    //   W0 <- ReadBarrierMark(W0)
-    //   ref <- W0
-    //
-    // we just use rX (the register containing `ref`) as input and output
-    // of a dedicated entrypoint:
-    //
-    //   rX <- ReadBarrierMarkRegX(rX)
-    //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
-    // This runtime call does not require a stack map.
-    arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    GenerateReadBarrierMarkRuntimeCall(codegen);
 
     // If the new reference is different from the old reference,
-    // update the field in the holder (`*(obj_ + field_offset_)`).
+    // update the field in the holder (`*(obj_ + field_offset)`).
     //
     // Note that this field could also hold a different object, if
     // another thread had concurrently changed it. In that case, the
     // LDXR/CMP/BNE sequence of instructions in the compare-and-set
     // (CAS) operation below would abort the CAS, leaving the field
     // as-is.
-    vixl::aarch64::Label done;
     __ Cmp(temp_.W(), ref_reg);
-    __ B(eq, &done);
+    __ B(eq, GetExitLabel());
 
     // Update the holder's field atomically.  This may fail if the
     // mutator updates it before us, but it's OK.  This is achieved
@@ -838,7 +1044,7 @@
 
     // Convenience aliases.
     Register base = obj_.W();
-    Register offset = XRegisterFrom(field_offset_);
+    Register offset = XRegisterFrom(field_offset);
     Register expected = temp_.W();
     Register value = ref_reg;
     Register tmp_ptr = temps.AcquireX();    // Pointer to actual memory.
@@ -882,21 +1088,26 @@
       }
     }
 
-    __ Bind(&done);
     __ B(GetExitLabel());
   }
 
  private:
-  // The location (register) of the marked object reference.
-  const Location ref_;
   // The register containing the object holding the marked object reference field.
   const Register obj_;
-  // The location of the offset of the marked reference field within `obj_`.
-  Location field_offset_;
-
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  size_t scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // Should this reference load use Load-Acquire semantics?
+  bool use_load_acquire_;
+  // A temporary register used to hold the lock word of `obj_`; and
+  // also to hold the original reference value, when the reference is
+  // marked.
   const Register temp_;
 
-  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM64);
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64);
 };
 
 // Slow path generating a read barrier for a heap reference.
@@ -1200,8 +1411,6 @@
                                graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -2427,6 +2636,9 @@
                                                        LocationSummary::kNoCall);
   if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier.
+    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
@@ -2462,7 +2674,7 @@
 
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // Object ArrayGet with Baker's read barrier case.
-    Register temp = temps.AcquireW();
+    Register temp = WRegisterFrom(locations->GetTemp(0));
     // Note that a potential implicit null check is handled in the
     // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call.
     codegen_->GenerateArrayLoadWithBakerReadBarrier(
@@ -3419,7 +3631,7 @@
   if (Primitive::IsFloatingPointType(select->GetType())) {
     locations->SetInAt(0, Location::RequiresFpuRegister());
     locations->SetInAt(1, Location::RequiresFpuRegister());
-    locations->SetOut(Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
     HConstant* cst_true_value = select->GetTrueValue()->AsConstant();
     HConstant* cst_false_value = select->GetFalseValue()->AsConstant();
@@ -3442,7 +3654,7 @@
                                                  : Location::ConstantLocation(cst_true_value));
     locations->SetInAt(0, false_value_in_register ? Location::RequiresRegister()
                                                   : Location::ConstantLocation(cst_false_value));
-    locations->SetOut(Location::RequiresRegister());
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
   }
 
   if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) {
@@ -4328,9 +4540,7 @@
 
 vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral(
     uint64_t address) {
-  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
-  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
-  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
 vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateJitStringLiteral(
@@ -4398,8 +4608,7 @@
       pc_relative_string_patches_.size() +
       boot_image_type_patches_.size() +
       pc_relative_type_patches_.size() +
-      type_bss_entry_patches_.size() +
-      boot_image_address_patches_.size();
+      type_bss_entry_patches_.size();
   linker_patches->reserve(size);
   for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) {
     linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.GetLocation(),
@@ -4433,11 +4642,6 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
-  for (const auto& entry : boot_image_address_patches_) {
-    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
-    vixl::aarch64::Literal<uint32_t>* literal = entry.second;
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal->GetOffset()));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
@@ -5614,14 +5818,35 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used:
+      // Baker's read barriers are used.
       //
-      //   root = obj.field;
+      // Note that we do not actually check the value of
+      // `GetIsGcMarking()` to decide whether to mark the loaded GC
+      // root or not.  Instead, we load into `temp` the read barrier
+      // mark entry point corresponding to register `root`. If `temp`
+      // is null, it means that `GetIsGcMarking()` is false, and vice
+      // versa.
+      //
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   if (temp != null) {
-      //     root = temp(root)
+      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+      //     // Slow path.
+      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
       //   }
 
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      Register temp = lr;
+      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(
+          instruction, root, /* entrypoint */ LocationFrom(temp));
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ Ldr(temp, MemOperand(tr, entry_point_offset));
+
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       if (fixup_label == nullptr) {
         __ Ldr(root_reg, MemOperand(obj, offset));
@@ -5636,20 +5861,6 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      Register temp = lr;
-
-      // Slow path marking the GC root `root`. The entrypoint will alrady be loaded in temp.
-      SlowPathCodeARM64* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction,
-                                                                    root,
-                                                                    LocationFrom(temp));
-      codegen_->AddSlowPath(slow_path);
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ Ldr(temp, MemOperand(tr, entry_point_offset));
       // The entrypoint is null when the GC is not marking, this prevents one load compared to
       // checking GetIsGcMarking.
       __ Cbnz(temp, slow_path->GetEntryLabel());
@@ -5751,54 +5962,103 @@
   // `instruction->IsArrayGet()` => `!use_load_acquire`.
   DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
 
-  MacroAssembler* masm = GetVIXLAssembler();
-  UseScratchRegisterScope temps(masm);
-
-  // In slow path based read barriers, the read barrier call is
-  // inserted after the original load. However, in fast path based
-  // Baker's read barriers, we need to perform the load of
-  // mirror::Object::monitor_ *before* the original reference load.
-  // This load-load ordering is required by the read barrier.
-  // The fast path/slow path (for Baker's algorithm) should look like:
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to mark the reference.
+  // Then, in the slow path, check the gray bit in the lock word of
+  // the reference's holder (`obj`) to decide whether to mark `ref` or
+  // not.
   //
-  //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //   HeapReference<Object> ref = *src;  // Original reference load.
-  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //   if (is_gray) {
-  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp2` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp2` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //     }
+  //   } else {
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
-  //
-  // Note: the original implementation in ReadBarrier::Barrier is
-  // slightly more complex as it performs additional checks that we do
-  // not do here for performance reasons.
 
+  // Slow path marking the object `ref` when the GC is marking. The
+  // entrypoint will already be loaded in `temp2`.
+  Register temp2 = lr;
+  Location temp2_loc = LocationFrom(temp2);
+  SlowPathCodeARM64* slow_path;
+  if (always_update_field) {
+    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
+    // only supports address of the form `obj + field_offset`, where
+    // `obj` is a register and `field_offset` is a register. Thus
+    // `offset` and `scale_factor` above are expected to be null in
+    // this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, 0u);  /* "times 1" */
+    Location field_offset = index;
+    slow_path =
+        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(
+            instruction,
+            ref,
+            obj,
+            offset,
+            /* index */ field_offset,
+            scale_factor,
+            needs_null_check,
+            use_load_acquire,
+            temp,
+            /* entrypoint */ temp2_loc);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64(
+        instruction,
+        ref,
+        obj,
+        offset,
+        index,
+        scale_factor,
+        needs_null_check,
+        use_load_acquire,
+        temp,
+        /* entrypoint */ temp2_loc);
+  }
+  AddSlowPath(slow_path);
+
+  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  __ Ldr(temp2, MemOperand(tr, entry_point_offset));
+  // The entrypoint is null when the GC is not marking; this saves one load
+  // compared to checking GetIsGcMarking() directly.
+  __ Cbnz(temp2, slow_path->GetEntryLabel());
+  // Fast path: just load the reference.
+  GenerateRawReferenceLoad(
+      instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire);
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction,
+                                                  Location ref,
+                                                  Register obj,
+                                                  uint32_t offset,
+                                                  Location index,
+                                                  size_t scale_factor,
+                                                  bool needs_null_check,
+                                                  bool use_load_acquire) {
+  DCHECK(obj.IsW());
   Primitive::Type type = Primitive::kPrimNot;
   Register ref_reg = RegisterFrom(ref, type);
-  DCHECK(obj.IsW());
-  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  {
-    // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted.
-    EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-    // /* int32_t */ monitor = obj->monitor_
-    __ Ldr(temp, HeapOperand(obj, monitor_offset));
-    if (needs_null_check) {
-      MaybeRecordImplicitNullCheck(instruction);
-    }
-  }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
+  // If needed, vixl::EmissionCheckScope guards are used to ensure
+  // that no pools are emitted between the load (macro) instruction
+  // and MaybeRecordImplicitNullCheck.
 
-  // Introduce a dependency on the lock_word including rb_state,
-  // to prevent load-load reordering, and without using
-  // a memory barrier (which would be more expensive).
-  // `obj` is unchanged by this operation, but its value now depends
-  // on `temp`.
-  __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32));
-
-  // The actual reference load.
   if (index.IsValid()) {
     // Load types involving an "index": ArrayGet,
     // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
@@ -5813,59 +6073,50 @@
           << instruction->AsInvoke()->GetIntrinsic();
       DCHECK_EQ(offset, 0u);
       DCHECK_EQ(scale_factor, 0u);
-      DCHECK_EQ(needs_null_check, 0u);
-      // /* HeapReference<Object> */ ref = *(obj + index)
+      DCHECK_EQ(needs_null_check, false);
+      // /* HeapReference<mirror::Object> */ ref = *(obj + index)
       MemOperand field = HeapOperand(obj, XRegisterFrom(index));
       LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false);
     } else {
-      // ArrayGet and UnsafeGetObject intrinsics cases.
-      // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+      // ArrayGet and UnsafeGetObject and UnsafeCASObject intrinsics cases.
+      // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
       if (index.IsConstant()) {
         uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor);
+        EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
         Load(type, ref_reg, HeapOperand(obj, computed_offset));
+        if (needs_null_check) {
+          MaybeRecordImplicitNullCheck(instruction);
+        }
       } else {
-        Register temp3 = temps.AcquireW();
-        __ Add(temp3, obj, offset);
-        Load(type, ref_reg, HeapOperand(temp3, XRegisterFrom(index), LSL, scale_factor));
-        temps.Release(temp3);
+        UseScratchRegisterScope temps(GetVIXLAssembler());
+        Register temp = temps.AcquireW();
+        __ Add(temp, obj, offset);
+        {
+          EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
+          Load(type, ref_reg, HeapOperand(temp, XRegisterFrom(index), LSL, scale_factor));
+          if (needs_null_check) {
+            MaybeRecordImplicitNullCheck(instruction);
+          }
+        }
       }
     }
   } else {
-    // /* HeapReference<Object> */ ref = *(obj + offset)
+    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
     MemOperand field = HeapOperand(obj, offset);
     if (use_load_acquire) {
-      LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false);
+      // Implicit null checks are handled by CodeGeneratorARM64::LoadAcquire.
+      LoadAcquire(instruction, ref_reg, field, needs_null_check);
     } else {
+      EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
       Load(type, ref_reg, field);
+      if (needs_null_check) {
+        MaybeRecordImplicitNullCheck(instruction);
+      }
     }
   }
 
   // Object* ref = ref_addr->AsMirrorPtr()
   GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
-
-  // Slow path marking the object `ref` when it is gray.
-  SlowPathCodeARM64* slow_path;
-  if (always_update_field) {
-    // ReadBarrierMarkAndUpdateFieldSlowPathARM64 only supports
-    // address of the form `obj + field_offset`, where `obj` is a
-    // register and `field_offset` is a register. Thus `offset` and
-    // `scale_factor` above are expected to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, 0u);  /* "times 1" */
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM64(
-        instruction, ref, obj, /* field_offset */ index, temp);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref);
-  }
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::GrayState())
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the rb_state.
-  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-  __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel());
-  __ Bind(slow_path->GetExitLabel());
 }
 
 void CodeGeneratorARM64::GenerateReadBarrierSlow(HInstruction* instruction,
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 5faf29a..7471cd5 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -616,8 +616,8 @@
                                              Location index,
                                              vixl::aarch64::Register temp,
                                              bool needs_null_check);
-  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
-  // and GenerateArrayLoadWithBakerReadBarrier.
+  // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
+  // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
   //
   // Load the object reference located at the address
   // `obj + offset + (index << scale_factor)`, held by object `obj`, into
@@ -636,6 +636,16 @@
                                                  bool use_load_acquire,
                                                  bool always_update_field = false);
 
+  // Generate a heap reference load (with no read barrier).
+  void GenerateRawReferenceLoad(HInstruction* instruction,
+                                Location ref,
+                                vixl::aarch64::Register obj,
+                                uint32_t offset,
+                                Location index,
+                                size_t scale_factor,
+                                bool needs_null_check,
+                                bool use_load_acquire);
+
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
@@ -761,8 +771,6 @@
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
-  // Deduplication map for patchable boot image addresses.
-  Uint32ToLiteralMap boot_image_address_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 6bfbe4a..f5ada52 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -42,6 +42,7 @@
 using helpers::DWARFReg;
 using helpers::HighDRegisterFrom;
 using helpers::HighRegisterFrom;
+using helpers::InputDRegisterAt;
 using helpers::InputOperandAt;
 using helpers::InputRegister;
 using helpers::InputRegisterAt;
@@ -53,6 +54,7 @@
 using helpers::LocationFrom;
 using helpers::LowRegisterFrom;
 using helpers::LowSRegisterFrom;
+using helpers::OperandFrom;
 using helpers::OutputRegister;
 using helpers::OutputSRegister;
 using helpers::OutputVRegister;
@@ -657,52 +659,25 @@
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARMVIXL);
 };
 
-// Slow path marking an object reference `ref` during a read
-// barrier. The field `obj.field` in the object `obj` holding this
-// reference does not get updated by this slow path after marking (see
-// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that).
+// Abstract base class for read barrier slow paths marking a reference
+// `ref`.
 //
-// This means that after the execution of this slow path, `ref` will
-// always be up-to-date, but `obj.field` may not; i.e., after the
-// flip, `ref` will be a to-space reference, but `obj.field` will
-// probably still be a from-space reference (unless it gets updated by
-// another thread, or if another thread installed another object
-// reference (different from `ref`) in `obj.field`).
-class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL {
- public:
-  ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction,
-                                 Location ref,
-                                 Location entrypoint = Location::NoLocation())
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL {
+ protected:
+  ReadBarrierMarkSlowPathBaseARMVIXL(HInstruction* instruction, Location ref, Location entrypoint)
       : SlowPathCodeARMVIXL(instruction), ref_(ref), entrypoint_(entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
-  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; }
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARMVIXL"; }
 
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
+  // Generate assembly code calling the read barrier marking runtime
+  // entry point (ReadBarrierMarkRegX).
+  void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) {
     vixl32::Register ref_reg = RegisterFrom(ref_);
-    DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
-    DCHECK(instruction_->IsInstanceFieldGet() ||
-           instruction_->IsStaticFieldGet() ||
-           instruction_->IsArrayGet() ||
-           instruction_->IsArraySet() ||
-           instruction_->IsLoadClass() ||
-           instruction_->IsLoadString() ||
-           instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
-           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking slow path: "
-        << instruction_->DebugName();
-    // The read barrier instrumentation of object ArrayGet
-    // instructions does not support the HIntermediateAddress
-    // instruction.
-    DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
 
-    __ Bind(GetEntryLabel());
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -732,53 +707,108 @@
       arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
       __ Blx(RegisterFrom(entrypoint_));
     } else {
+      // Entrypoint is not already loaded; load it from the thread.
       int32_t entry_point_offset =
           CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
       // This runtime call does not require a stack map.
       arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
-    __ B(GetExitLabel());
   }
 
- private:
   // The location (register) of the marked object reference.
   const Location ref_;
 
   // The location of the entrypoint if already loaded.
   const Location entrypoint_;
 
-  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARMVIXL);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARMVIXL);
 };
 
-// Slow path marking an object reference `ref` during a read barrier,
-// and if needed, atomically updating the field `obj.field` in the
-// object `obj` holding this reference after marking (contrary to
-// ReadBarrierMarkSlowPathARM above, which never tries to update
-// `obj.field`).
+// Slow path marking an object reference `ref` during a read
+// barrier. The field `obj.field` in the object `obj` holding this
+// reference does not get updated by this slow path after marking.
 //
-// This means that after the execution of this slow path, both `ref`
-// and `obj.field` will be up-to-date; i.e., after the flip, both will
-// hold the same to-space reference (unless another thread installed
-// another object reference (different from `ref`) in `obj.field`).
-class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// If `entrypoint` is a valid location it is assumed to already be
+// holding the entrypoint. The case where the entrypoint is passed in
+// is when the decision to mark is based on whether the GC is marking.
+class ReadBarrierMarkSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL {
  public:
-  ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction,
-                                               Location ref,
-                                               vixl32::Register obj,
-                                               Location field_offset,
-                                               vixl32::Register temp1,
-                                               vixl32::Register temp2)
-      : SlowPathCodeARMVIXL(instruction),
-        ref_(ref),
-        obj_(obj),
-        field_offset_(field_offset),
-        temp1_(temp1),
-        temp2_(temp2) {
+  ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction,
+                                 Location ref,
+                                 Location entrypoint = Location::NoLocation())
+      : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(ref_.IsRegister()) << ref_;
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString())
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+
+    __ Bind(GetEntryLabel());
+    GenerateReadBarrierMarkRuntimeCall(codegen);
+    __ B(GetExitLabel());
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARMVIXL);
+};
+
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). The field `obj.field` in the object `obj` holding
+// this reference does not get updated by this slow path after marking
+// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
+// below for that).
+//
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL {
+ public:
+  LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(HInstruction* instruction,
+                                                   Location ref,
+                                                   vixl32::Register obj,
+                                                   uint32_t offset,
+                                                   Location index,
+                                                   ScaleFactor scale_factor,
+                                                   bool needs_null_check,
+                                                   vixl32::Register temp,
+                                                   Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
+        obj_(obj),
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        temp_(temp) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
   const char* GetDescription() const OVERRIDE {
-    return "ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL";
+    return "LoadReferenceWithBakerReadBarrierSlowPathARMVIXL";
   }
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -786,64 +816,233 @@
     vixl32::Register ref_reg = RegisterFrom(ref_);
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
-    // This slow path is only used by the UnsafeCASObject intrinsic.
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsArraySet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+    // The read barrier instrumentation of object ArrayGet
+    // instructions does not support the HIntermediateAddress
+    // instruction.
+    DCHECK(!(instruction_->IsArrayGet() &&
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
+
+    // Temporary register `temp_`, used to store the lock word, must
+    // not be IP, as we may use it to emit the reference load (in the
+    // call to GenerateRawReferenceLoad below), and we need the lock
+    // word to still be in `temp_` after the reference load.
+    DCHECK(!temp_.Is(ip));
+
+    __ Bind(GetEntryLabel());
+
+    // When using MaybeGenerateReadBarrierSlow, the read barrier call is
+    // inserted after the original load. However, in fast path based
+    // Baker's read barriers, we need to perform the load of
+    // mirror::Object::monitor_ *before* the original reference load.
+    // This load-load ordering is required by the read barrier.
+    // The fast path/slow path (for Baker's algorithm) should look like:
+    //
+    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
+    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+    //   if (is_gray) {
+    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+    //   }
+    //
+    // Note: the original implementation in ReadBarrier::Barrier is
+    // slightly more complex as it performs additional checks that we do
+    // not do here for performance reasons.
+
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset);
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word including the rb_state,
+    // which shall prevent load-load reordering without using
+    // a memory barrier (which would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp`.
+    __ Add(obj_, obj_, Operand(temp_, ShiftType::LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    arm_codegen->GenerateRawReferenceLoad(
+        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    //   if (rb_state == ReadBarrier::GrayState())
+    //     ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with LSRS
+    // which can be a 16-bit instruction unlike the TST immediate.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1);
+    __ B(cc, GetExitLabel());  // Carry flag is the last bit shifted out by LSRS.
+    GenerateReadBarrierMarkRuntimeCall(codegen);
+
+    __ B(GetExitLabel());
+  }
+
+ private:
+  // The register containing the object holding the marked object reference field.
+  vixl32::Register obj_;
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  ScaleFactor scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // A temporary register used to hold the lock word of `obj_`.
+  vixl32::Register temp_;
+
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARMVIXL);
+};
+
+// Slow path loading `obj`'s lock word, loading a reference from
+// object `*(obj + offset + (index << scale_factor))` into `ref`, and
+// marking `ref` if `obj` is gray according to the lock word (Baker
+// read barrier). If needed, this slow path also atomically updates
+// the field `obj.field` in the object `obj` holding this reference
+// after marking (contrary to
+// LoadReferenceWithBakerReadBarrierSlowPathARMVIXL above, which never
+// tries to update `obj.field`).
+//
+// This means that after the execution of this slow path, both `ref`
+// and `obj.field` will be up-to-date; i.e., after the flip, both will
+// hold the same to-space reference (unless another thread installed
+// another object reference (different from `ref`) in `obj.field`).
+//
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked.
+class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
+    : public ReadBarrierMarkSlowPathBaseARMVIXL {
+ public:
+  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction,
+                                                                 Location ref,
+                                                                 vixl32::Register obj,
+                                                                 uint32_t offset,
+                                                                 Location index,
+                                                                 ScaleFactor scale_factor,
+                                                                 bool needs_null_check,
+                                                                 vixl32::Register temp1,
+                                                                 vixl32::Register temp2,
+                                                                 Location entrypoint)
+      : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
+        obj_(obj),
+        offset_(offset),
+        index_(index),
+        scale_factor_(scale_factor),
+        needs_null_check_(needs_null_check),
+        temp1_(temp1),
+        temp2_(temp2) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL";
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register ref_reg = RegisterFrom(ref_);
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
+    DCHECK_NE(ref_.reg(), LocationFrom(temp1_).reg());
+
+    // This slow path is only used by the UnsafeCASObject intrinsic at the moment.
     DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking and field updating slow path: "
         << instruction_->DebugName();
     DCHECK(instruction_->GetLocations()->Intrinsified());
     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject);
-    DCHECK(field_offset_.IsRegisterPair()) << field_offset_;
+    DCHECK_EQ(offset_, 0u);
+    DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1);
+    Location field_offset = index_;
+    DCHECK(field_offset.IsRegisterPair()) << field_offset;
+
+    // Temporary register `temp1_`, used to store the lock word, must
+    // not be IP, as we may use it to emit the reference load (in the
+    // call to GenerateRawReferenceLoad below), and we need the lock
+    // word to still be in `temp1_` after the reference load.
+    DCHECK(!temp1_.Is(ip));
 
     __ Bind(GetEntryLabel());
 
-    // Save the old reference.
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+
+    // /* int32_t */ monitor = obj->monitor_
+    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+    arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset);
+    if (needs_null_check_) {
+      codegen->MaybeRecordImplicitNullCheck(instruction_);
+    }
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word including the rb_state,
+    // which shall prevent load-load reordering without using
+    // a memory barrier (which would be more expensive).
+    // `obj` is unchanged by this operation, but its value now depends
+    // on `temp1_`.
+    __ Add(obj_, obj_, Operand(temp1_, ShiftType::LSR, 32));
+
+    // The actual reference load.
+    // A possible implicit null check has already been handled above.
+    arm_codegen->GenerateRawReferenceLoad(
+        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
+
+    // Mark the object `ref` when `obj` is gray.
+    //
+    //   if (rb_state == ReadBarrier::GrayState())
+    //     ref = ReadBarrier::Mark(ref);
+    //
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with LSRS
+    // which can be a 16-bit instruction unlike the TST immediate.
+    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+    __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1);
+    __ B(cc, GetExitLabel());  // Carry flag is the last bit shifted out by LSRS.
+
+    // Save the old value of the reference before marking it.
     // Note that we cannot use IP to save the old reference, as IP is
     // used internally by the ReadBarrierMarkRegX entry point, and we
     // need the old reference after the call to that entry point.
     DCHECK(!temp1_.Is(ip));
     __ Mov(temp1_, ref_reg);
 
-    // No need to save live registers; it's taken care of by the
-    // entrypoint. Also, there is no need to update the stack mask,
-    // as this runtime call will not trigger a garbage collection.
-    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
-    DCHECK(!ref_reg.Is(sp));
-    DCHECK(!ref_reg.Is(lr));
-    DCHECK(!ref_reg.Is(pc));
-    // IP is used internally by the ReadBarrierMarkRegX entry point
-    // as a temporary, it cannot be the entry point's input/output.
-    DCHECK(!ref_reg.Is(ip));
-    DCHECK(ref_reg.IsRegister()) << ref_reg;
-    // "Compact" slow path, saving two moves.
-    //
-    // Instead of using the standard runtime calling convention (input
-    // and output in R0):
-    //
-    //   R0 <- ref
-    //   R0 <- ReadBarrierMark(R0)
-    //   ref <- R0
-    //
-    // we just use rX (the register containing `ref`) as input and output
-    // of a dedicated entrypoint:
-    //
-    //   rX <- ReadBarrierMarkRegX(rX)
-    //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
-    // This runtime call does not require a stack map.
-    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    GenerateReadBarrierMarkRuntimeCall(codegen);
 
     // If the new reference is different from the old reference,
-    // update the field in the holder (`*(obj_ + field_offset_)`).
+    // update the field in the holder (`*(obj_ + field_offset)`).
     //
     // Note that this field could also hold a different object, if
     // another thread had concurrently changed it. In that case, the
     // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set
     // (CAS) operation below would abort the CAS, leaving the field
     // as-is.
-    vixl32::Label done;
     __ Cmp(temp1_, ref_reg);
-    __ B(eq, &done, /* far_target */ false);
+    __ B(eq, GetExitLabel());
 
     // Update the the holder's field atomically.  This may fail if
     // mutator updates before us, but it's OK.  This is achieved
@@ -857,7 +1056,7 @@
     // The UnsafeCASObject intrinsic uses a register pair as field
     // offset ("long offset"), of which only the low part contains
     // data.
-    vixl32::Register offset = LowRegisterFrom(field_offset_);
+    vixl32::Register offset = LowRegisterFrom(field_offset);
     vixl32::Register expected = temp1_;
     vixl32::Register value = ref_reg;
     vixl32::Register tmp_ptr = temps.Acquire();       // Pointer to actual memory.
@@ -913,22 +1112,27 @@
       }
     }
 
-    __ Bind(&done);
     __ B(GetExitLabel());
   }
 
  private:
-  // The location (register) of the marked object reference.
-  const Location ref_;
   // The register containing the object holding the marked object reference field.
   const vixl32::Register obj_;
-  // The location of the offset of the marked reference field within `obj_`.
-  Location field_offset_;
-
+  // The offset, index and scale factor to access the reference in `obj_`.
+  uint32_t offset_;
+  Location index_;
+  ScaleFactor scale_factor_;
+  // Is a null check required?
+  bool needs_null_check_;
+  // A temporary register used to hold the lock word of `obj_`; and
+  // also to hold the original reference value, when the reference is
+  // marked.
   const vixl32::Register temp1_;
+  // A temporary register used in the implementation of the CAS, to
+  // update the object's reference field.
   const vixl32::Register temp2_;
 
-  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL);
+  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL);
 };
 
 // Slow path generating a read barrier for a heap reference.
@@ -1450,8 +1654,317 @@
   }
 }
 
+static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARMVIXL* codegen) {
+  const Location rhs_loc = instruction->GetLocations()->InAt(1);
+  if (rhs_loc.IsConstant()) {
+    // 0.0 is the only immediate that can be encoded directly in
+    // a VCMP instruction.
+    //
+    // Both the JLS (section 15.20.1) and the JVMS (section 6.5)
+    // specify that in a floating-point comparison, positive zero
+    // and negative zero are considered equal, so we can use the
+    // literal 0.0 for both cases here.
+    //
+    // Note however that some methods (Float.equal, Float.compare,
+    // Float.compareTo, Double.equal, Double.compare,
+    // Double.compareTo, Math.max, Math.min, StrictMath.max,
+    // StrictMath.min) consider 0.0 to be (strictly) greater than
+    // -0.0. So if we ever translate calls to these methods into a
+    // HCompare instruction, we must handle the -0.0 case with
+    // care here.
+    DCHECK(rhs_loc.GetConstant()->IsArithmeticZero());
+
+    const Primitive::Type type = instruction->InputAt(0)->GetType();
+
+    if (type == Primitive::kPrimFloat) {
+      __ Vcmp(F32, InputSRegisterAt(instruction, 0), 0.0);
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ Vcmp(F64, InputDRegisterAt(instruction, 0), 0.0);
+    }
+  } else {
+    __ Vcmp(InputVRegisterAt(instruction, 0), InputVRegisterAt(instruction, 1));
+  }
+}
+
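+// Emits a flag-setting comparison of the long `condition` against its constant
+// right-hand side and returns the vixl32 condition under which the (possibly
+// inverted) condition holds.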
+static vixl32::Condition GenerateLongTestConstant(HCondition* condition,
+                                                  bool invert,
+                                                  CodeGeneratorARMVIXL* codegen) {
+  DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = condition->GetLocations();
+  IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+  vixl32::Condition ret = eq;
+  const Location left = locations->InAt(0);
+  const Location right = locations->InAt(1);
+
+  DCHECK(right.IsConstant());
+
+  const vixl32::Register left_high = HighRegisterFrom(left);
+  const vixl32::Register left_low = LowRegisterFrom(left);
+  int64_t value = Int64ConstantFrom(right);
+
+  switch (cond) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondB:
+    case kCondBE:
+    case kCondA:
+    case kCondAE: {
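+      // Unsigned/equality 64-bit comparison: compare the high words and, only
+      // if they are equal, compare the low words inside an IT block, so the
+      // final flags come from whichever comparison decided the result and the
+      // unsigned form of the condition is valid for both.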
+      __ Cmp(left_high, High32Bits(value));
+
+      ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                               2 * vixl32::k16BitT32InstructionSizeInBytes,
+                               CodeBufferCheckScope::kExactSize);
+
+      __ it(eq);
+      __ cmp(eq, left_low, Low32Bits(value));
+      ret = ARMUnsignedCondition(cond);
+      break;
+    }
+    case kCondLE:
+    case kCondGT:
+      // Trivially true or false.
+      if (value == std::numeric_limits<int64_t>::max()) {
+        __ Cmp(left_low, left_low);
+        ret = cond == kCondLE ? eq : ne;
+        break;
+      }
+
+      if (cond == kCondLE) {
+        cond = kCondLT;
+      } else {
+        DCHECK_EQ(cond, kCondGT);
+        cond = kCondGE;
+      }
+
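+      // Since value != INT64_MAX here, x <= value is equivalent to
+      // x < value + 1 and x > value to x >= value + 1, so bump the constant
+      // and fall through to the signed LT/GE handling below.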
+      value++;
+      FALLTHROUGH_INTENDED;
+    case kCondGE:
+    case kCondLT: {
+      UseScratchRegisterScope temps(codegen->GetVIXLAssembler());
+
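+      // Signed 64-bit comparison: subtract the low words (CMP), then the high
+      // words with borrow (SBCS) into a discarded scratch register; the
+      // resulting flags make the signed GE/LT conditions valid for the pair.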
+      __ Cmp(left_low, Low32Bits(value));
+      __ Sbcs(temps.Acquire(), left_high, High32Bits(value));
+      ret = ARMCondition(cond);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unreachable";
+      UNREACHABLE();
+  }
+
+  return ret;
+}
+
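+// Same as GenerateLongTestConstant above, but for a long comparison whose
+// right-hand side is held in a register pair.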
+static vixl32::Condition GenerateLongTest(HCondition* condition,
+                                          bool invert,
+                                          CodeGeneratorARMVIXL* codegen) {
+  DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong);
+
+  const LocationSummary* const locations = condition->GetLocations();
+  IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+  vixl32::Condition ret = eq;
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+
+  DCHECK(right.IsRegisterPair());
+
+  switch (cond) {
+    case kCondEQ:
+    case kCondNE:
+    case kCondB:
+    case kCondBE:
+    case kCondA:
+    case kCondAE: {
+      __ Cmp(HighRegisterFrom(left), HighRegisterFrom(right));
+
+      ExactAssemblyScope guard(codegen->GetVIXLAssembler(),
+                               2 * vixl32::k16BitT32InstructionSizeInBytes,
+                               CodeBufferCheckScope::kExactSize);
+
+      __ it(eq);
+      __ cmp(eq, LowRegisterFrom(left), LowRegisterFrom(right));
+      ret = ARMUnsignedCondition(cond);
+      break;
+    }
+    case kCondLE:
+    case kCondGT:
+      if (cond == kCondLE) {
+        cond = kCondGE;
+      } else {
+        DCHECK_EQ(cond, kCondGT);
+        cond = kCondLT;
+      }
+
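+      // a <= b is equivalent to b >= a, and a > b to b < a, so swap the
+      // operands and fall through to the GE/LT handling below.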
+      std::swap(left, right);
+      FALLTHROUGH_INTENDED;
+    case kCondGE:
+    case kCondLT: {
+      UseScratchRegisterScope temps(codegen->GetVIXLAssembler());
+
+      __ Cmp(LowRegisterFrom(left), LowRegisterFrom(right));
+      __ Sbcs(temps.Acquire(), HighRegisterFrom(left), HighRegisterFrom(right));
+      ret = ARMCondition(cond);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unreachable";
+      UNREACHABLE();
+  }
+
+  return ret;
+}
+
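+// Emits the flag-setting code for `instruction` (a boolean value or a
+// condition) and returns the vixl32 condition under which the (possibly
+// inverted) test succeeds.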
+static vixl32::Condition GenerateTest(HInstruction* instruction,
+                                      Location loc,
+                                      bool invert,
+                                      CodeGeneratorARMVIXL* codegen) {
+  DCHECK(!instruction->IsConstant());
+
+  vixl32::Condition ret = invert ? eq : ne;
+
+  if (IsBooleanValueOrMaterializedCondition(instruction)) {
+    __ Cmp(RegisterFrom(loc), 0);
+  } else {
+    HCondition* const condition = instruction->AsCondition();
+    const Primitive::Type type = condition->GetLeft()->GetType();
+    const IfCondition cond = invert ? condition->GetOppositeCondition() : condition->GetCondition();
+
+    if (type == Primitive::kPrimLong) {
+      ret = condition->GetLocations()->InAt(1).IsConstant()
+          ? GenerateLongTestConstant(condition, invert, codegen)
+          : GenerateLongTest(condition, invert, codegen);
+    } else if (Primitive::IsFloatingPointType(type)) {
+      GenerateVcmp(condition, codegen);
+      __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
+      ret = ARMFPCondition(cond, condition->IsGtBias());
+    } else {
+      DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+      __ Cmp(InputRegisterAt(condition, 0), InputOperandAt(condition, 1));
+      ret = ARMCondition(cond);
+    }
+  }
+
+  return ret;
+}
+
+static bool CanGenerateTest(HInstruction* condition, ArmVIXLAssembler* assembler) {
+  if (!IsBooleanValueOrMaterializedCondition(condition)) {
+    const HCondition* const cond = condition->AsCondition();
+
+    if (cond->GetLeft()->GetType() == Primitive::kPrimLong) {
+      const LocationSummary* const locations = cond->GetLocations();
+      const IfCondition c = cond->GetCondition();
+
+      if (locations->InAt(1).IsConstant()) {
+        const int64_t value = Int64ConstantFrom(locations->InAt(1));
+
+        if (c < kCondLT || c > kCondGE) {
+          // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
+          // we check that the least significant half of the first input to be compared
+          // is in a low register (the other half is read outside an IT block), and
+          // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP
+          // encoding can be used.
+          if (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value))) {
+            return false;
+          }
+        // TODO(VIXL): The rest of the checks are there to keep the backend in sync with
+        // the previous one, but are not strictly necessary.
+        } else if (c == kCondLE || c == kCondGT) {
+          if (value < std::numeric_limits<int64_t>::max() &&
+              !assembler->ShifterOperandCanHold(SBC, High32Bits(value + 1), kCcSet)) {
+            return false;
+          }
+        } else if (!assembler->ShifterOperandCanHold(SBC, High32Bits(value), kCcSet)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) {
+  const Primitive::Type type = constant->GetType();
+  bool ret = false;
+
+  DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
+
+  if (type == Primitive::kPrimLong) {
+    const uint64_t value = Uint64ConstantFrom(constant);
+
+    ret = IsUint<8>(Low32Bits(value)) && IsUint<8>(High32Bits(value));
+  } else {
+    ret = IsUint<8>(Int32ConstantFrom(constant));
+  }
+
+  return ret;
+}
+
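+// Returns a constant location when `constant` (both halves, for a long) fits
+// in an 8-bit unsigned immediate, so that 16-bit MOV encodings can be used;
+// otherwise requests a register.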
+static Location Arm8BitEncodableConstantOrRegister(HInstruction* constant) {
+  DCHECK(!Primitive::IsFloatingPointType(constant->GetType()));
+
+  if (constant->IsConstant() && CanEncodeConstantAs8BitImmediate(constant->AsConstant())) {
+    return Location::ConstantLocation(constant->AsConstant());
+  }
+
+  return Location::RequiresRegister();
+}
+
+static bool CanGenerateConditionalMove(const Location& out, const Location& src) {
+  // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
+  // we check that we are not dealing with floating-point output (there is no
+  // 16-bit VMOV encoding).
+  if (!out.IsRegister() && !out.IsRegisterPair()) {
+    return false;
+  }
+
+  // For constants, we also check that the output is in one or two low registers,
+  // and that the constants fit in an 8-bit unsigned integer, so that a 16-bit
+  // MOV encoding can be used.
+  if (src.IsConstant()) {
+    if (!CanEncodeConstantAs8BitImmediate(src.GetConstant())) {
+      return false;
+    }
+
+    if (out.IsRegister()) {
+      if (!RegisterFrom(out).IsLow()) {
+        return false;
+      }
+    } else {
+      DCHECK(out.IsRegisterPair());
+
+      if (!HighRegisterFrom(out).IsLow()) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 #undef __
 
+vixl32::Label* CodeGeneratorARMVIXL::GetFinalLabel(HInstruction* instruction,
+                                                   vixl32::Label* final_label) {
+  DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck());
+
+  const HBasicBlock* const block = instruction->GetBlock();
+  const HLoopInformation* const info = block->GetLoopInformation();
+  HInstruction* const next = instruction->GetNext();
+
+  // Avoid a branch to a branch.
+  if (next->IsGoto() && (info == nullptr ||
+                         !info->IsBackEdge(*block) ||
+                         !info->HasSuspendCheck())) {
+    final_label = GetLabelOf(next->AsGoto()->GetSuccessor());
+  }
+
+  return final_label;
+}
+
 CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph,
                                            const ArmInstructionSetFeatures& isa_features,
                                            const CompilerOptions& compiler_options,
@@ -1481,8 +1994,6 @@
                                graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -1945,43 +2456,6 @@
 void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) {
 }
 
-void InstructionCodeGeneratorARMVIXL::GenerateVcmp(HInstruction* instruction) {
-  Primitive::Type type = instruction->InputAt(0)->GetType();
-  Location lhs_loc = instruction->GetLocations()->InAt(0);
-  Location rhs_loc = instruction->GetLocations()->InAt(1);
-  if (rhs_loc.IsConstant()) {
-    // 0.0 is the only immediate that can be encoded directly in
-    // a VCMP instruction.
-    //
-    // Both the JLS (section 15.20.1) and the JVMS (section 6.5)
-    // specify that in a floating-point comparison, positive zero
-    // and negative zero are considered equal, so we can use the
-    // literal 0.0 for both cases here.
-    //
-    // Note however that some methods (Float.equal, Float.compare,
-    // Float.compareTo, Double.equal, Double.compare,
-    // Double.compareTo, Math.max, Math.min, StrictMath.max,
-    // StrictMath.min) consider 0.0 to be (strictly) greater than
-    // -0.0. So if we ever translate calls to these methods into a
-    // HCompare instruction, we must handle the -0.0 case with
-    // care here.
-    DCHECK(rhs_loc.GetConstant()->IsArithmeticZero());
-    if (type == Primitive::kPrimFloat) {
-      __ Vcmp(F32, InputSRegisterAt(instruction, 0), 0.0);
-    } else {
-      DCHECK_EQ(type, Primitive::kPrimDouble);
-      __ Vcmp(F64, DRegisterFrom(lhs_loc), 0.0);
-    }
-  } else {
-    if (type == Primitive::kPrimFloat) {
-      __ Vcmp(InputSRegisterAt(instruction, 0), InputSRegisterAt(instruction, 1));
-    } else {
-      DCHECK_EQ(type, Primitive::kPrimDouble);
-      __ Vcmp(DRegisterFrom(lhs_loc), DRegisterFrom(rhs_loc));
-    }
-  }
-}
-
 void InstructionCodeGeneratorARMVIXL::GenerateFPJumps(HCondition* cond,
                                                       vixl32::Label* true_label,
                                                       vixl32::Label* false_label ATTRIBUTE_UNUSED) {
@@ -2090,7 +2564,7 @@
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      GenerateVcmp(condition);
+      GenerateVcmp(condition, codegen_);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     default:
@@ -2167,20 +2641,29 @@
       return;
     }
 
-    LocationSummary* locations = cond->GetLocations();
-    DCHECK(locations->InAt(0).IsRegister());
-    vixl32::Register left = InputRegisterAt(cond, 0);
-    Location right = locations->InAt(1);
-    if (right.IsRegister()) {
-      __ Cmp(left, InputRegisterAt(cond, 1));
-    } else {
-      DCHECK(right.IsConstant());
-      __ Cmp(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
-    }
+    vixl32::Label* non_fallthrough_target;
+    vixl32::Condition arm_cond = vixl32::Condition::None();
+    const vixl32::Register left = InputRegisterAt(cond, 0);
+    const Operand right = InputOperandAt(cond, 1);
+
     if (true_target == nullptr) {
-      __ B(ARMCondition(condition->GetOppositeCondition()), false_target);
+      arm_cond = ARMCondition(condition->GetOppositeCondition());
+      non_fallthrough_target = false_target;
     } else {
-      __ B(ARMCondition(condition->GetCondition()), true_target);
+      arm_cond = ARMCondition(condition->GetCondition());
+      non_fallthrough_target = true_target;
+    }
+
+    if (right.IsImmediate() && right.GetImmediate() == 0 && (arm_cond.Is(ne) || arm_cond.Is(eq))) {
+      if (arm_cond.Is(eq)) {
+        __ CompareAndBranchIfZero(left, non_fallthrough_target);
+      } else {
+        DCHECK(arm_cond.Is(ne));
+        __ CompareAndBranchIfNonZero(left, non_fallthrough_target);
+      }
+    } else {
+      __ Cmp(left, right);
+      __ B(arm_cond, non_fallthrough_target);
     }
   }
 
@@ -2241,29 +2724,135 @@
 
 void LocationsBuilderARMVIXL::VisitSelect(HSelect* select) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(select);
-  if (Primitive::IsFloatingPointType(select->GetType())) {
+  const bool is_floating_point = Primitive::IsFloatingPointType(select->GetType());
+
+  if (is_floating_point) {
     locations->SetInAt(0, Location::RequiresFpuRegister());
-    locations->SetInAt(1, Location::RequiresFpuRegister());
+    locations->SetInAt(1, Location::FpuRegisterOrConstant(select->GetTrueValue()));
   } else {
     locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
+    locations->SetInAt(1, Arm8BitEncodableConstantOrRegister(select->GetTrueValue()));
   }
+
   if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) {
-    locations->SetInAt(2, Location::RequiresRegister());
+    locations->SetInAt(2, Location::RegisterOrConstant(select->GetCondition()));
+    // The code generator handles overlap with the values, but not with the condition.
+    locations->SetOut(Location::SameAsFirstInput());
+  } else if (is_floating_point) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    if (!locations->InAt(1).IsConstant()) {
+      locations->SetInAt(0, Arm8BitEncodableConstantOrRegister(select->GetFalseValue()));
+    }
+
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
   }
-  locations->SetOut(Location::SameAsFirstInput());
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitSelect(HSelect* select) {
-  LocationSummary* locations = select->GetLocations();
-  vixl32::Label false_target;
-  GenerateTestAndBranch(select,
-                        /* condition_input_index */ 2,
-                        /* true_target */ nullptr,
-                        &false_target,
-                        /* far_target */ false);
-  codegen_->MoveLocation(locations->Out(), locations->InAt(1), select->GetType());
-  __ Bind(&false_target);
+  HInstruction* const condition = select->GetCondition();
+  const LocationSummary* const locations = select->GetLocations();
+  const Primitive::Type type = select->GetType();
+  const Location first = locations->InAt(0);
+  const Location out = locations->Out();
+  const Location second = locations->InAt(1);
+  Location src;
+
+  if (condition->IsIntConstant()) {
+    if (condition->AsIntConstant()->IsFalse()) {
+      src = first;
+    } else {
+      src = second;
+    }
+
+    codegen_->MoveLocation(out, src, type);
+    return;
+  }
+
+  if (!Primitive::IsFloatingPointType(type) &&
+      CanGenerateTest(condition, codegen_->GetAssembler())) {
+    bool invert = false;
+
+    if (out.Equals(second)) {
+      src = first;
+      invert = true;
+    } else if (out.Equals(first)) {
+      src = second;
+    } else if (second.IsConstant()) {
+      DCHECK(CanEncodeConstantAs8BitImmediate(second.GetConstant()));
+      src = second;
+    } else if (first.IsConstant()) {
+      DCHECK(CanEncodeConstantAs8BitImmediate(first.GetConstant()));
+      src = first;
+      invert = true;
+    } else {
+      src = second;
+    }
+
+    if (CanGenerateConditionalMove(out, src)) {
+      if (!out.Equals(first) && !out.Equals(second)) {
+        codegen_->MoveLocation(out, src.Equals(first) ? second : first, type);
+      }
+
+      const vixl32::Condition cond = GenerateTest(condition, locations->InAt(2), invert, codegen_);
+      const size_t instr_count = out.IsRegisterPair() ? 4 : 2;
+      ExactAssemblyScope guard(GetVIXLAssembler(),
+                               instr_count * vixl32::k16BitT32InstructionSizeInBytes,
+                               CodeBufferCheckScope::kExactSize);
+
+      if (out.IsRegister()) {
+        __ it(cond);
+        __ mov(cond, RegisterFrom(out), OperandFrom(src, type));
+      } else {
+        DCHECK(out.IsRegisterPair());
+
+        Operand operand_high(0);
+        Operand operand_low(0);
+
+        if (src.IsConstant()) {
+          const int64_t value = Int64ConstantFrom(src);
+
+          operand_high = High32Bits(value);
+          operand_low = Low32Bits(value);
+        } else {
+          DCHECK(src.IsRegisterPair());
+          operand_high = HighRegisterFrom(src);
+          operand_low = LowRegisterFrom(src);
+        }
+
+        __ it(cond);
+        __ mov(cond, LowRegisterFrom(out), operand_low);
+        __ it(cond);
+        __ mov(cond, HighRegisterFrom(out), operand_high);
+      }
+
+      return;
+    }
+  }
+
+  vixl32::Label* false_target = nullptr;
+  vixl32::Label* true_target = nullptr;
+  vixl32::Label select_end;
+  vixl32::Label* const target = codegen_->GetFinalLabel(select, &select_end);
+
+  if (out.Equals(second)) {
+    true_target = target;
+    src = first;
+  } else {
+    false_target = target;
+    src = second;
+
+    if (!out.Equals(first)) {
+      codegen_->MoveLocation(out, first, type);
+    }
+  }
+
+  GenerateTestAndBranch(select, 2, true_target, false_target, /* far_target */ false);
+  codegen_->MoveLocation(out, src, type);
+
+  if (select_end.IsReferenced()) {
+    __ Bind(&select_end);
+  }
 }
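
The `invert` handling above is easiest to read as plain data flow: the output already aliases one of the two inputs, so only the other input needs a conditional move, possibly under the opposite condition. A small behavioural sketch (hypothetical, 32-bit core-register case only) of what the emitted IT + MOV sequence computes:

#include <cstdint>

// HSelect semantics: return true_value when the condition holds.  When `out`
// starts out holding true_value, the conditional move of false_value must be
// guarded by the *inverted* condition, which is what `invert = true` encodes.
static int32_t Select(bool condition, int32_t true_value, int32_t false_value) {
  int32_t out = true_value;  // out aliases the "true" input.
  if (!condition) {          // emitted as IT <inverted cond>; MOV<inverted cond>
    out = false_value;
  }
  return out;
}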
 
 void LocationsBuilderARMVIXL::VisitNativeDebugInfo(HNativeDebugInfo* info) {
@@ -2341,7 +2930,7 @@
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      GenerateVcmp(cond);
+      GenerateVcmp(cond, codegen_);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
   }
@@ -4356,7 +4945,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       __ Mov(out, 0);
-      GenerateVcmp(compare);
+      GenerateVcmp(compare, codegen_);
       // To branch on the FP compare result we transfer FPSCR to APSR (encoded as PC in VMRS).
       __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
       less_cond = ARMFPCondition(kCondLT, compare->IsGtBias());
@@ -4726,17 +5315,24 @@
     return true;
   }
   Opcode neg_opcode = kNoOperand;
+  uint32_t neg_value = 0;
   switch (opcode) {
-    case AND: neg_opcode = BIC; value = ~value; break;
-    case ORR: neg_opcode = ORN; value = ~value; break;
-    case ADD: neg_opcode = SUB; value = -value; break;
-    case ADC: neg_opcode = SBC; value = ~value; break;
-    case SUB: neg_opcode = ADD; value = -value; break;
-    case SBC: neg_opcode = ADC; value = ~value; break;
+    case AND: neg_opcode = BIC; neg_value = ~value; break;
+    case ORR: neg_opcode = ORN; neg_value = ~value; break;
+    case ADD: neg_opcode = SUB; neg_value = -value; break;
+    case ADC: neg_opcode = SBC; neg_value = ~value; break;
+    case SUB: neg_opcode = ADD; neg_value = -value; break;
+    case SBC: neg_opcode = ADC; neg_value = ~value; break;
+    case MOV: neg_opcode = MVN; neg_value = ~value; break;
     default:
       return false;
   }
-  return assembler->ShifterOperandCanHold(neg_opcode, value, set_cc);
+
+  if (assembler->ShifterOperandCanHold(neg_opcode, neg_value, set_cc)) {
+    return true;
+  }
+
+  return opcode == AND && IsPowerOfTwo(value + 1);
 }
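
The opcode/immediate rewriting above relies on each opcode having a complementary twin that computes the same result from the bitwise-complemented (or negated) immediate; the new `AND` special case additionally accepts masks of the form 2^n - 1, which the UBFX change further down then materializes. A quick standalone sanity check of the identities involved (hypothetical helpers):

#include <cassert>
#include <cstdint>

static uint32_t Bic(uint32_t a, uint32_t b) { return a & ~b; }  // AND with complement
static uint32_t Orn(uint32_t a, uint32_t b) { return a | ~b; }  // ORR with complement

int main() {
  const uint32_t a = 0x12345678u;
  const uint32_t imm = 0xFFFFFF0Fu;   // ~imm == 0xF0, an easy 8-bit immediate
  assert((a & imm) == Bic(a, ~imm));  // AND imm <=> BIC ~imm
  assert((a | imm) == Orn(a, ~imm));  // ORR imm <=> ORN ~imm
  assert(a + imm == a - (0u - imm));  // ADD imm <=> SUB -imm (mod 2^32)
  return 0;
}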
 
 void InstructionCodeGeneratorARMVIXL::HandleFieldGet(HInstruction* instruction,
@@ -5674,20 +6270,56 @@
   caller_saves.Add(LocationFrom(calling_convention.GetRegisterAt(0)));
   caller_saves.Add(LocationFrom(calling_convention.GetRegisterAt(1)));
   LocationSummary* locations = codegen_->CreateThrowingSlowPathLocations(instruction, caller_saves);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+
+  HInstruction* index = instruction->InputAt(0);
+  HInstruction* length = instruction->InputAt(1);
+  // If both index and length are constants, we can check the bounds statically. But if at least
+  // one of them is not encodable, ArmEncodableConstantOrRegister would create a
+  // Location::RequiresRegister(), which we want to avoid; instead, we create constant locations
+  // for both inputs.
+  bool both_const = index->IsConstant() && length->IsConstant();
+  locations->SetInAt(0, both_const
+      ? Location::ConstantLocation(index->AsConstant())
+      : ArmEncodableConstantOrRegister(index, CMP));
+  locations->SetInAt(1, both_const
+      ? Location::ConstantLocation(length->AsConstant())
+      : ArmEncodableConstantOrRegister(length, CMP));
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) {
-  SlowPathCodeARMVIXL* slow_path =
-      new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction);
-  codegen_->AddSlowPath(slow_path);
+  LocationSummary* locations = instruction->GetLocations();
+  Location index_loc = locations->InAt(0);
+  Location length_loc = locations->InAt(1);
 
-  vixl32::Register index = InputRegisterAt(instruction, 0);
-  vixl32::Register length = InputRegisterAt(instruction, 1);
+  if (length_loc.IsConstant()) {
+    int32_t length = Int32ConstantFrom(length_loc);
+    if (index_loc.IsConstant()) {
+      // BCE will remove the bounds check if we are guaranteed to pass.
+      int32_t index = Int32ConstantFrom(index_loc);
+      if (index < 0 || index >= length) {
+        SlowPathCodeARMVIXL* slow_path =
+            new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction);
+        codegen_->AddSlowPath(slow_path);
+        __ B(slow_path->GetEntryLabel());
+      } else {
+        // Some optimization after BCE may have generated this, and we should not
+        // generate a bounds check if it is a valid range.
+      }
+      return;
+    }
 
-  __ Cmp(index, length);
-  __ B(hs, slow_path->GetEntryLabel());
+    SlowPathCodeARMVIXL* slow_path =
+        new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction);
+    __ Cmp(RegisterFrom(index_loc), length);
+    codegen_->AddSlowPath(slow_path);
+    __ B(hs, slow_path->GetEntryLabel());
+  } else {
+    SlowPathCodeARMVIXL* slow_path =
+        new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction);
+    __ Cmp(RegisterFrom(length_loc), InputOperandAt(instruction, 0));
+    codegen_->AddSlowPath(slow_path);
+    __ B(ls, slow_path->GetEntryLabel());
+  }
 }
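
Both register branches above use the standard unsigned-comparison trick: a valid array length is never negative, so `index < 0 || index >= length` collapses into a single unsigned comparison, branched with `hs` (or `ls` once the operands are swapped, as in the register-length case). A short sketch of the predicate being implemented (hypothetical helper name):

#include <cstdint>

// True when the slow path (throwing ArrayIndexOutOfBoundsException) must be
// taken.  Because length >= 0, one unsigned compare covers both the negative
// index and the index-too-large cases.
static bool BoundsCheckFails(int32_t index, int32_t length) {
  return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
}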
 
 void CodeGeneratorARMVIXL::MarkGCCard(vixl32::Register temp,
@@ -7041,10 +7673,12 @@
     return;
   }
   if (GetAssembler()->ShifterOperandCanHold(AND, value)) {
-  __ And(out, first, value);
+    __ And(out, first, value);
+  } else if (GetAssembler()->ShifterOperandCanHold(BIC, ~value)) {
+    __ Bic(out, first, ~value);
   } else {
-    DCHECK(GetAssembler()->ShifterOperandCanHold(BIC, ~value));
-  __ Bic(out, first, ~value);
+    DCHECK(IsPowerOfTwo(value + 1));
+    __ Ubfx(out, first, 0, WhichPowerOf2(value + 1));
   }
 }
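
The UBFX fallback added above works because `x & mask` with a mask of the form 2^n - 1 is exactly an unsigned bit-field extract of the low n bits; that is also what the extra `opcode == AND && IsPowerOfTwo(value + 1)` clause earlier accepts. A standalone illustration with hypothetical re-implementations of the helpers:

#include <cassert>
#include <cstdint>

static bool IsPowerOfTwo(uint64_t v) { return v != 0 && (v & (v - 1)) == 0; }
static uint32_t WhichPowerOf2(uint64_t v) {  // assumes v is a power of two
  uint32_t n = 0;
  while ((v >>= 1) != 0) ++n;
  return n;
}

int main() {
  const uint32_t mask = 0x3FFFu;  // 2^14 - 1; too wide for an 8-bit rotated immediate
  assert(IsPowerOfTwo(static_cast<uint64_t>(mask) + 1));
  // UBFX out, first, #0, #14 then computes first & 0x3FFF.
  assert(WhichPowerOf2(static_cast<uint64_t>(mask) + 1) == 14);
  return 0;
}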
 
@@ -7263,14 +7897,35 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used:
+      // Baker's read barriers are used.
       //
-      //   root = obj.field;
+      // Note that we do not actually check the value of
+      // `GetIsGcMarking()` to decide whether to mark the loaded GC
+      // root or not.  Instead, we load into `temp` the read barrier
+      // mark entry point corresponding to register `root`. If `temp`
+      // is null, it means that `GetIsGcMarking()` is false, and vice
+      // versa.
+      //
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   if (temp != null) {
-      //     root = temp(root)
+      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+      //     // Slow path.
+      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
       //   }
 
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      Location temp = LocationFrom(lr);
+      SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
+              instruction, root, /* entrypoint */ temp);
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
       static_assert(
@@ -7281,21 +7936,6 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path marking the GC root `root`.
-      Location temp = LocationFrom(lr);
-      SlowPathCodeARMVIXL* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-              instruction,
-              root,
-              /*entrypoint*/ temp);
-      codegen_->AddSlowPath(slow_path);
-
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
       // The entrypoint is null when the GC is not marking; this saves a load compared to
       // checking GetIsGcMarking.
       __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
@@ -7366,55 +8006,114 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // In slow path based read barriers, the read barrier call is
-  // inserted after the original load. However, in fast path based
-  // Baker's read barriers, we need to perform the load of
-  // mirror::Object::monitor_ *before* the original reference load.
-  // This load-load ordering is required by the read barrier.
-  // The fast path/slow path (for Baker's algorithm) should look like:
+  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
+  // whether we need to enter the slow path to mark the reference.
+  // Then, in the slow path, check the gray bit in the lock word of
+  // the reference's holder (`obj`) to decide whether to mark `ref` or
+  // not.
   //
-  //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //   HeapReference<Object> ref = *src;  // Original reference load.
-  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //   if (is_gray) {
-  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
+  // Note that we do not actually check the value of `GetIsGcMarking()`;
+  // instead, we load into `temp3` the read barrier mark entry point
+  // corresponding to register `ref`. If `temp3` is null, it means
+  // that `GetIsGcMarking()` is false, and vice versa.
+  //
+  //   temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //   if (temp3 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //     // Slow path.
+  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
+  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //     if (is_gray) {
+  //       ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //     }
+  //   } else {
+  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
-  //
-  // Note: the original implementation in ReadBarrier::Barrier is
-  // slightly more complex as it performs additional checks that we do
-  // not do here for performance reasons.
 
-  vixl32::Register ref_reg = RegisterFrom(ref);
   vixl32::Register temp_reg = RegisterFrom(temp);
-  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset);
-  if (needs_null_check) {
-    MaybeRecordImplicitNullCheck(instruction);
+  // Slow path marking the object `ref` when the GC is marking. The
+  // entrypoint will already be loaded in `temp3`.
+  Location temp3 = LocationFrom(lr);
+  SlowPathCodeARMVIXL* slow_path;
+  if (always_update_field) {
+    DCHECK(temp2 != nullptr);
+    // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
+    // only supports address of the form `obj + field_offset`, where
+    // `obj` is a register and `field_offset` is a register pair (of
+    // which only the lower half is used). Thus `offset` and `scale_factor`
+    // above are expected to be 0 and TIMES_1, respectively, in this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
+    Location field_offset = index;
+    slow_path =
+        new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
+            instruction,
+            ref,
+            obj,
+            offset,
+            /* index */ field_offset,
+            scale_factor,
+            needs_null_check,
+            temp_reg,
+            *temp2,
+            /* entrypoint */ temp3);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(
+        instruction,
+        ref,
+        obj,
+        offset,
+        index,
+        scale_factor,
+        needs_null_check,
+        temp_reg,
+        /* entrypoint */ temp3);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
+  AddSlowPath(slow_path);
 
-  // Introduce a dependency on the lock_word including the rb_state,
-  // which shall prevent load-load reordering without using
-  // a memory barrier (which would be more expensive).
-  // `obj` is unchanged by this operation, but its value now depends
-  // on `temp_reg`.
-  __ Add(obj, obj, Operand(temp_reg, ShiftType::LSR, 32));
+  // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  const int32_t entry_point_offset =
+      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+  // Loading the entrypoint does not require a load acquire since it is only changed when
+  // threads are suspended or running a checkpoint.
+  GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset);
+  // The entrypoint is null when the GC is not marking; this saves a load compared to
+  // checking GetIsGcMarking.
+  __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel());
+  // Fast path: just load the reference.
+  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
+  __ Bind(slow_path->GetExitLabel());
+}
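
Stripped of register-allocation detail, the sequence emitted above implements the control flow from the comment block: the per-register mark entrypoint doubles as the `GetIsGcMarking()` flag. A rough C++ rendering with hypothetical stand-in types; the load-ordering trick and the exact lock-word gray-bit test of the slow path are not modeled:

#include <cstddef>

using MarkEntrypoint = void* (*)(void* ref);

struct ThreadModel {
  // One slot per core register; all slots are null while the GC is not marking.
  MarkEntrypoint read_barrier_mark_reg[16];
};

// Hypothetical stand-in for the gray-bit check performed in the slow path.
static bool IsGray(void* /* ref_holder */) { return true; }

static void* LoadReferenceWithBakerReadBarrier(ThreadModel* self,
                                               void* holder,
                                               void** slot,
                                               int ref_reg) {
  MarkEntrypoint mark = self->read_barrier_mark_reg[ref_reg];  // "temp3" above
  void* ref = *slot;                // original reference load
  if (mark != nullptr &&            // <=> Thread::Current()->GetIsGcMarking()
      IsGray(holder)) {             // slow path: inspect the holder's lock word
    ref = mark(ref);                // may mark and/or forward the reference
  }
  return ref;
}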
 
-  // The actual reference load.
+void CodeGeneratorARMVIXL::GenerateRawReferenceLoad(HInstruction* instruction,
+                                                    Location ref,
+                                                    vixl::aarch32::Register obj,
+                                                    uint32_t offset,
+                                                    Location index,
+                                                    ScaleFactor scale_factor,
+                                                    bool needs_null_check) {
+  Primitive::Type type = Primitive::kPrimNot;
+  vixl32::Register ref_reg = RegisterFrom(ref, type);
+
+  // If needed, vixl::EmissionCheckScope guards are used to ensure
+  // that no pools are emitted between the load (macro) instruction
+  // and MaybeRecordImplicitNullCheck.
+
   if (index.IsValid()) {
     // Load types involving an "index": ArrayGet,
     // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
     // intrinsics.
-    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+    // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
     if (index.IsConstant()) {
       size_t computed_offset =
           (Int32ConstantFrom(index) << scale_factor) + offset;
+      vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
       GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
+      if (needs_null_check) {
+        MaybeRecordImplicitNullCheck(instruction);
+      }
     } else {
       // Handle the special case of the
       // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
@@ -7424,46 +8123,27 @@
           ? LowRegisterFrom(index)
           : RegisterFrom(index);
       UseScratchRegisterScope temps(GetVIXLAssembler());
-      const vixl32::Register temp3 = temps.Acquire();
-      __ Add(temp3, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
-      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp3, offset);
+      vixl32::Register temp = temps.Acquire();
+      __ Add(temp, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
+      {
+        vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
+        GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp, offset);
+        if (needs_null_check) {
+          MaybeRecordImplicitNullCheck(instruction);
+        }
+      }
     }
   } else {
-    // /* HeapReference<Object> */ ref = *(obj + offset)
+    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
+    vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
     GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset);
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
   }
 
   // Object* ref = ref_addr->AsMirrorPtr()
   GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
-
-  // Slow path marking the object `ref` when it is gray.
-  SlowPathCodeARMVIXL* slow_path;
-  if (always_update_field) {
-    DCHECK(temp2 != nullptr);
-    // ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL only supports address
-    // of the form `obj + field_offset`, where `obj` is a register and
-    // `field_offset` is a register pair (of which only the lower half
-    // is used). Thus `offset` and `scale_factor` above are expected
-    // to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(
-        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, ref);
-  }
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::GrayState())
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with LSRS
-  // which can be a 16-bit instruction unlike the TST immediate.
-  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
-  __ B(cs, slow_path->GetEntryLabel());  // Carry flag is the last bit shifted out by LSRS.
-  __ Bind(slow_path->GetExitLabel());
 }
 
 void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction,
@@ -7738,9 +8418,7 @@
 }
 
 VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageAddressLiteral(uint32_t address) {
-  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
-  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
-  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
 VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateDexCacheAddressLiteral(uint32_t address) {
@@ -7800,8 +8478,7 @@
       /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
       boot_image_type_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
-      boot_image_address_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -7835,13 +8512,6 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
-  for (const auto& entry : boot_image_address_patches_) {
-    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
-    VIXLUInt32Literal* literal = entry.second;
-    DCHECK(literal->IsBound());
-    uint32_t literal_offset = literal->GetLocation();
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 3f52c72..781027a 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -45,6 +45,11 @@
 namespace art {
 namespace arm {
 
+// This constant is used as an approximate margin when emission of veneer and literal pools
+// must be blocked.
+static constexpr int kMaxMacroInstructionSizeInBytes =
+    15 * vixl::aarch32::kMaxInstructionSizeInBytes;
+
 static const vixl::aarch32::Register kParameterCoreRegistersVIXL[] = {
     vixl::aarch32::r1,
     vixl::aarch32::r2,
@@ -396,7 +401,6 @@
   void GenerateCompareTestAndBranch(HCondition* condition,
                                     vixl::aarch32::Label* true_target,
                                     vixl::aarch32::Label* false_target);
-  void GenerateVcmp(HInstruction* instruction);
   void GenerateFPJumps(HCondition* cond,
                        vixl::aarch32::Label* true_label,
                        vixl::aarch32::Label* false_label);
@@ -505,6 +509,8 @@
     return &(block_labels_[block->GetBlockId()]);
   }
 
+  vixl32::Label* GetFinalLabel(HInstruction* instruction, vixl32::Label* final_label);
+
   void Initialize() OVERRIDE {
     block_labels_.resize(GetGraph()->GetBlocks().size());
   }
@@ -625,6 +631,15 @@
                                                  bool always_update_field = false,
                                                  vixl::aarch32::Register* temp2 = nullptr);
 
+  // Generate a heap reference load (with no read barrier).
+  void GenerateRawReferenceLoad(HInstruction* instruction,
+                                Location ref,
+                                vixl::aarch32::Register obj,
+                                uint32_t offset,
+                                Location index,
+                                ScaleFactor scale_factor,
+                                bool needs_null_check);
+
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
@@ -738,8 +753,6 @@
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
-  // Deduplication map for patchable boot image addresses.
-  Uint32ToLiteralMap boot_image_address_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 51dd898..5f02a52 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -391,7 +391,8 @@
 
 class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS {
  public:
-  explicit TypeCheckSlowPathMIPS(HInstruction* instruction) : SlowPathCodeMIPS(instruction) {}
+  explicit TypeCheckSlowPathMIPS(HInstruction* instruction, bool is_fatal)
+      : SlowPathCodeMIPS(instruction), is_fatal_(is_fatal) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
@@ -401,7 +402,9 @@
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
 
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, locations);
+    if (!is_fatal_) {
+      SaveLiveRegisters(codegen, locations);
+    }
 
     // We're moving two locations to locations that could overlap, so we need a parallel
     // move resolver.
@@ -424,13 +427,19 @@
       CheckEntrypointTypes<kQuickCheckInstanceOf, void, mirror::Object*, mirror::Class*>();
     }
 
-    RestoreLiveRegisters(codegen, locations);
-    __ B(GetExitLabel());
+    if (!is_fatal_) {
+      RestoreLiveRegisters(codegen, locations);
+      __ B(GetExitLabel());
+    }
   }
 
   const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS"; }
 
+  bool IsFatal() const OVERRIDE { return is_fatal_; }
+
  private:
+  const bool is_fatal_;
+
   DISALLOW_COPY_AND_ASSIGN(TypeCheckSlowPathMIPS);
 };
 
@@ -482,8 +491,6 @@
                                graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       clobbered_ra_(false) {
@@ -1026,8 +1033,7 @@
       pc_relative_type_patches_.size() +
       type_bss_entry_patches_.size() +
       boot_image_string_patches_.size() +
-      boot_image_type_patches_.size() +
-      boot_image_address_patches_.size();
+      boot_image_type_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -1061,13 +1067,6 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
-  for (const auto& entry : boot_image_address_patches_) {
-    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
@@ -1125,9 +1124,7 @@
 }
 
 Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) {
-  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
-  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
-  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
 void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info,
@@ -2331,30 +2328,178 @@
 }
 
 void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction,
-      LocationSummary::kCallOnSlowPath);
+  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
+  bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
+
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      call_kind = throws_into_catch
+          ? LocationSummary::kCallOnSlowPath
+          : LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
+      break;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
+      call_kind = LocationSummary::kCallOnSlowPath;
+      break;
+  }
+
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // Note that TypeCheckSlowPathMIPS uses this register too.
   locations->AddTemp(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
   LocationSummary* locations = instruction->GetLocations();
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
-  Register obj_cls = locations->GetTemp(0).AsRegister<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
+  const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
+  MipsLabel done;
 
-  SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction);
+  // Always false for read barriers: the checks below skip read barriers (for performance and code
+  // size), which can produce false negatives, and those must fall back to the non-fatal runtime
+  // entrypoint rather than abort.
+  bool is_type_check_slow_path_fatal = false;
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
+  SlowPathCodeMIPS* slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction,
+                                                         is_type_check_slow_path_fatal);
   codegen_->AddSlowPath(slow_path);
 
-  // TODO: avoid this check if we know obj is not null.
-  __ Beqz(obj, slow_path->GetExitLabel());
-  // Compare the class of `obj` with `cls`.
-  __ LoadFromOffset(kLoadWord, obj_cls, obj, mirror::Object::ClassOffset().Int32Value());
-  __ MaybeUnpoisonHeapReference(obj_cls);
-  __ Bne(obj_cls, cls, slow_path->GetEntryLabel());
+  // Avoid this check if we know `obj` is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ Beqz(obj, &done);
+  }
+
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kArrayCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Jump to slow path for throwing the exception or doing a
+      // more involved array check.
+      __ Bne(temp, cls, slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      MipsLabel loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception.
+      __ Beqz(temp, slow_path->GetEntryLabel());
+      // Otherwise, compare the classes.
+      __ Bne(temp, cls, &loop);
+      break;
+    }
+
+    case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Walk over the class hierarchy to find a match.
+      MipsLabel loop;
+      __ Bind(&loop);
+      __ Beq(temp, cls, &done);
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception. Otherwise, jump to the beginning of the loop.
+      __ Bnez(temp, &loop);
+      __ B(slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Do an exact check.
+      __ Beq(temp, cls, &done);
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ temp = temp->component_type_
+      __ LoadFromOffset(kLoadWord, temp, temp, component_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the component type is null, jump to the slow path to throw the exception.
+      __ Beqz(temp, slow_path->GetEntryLabel());
+      // Otherwise, the object is indeed an array; further check that this component
+      // type is not a primitive type.
+      __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+      __ Bnez(temp, slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kUnresolvedCheck:
+      // We always go into the type check slow path for the unresolved check case.
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require assigning
+      // fixed registers for the inputs of this HCheckCast
+      // instruction (following the runtime calling convention), which
+      // might be cluttered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ B(slow_path->GetEntryLabel());
+      break;
+
+    case TypeCheckKind::kInterfaceCheck: {
+      // Avoid read barriers to improve the performance of the fast path. We cannot get false
+      // positives by doing this.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // /* HeapReference<Class> */ temp = temp->iftable_
+      __ LoadFromOffset(kLoadWord, temp, temp, iftable_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Iftable is never null.
+      __ Lw(TMP, temp, array_length_offset);
+      // Loop through the iftable and check if any class matches.
+      MipsLabel loop;
+      __ Bind(&loop);
+      __ Addiu(temp, temp, 2 * kHeapReferenceSize);  // Possibly in delay slot on R2.
+      __ Beqz(TMP, slow_path->GetEntryLabel());
+      __ Lw(AT, temp, object_array_data_offset - 2 * kHeapReferenceSize);
+      __ MaybeUnpoisonHeapReference(AT);
+      // Go to next interface.
+      __ Addiu(TMP, TMP, -2);
+      // Compare the classes and continue the loop if they do not match.
+      __ Bne(AT, cls, &loop);
+      break;
+    }
+  }
+
+  __ Bind(&done);
   __ Bind(slow_path->GetExitLabel());
 }
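
The kInterfaceCheck loop above steps through the iftable two heap references at a time, which reflects the iftable layout of one (interface class, method array) pair per implemented interface, and it branches to the slow path once the remaining count reaches zero. A rough equivalent in plain C++, using a hypothetical flat-array model instead of ART's mirror types:

#include <cstddef>
#include <vector>

struct ClassModel {};

// Hypothetical model of an iftable: a flat array in which every implemented
// interface occupies two slots (the interface class, then its method array).
struct IfTableModel {
  std::vector<const ClassModel*> slots;  // size is always a multiple of two
};

static bool ImplementsInterface(const IfTableModel& iftable, const ClassModel* iface) {
  for (size_t i = 0; i < iftable.slots.size(); i += 2) {
    if (iftable.slots[i] == iface) {
      return true;   // corresponds to falling through to `done` above
    }
  }
  return false;      // counter exhausted: take the slow path and throw
}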
 
@@ -5193,8 +5338,22 @@
 }
 
 void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) {
-  LocationSummary::CallKind call_kind =
-      instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath;
+  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      call_kind = LocationSummary::kNoCall;
+      break;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
+      call_kind = LocationSummary::kCallOnSlowPath;
+      break;
+  }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
@@ -5204,36 +5363,143 @@
 }
 
 void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
   LocationSummary* locations = instruction->GetLocations();
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
-
+  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   MipsLabel done;
+  SlowPathCodeMIPS* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
-  // TODO: Avoid this check if we know `obj` is not null.
-  __ Move(out, ZERO);
-  __ Beqz(obj, &done);
+  // Avoid this check if we know `obj` is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ Move(out, ZERO);
+    __ Beqz(obj, &done);
+  }
 
-  // Compare the class of `obj` with `cls`.
-  __ LoadFromOffset(kLoadWord, out, obj, mirror::Object::ClassOffset().Int32Value());
-  __ MaybeUnpoisonHeapReference(out);
-  if (instruction->IsExactCheck()) {
-    // Classes must be equal for the instanceof to succeed.
-    __ Xor(out, out, cls);
-    __ Sltiu(out, out, 1);
-  } else {
-    // If the classes are not equal, we go into a slow path.
-    DCHECK(locations->OnlyCallsOnSlowPath());
-    SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction);
-    codegen_->AddSlowPath(slow_path);
-    __ Bne(out, cls, slow_path->GetEntryLabel());
-    __ LoadConst32(out, 1);
-    __ Bind(slow_path->GetExitLabel());
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Classes must be equal for the instanceof to succeed.
+      __ Xor(out, out, cls);
+      __ Sltiu(out, out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      MipsLabel loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ out = out->super_class_
+      __ LoadFromOffset(kLoadWord, out, out, super_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Beqz(out, &done);
+      __ Bne(out, cls, &loop);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Walk over the class hierarchy to find a match.
+      MipsLabel loop, success;
+      __ Bind(&loop);
+      __ Beq(out, cls, &success);
+      // /* HeapReference<Class> */ out = out->super_class_
+      __ LoadFromOffset(kLoadWord, out, out, super_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      __ Bnez(out, &loop);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ B(&done);
+      __ Bind(&success);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Do an exact check.
+      MipsLabel success;
+      __ Beq(out, cls, &success);
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ out = out->component_type_
+      __ LoadFromOffset(kLoadWord, out, out, component_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Beqz(out, &done);
+      __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+      __ Sltiu(out, out, 1);
+      __ B(&done);
+      __ Bind(&success);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kArrayCheck: {
+      // No read barrier since the slow path will retry upon failure.
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction,
+                                                                     /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ Bne(out, cls, slow_path->GetEntryLabel());
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on slow path, but we always go
+      // into the slow path for the unresolved and interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require assigning fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be cluttered by the potential first
+      // read barrier emission at the beginning of this method.
+      //
+      // TODO: Introduce a new runtime entry point taking the object
+      // to test (instead of its class) as argument, and let it deal
+      // with the read barrier issues. This will let us refactor this
+      // case of the `switch` code as it was previously (with a direct
+      // call to the runtime not using a type checking slow path).
+      // This should also be beneficial for the other cases above.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction,
+                                                                     /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ B(slow_path->GetEntryLabel());
+      break;
+    }
   }
 
   __ Bind(&done);
+
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
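
For the non-slow-path cases above, the emitted code is a direct transcription of the usual subtype-test loops. For instance, the kClassHierarchyCheck case computes something like the following (hypothetical types, the return value standing in for the 0/1 result register `out`):

struct ClassNode {
  const ClassNode* super_class;  // null at java.lang.Object
};

// Walk up the superclass chain; mirrors the loop/success label structure above.
static int InstanceOfClassHierarchy(const ClassNode* klass, const ClassNode* cls) {
  for (const ClassNode* k = klass; k != nullptr; k = k->super_class) {
    if (k == cls) {
      return 1;
    }
  }
  return 0;  // ran off the top of the hierarchy: not an instance
}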
 
 void LocationsBuilderMIPS::VisitIntConstant(HIntConstant* constant) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 0ccd80a..98fee24 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -535,8 +535,6 @@
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
-  // Deduplication map for patchable boot image addresses.
-  Uint32ToLiteralMap boot_image_address_patches_;
   // Patches for string root accesses in JIT compiled code.
   ArenaDeque<JitPatchInfo> jit_string_patches_;
   // Patches for class root accesses in JIT compiled code.
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index d3f3598..02c3ad6 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -336,7 +336,8 @@
 
 class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 {
  public:
-  explicit TypeCheckSlowPathMIPS64(HInstruction* instruction) : SlowPathCodeMIPS64(instruction) {}
+  explicit TypeCheckSlowPathMIPS64(HInstruction* instruction, bool is_fatal)
+      : SlowPathCodeMIPS64(instruction), is_fatal_(is_fatal) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
@@ -347,7 +348,9 @@
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
 
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, locations);
+    if (!is_fatal_) {
+      SaveLiveRegisters(codegen, locations);
+    }
 
     // We're moving two locations to locations that could overlap, so we need a parallel
     // move resolver.
@@ -370,13 +373,19 @@
       CheckEntrypointTypes<kQuickCheckInstanceOf, void, mirror::Object*, mirror::Class*>();
     }
 
-    RestoreLiveRegisters(codegen, locations);
-    __ Bc(GetExitLabel());
+    if (!is_fatal_) {
+      RestoreLiveRegisters(codegen, locations);
+      __ Bc(GetExitLabel());
+    }
   }
 
   const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS64"; }
 
+  bool IsFatal() const OVERRIDE { return is_fatal_; }
+
  private:
+  const bool is_fatal_;
+
   DISALLOW_COPY_AND_ASSIGN(TypeCheckSlowPathMIPS64);
 };
 
@@ -430,8 +439,6 @@
                                graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -937,8 +944,7 @@
       pc_relative_type_patches_.size() +
       type_bss_entry_patches_.size() +
       boot_image_string_patches_.size() +
-      boot_image_type_patches_.size() +
-      boot_image_address_patches_.size();
+      boot_image_type_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -972,13 +978,6 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
-  for (const auto& entry : boot_image_address_patches_) {
-    DCHECK(GetCompilerOptions().GetIncludePatchInformation());
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel());
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   DCHECK_EQ(size, linker_patches->size());
 }
 
@@ -1042,9 +1041,7 @@
 }
 
 Literal* CodeGeneratorMIPS64::DeduplicateBootImageAddressLiteral(uint64_t address) {
-  bool needs_patch = GetCompilerOptions().GetIncludePatchInformation();
-  Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_;
-  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
+  return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_);
 }
 
 void CodeGeneratorMIPS64::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info,
@@ -1659,10 +1656,6 @@
       UNREACHABLE();
   }
 
-  if (!maybe_compressed_char_at) {
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
-  }
-
   if (type == Primitive::kPrimNot) {
     GpuRegister out = locations->Out().AsRegister<GpuRegister>();
     __ MaybeUnpoisonHeapReference(out);
@@ -1893,31 +1886,178 @@
 }
 
 void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction,
-      LocationSummary::kCallOnSlowPath);
+  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
+  bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
+
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      call_kind = throws_into_catch
+          ? LocationSummary::kCallOnSlowPath
+          : LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
+      break;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
+      call_kind = LocationSummary::kCallOnSlowPath;
+      break;
+  }
+
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // Note that TypeCheckSlowPathMIPS64 uses this register too.
   locations->AddTemp(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
   LocationSummary* locations = instruction->GetLocations();
   GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>();
   GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
-  GpuRegister obj_cls = locations->GetTemp(0).AsRegister<GpuRegister>();
+  GpuRegister temp = locations->GetTemp(0).AsRegister<GpuRegister>();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
+  const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
+  Mips64Label done;
 
+  // Always false for read barriers: the fast paths below avoid read barriers for performance
+  // and code size reasons, which can produce false negatives, so we may need to reach the
+  // entrypoint and return from it (i.e. take a non-fatal slow path).
+  bool is_type_check_slow_path_fatal = false;
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
   SlowPathCodeMIPS64* slow_path =
-      new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction);
+      new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction,
+                                                           is_type_check_slow_path_fatal);
   codegen_->AddSlowPath(slow_path);
 
-  // TODO: avoid this check if we know obj is not null.
-  __ Beqzc(obj, slow_path->GetExitLabel());
-  // Compare the class of `obj` with `cls`.
-  __ LoadFromOffset(kLoadUnsignedWord, obj_cls, obj, mirror::Object::ClassOffset().Int32Value());
-  __ MaybeUnpoisonHeapReference(obj_cls);
-  __ Bnec(obj_cls, cls, slow_path->GetEntryLabel());
+  // Avoid this check if we know `obj` is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ Beqzc(obj, &done);
+  }
+
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kArrayCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Jump to slow path for throwing the exception or doing a
+      // more involved array check.
+      __ Bnec(temp, cls, slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      Mips64Label loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, temp, super_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception.
+      __ Beqzc(temp, slow_path->GetEntryLabel());
+      // Otherwise, compare the classes.
+      __ Bnec(temp, cls, &loop);
+      break;
+    }
+
+    case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Walk over the class hierarchy to find a match.
+      Mips64Label loop;
+      __ Bind(&loop);
+      __ Beqc(temp, cls, &done);
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, temp, super_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception. Otherwise, jump to the beginning of the loop.
+      __ Bnezc(temp, &loop);
+      __ Bc(slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Do an exact check.
+      __ Beqc(temp, cls, &done);
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ temp = temp->component_type_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, temp, component_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // If the component type is null, jump to the slow path to throw the exception.
+      __ Beqzc(temp, slow_path->GetEntryLabel());
+      // Otherwise, the object is indeed an array, further check that this component
+      // type is not a primitive type.
+      __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+      __ Bnezc(temp, slow_path->GetEntryLabel());
+      break;
+    }
+
+    case TypeCheckKind::kUnresolvedCheck:
+      // We always go into the type check slow path for the unresolved check case.
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require assigning
+      // fixed registers for the inputs of this HCheckCast instruction
+      // (following the runtime calling convention), which
+      // might be cluttered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ Bc(slow_path->GetEntryLabel());
+      break;
+
+    case TypeCheckKind::kInterfaceCheck: {
+      // Avoid read barriers to improve performance of the fast path. We cannot get false
+      // positives by doing this.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // /* HeapReference<Class> */ temp = temp->iftable_
+      __ LoadFromOffset(kLoadUnsignedWord, temp, temp, iftable_offset);
+      __ MaybeUnpoisonHeapReference(temp);
+      // Iftable is never null.
+      __ Lw(TMP, temp, array_length_offset);
+      // Loop through the iftable and check if any class matches.
+      Mips64Label loop;
+      __ Bind(&loop);
+      __ Beqzc(TMP, slow_path->GetEntryLabel());
+      __ Lwu(AT, temp, object_array_data_offset);
+      __ MaybeUnpoisonHeapReference(AT);
+      // Go to next interface.
+      __ Daddiu(temp, temp, 2 * kHeapReferenceSize);
+      __ Addiu(TMP, TMP, -2);
+      // Compare the classes and continue the loop if they do not match.
+      __ Bnec(AT, cls, &loop);
+      break;
+    }
+  }
+
+  __ Bind(&done);
   __ Bind(slow_path->GetExitLabel());
 }
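
Both VisitCheckCast methods above apply the same rule for when the type-check slow path may be fatal: only the four simple kinds qualify, only when the thrown exception cannot be caught in the same method, and never when read barriers are emitted. A small self-contained sketch of that rule (names prefixed with Sketch are stand-ins introduced for this example):

#include <cassert>

enum class SketchTypeCheckKind {
  kExactCheck, kAbstractClassCheck, kClassHierarchyCheck, kArrayObjectCheck,
  kArrayCheck, kUnresolvedCheck, kInterfaceCheck
};

bool IsFatalTypeCheckSlowPath(SketchTypeCheckKind kind,
                              bool throws_into_catch,
                              bool emit_read_barriers) {
  if (emit_read_barriers) {
    // The fast paths avoid read barriers and may produce false negatives,
    // so the runtime call must be able to return.
    return false;
  }
  switch (kind) {
    case SketchTypeCheckKind::kExactCheck:
    case SketchTypeCheckKind::kAbstractClassCheck:
    case SketchTypeCheckKind::kClassHierarchyCheck:
    case SketchTypeCheckKind::kArrayObjectCheck:
      // Reaching the slow path means the cast really fails; the entrypoint
      // only throws, unless the exception is caught in the same method.
      return !throws_into_catch;
    case SketchTypeCheckKind::kArrayCheck:
    case SketchTypeCheckKind::kUnresolvedCheck:
    case SketchTypeCheckKind::kInterfaceCheck:
      // The slow path performs the full check and may return successfully.
      return false;
  }
  return false;
}

int main() {
  assert(IsFatalTypeCheckSlowPath(SketchTypeCheckKind::kExactCheck, false, false));
  assert(!IsFatalTypeCheckSlowPath(SketchTypeCheckKind::kExactCheck, true, false));
  assert(!IsFatalTypeCheckSlowPath(SketchTypeCheckKind::kInterfaceCheck, false, false));
  return 0;
}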
 
@@ -3286,8 +3426,22 @@
 }
 
 void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) {
-  LocationSummary::CallKind call_kind =
-      instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath;
+  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      call_kind = LocationSummary::kNoCall;
+      break;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
+      call_kind = LocationSummary::kCallOnSlowPath;
+      break;
+  }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
@@ -3297,37 +3451,143 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
   LocationSummary* locations = instruction->GetLocations();
   GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>();
   GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>();
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
-
+  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   Mips64Label done;
+  SlowPathCodeMIPS64* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
-  // TODO: Avoid this check if we know `obj` is not null.
-  __ Move(out, ZERO);
-  __ Beqzc(obj, &done);
+  // Avoid this check if we know `obj` is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ Move(out, ZERO);
+    __ Beqzc(obj, &done);
+  }
 
-  // Compare the class of `obj` with `cls`.
-  __ LoadFromOffset(kLoadUnsignedWord, out, obj, mirror::Object::ClassOffset().Int32Value());
-  __ MaybeUnpoisonHeapReference(out);
-  if (instruction->IsExactCheck()) {
-    // Classes must be equal for the instanceof to succeed.
-    __ Xor(out, out, cls);
-    __ Sltiu(out, out, 1);
-  } else {
-    // If the classes are not equal, we go into a slow path.
-    DCHECK(locations->OnlyCallsOnSlowPath());
-    SlowPathCodeMIPS64* slow_path =
-        new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction);
-    codegen_->AddSlowPath(slow_path);
-    __ Bnec(out, cls, slow_path->GetEntryLabel());
-    __ LoadConst32(out, 1);
-    __ Bind(slow_path->GetExitLabel());
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Classes must be equal for the instanceof to succeed.
+      __ Xor(out, out, cls);
+      __ Sltiu(out, out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      Mips64Label loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ out = out->super_class_
+      __ LoadFromOffset(kLoadUnsignedWord, out, out, super_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Beqzc(out, &done);
+      __ Bnec(out, cls, &loop);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Walk over the class hierarchy to find a match.
+      Mips64Label loop, success;
+      __ Bind(&loop);
+      __ Beqc(out, cls, &success);
+      // /* HeapReference<Class> */ out = out->super_class_
+      __ LoadFromOffset(kLoadUnsignedWord, out, out, super_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      __ Bnezc(out, &loop);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Bc(&done);
+      __ Bind(&success);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // Do an exact check.
+      Mips64Label success;
+      __ Beqc(out, cls, &success);
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ out = out->component_type_
+      __ LoadFromOffset(kLoadUnsignedWord, out, out, component_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Beqzc(out, &done);
+      __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+      __ Sltiu(out, out, 1);
+      __ Bc(&done);
+      __ Bind(&success);
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kArrayCheck: {
+      // No read barrier since the slow path will retry upon failure.
+      // /* HeapReference<Class> */ out = obj->klass_
+      __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset);
+      __ MaybeUnpoisonHeapReference(out);
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction,
+                                                                       /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ Bnec(out, cls, slow_path->GetEntryLabel());
+      __ LoadConst32(out, 1);
+      break;
+    }
+
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on slow path, but we always go
+      // into the slow path for the unresolved and interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require assigning fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be cluttered by the potential first
+      // read barrier emission at the beginning of this method.
+      //
+      // TODO: Introduce a new runtime entry point taking the object
+      // to test (instead of its class) as argument, and let it deal
+      // with the read barrier issues. This will let us refactor this
+      // case of the `switch` code as it was previously (with a direct
+      // call to the runtime not using a type checking slow path).
+      // This should also be beneficial for the other cases above.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction,
+                                                                       /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ Bc(slow_path->GetEntryLabel());
+      break;
+    }
   }
 
   __ Bind(&done);
+
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void LocationsBuilderMIPS64::VisitIntConstant(HIntConstant* constant) {
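
One detail of the kInterfaceCheck fast path in VisitCheckCast above: the loop advances by two heap references and decrements the remaining length by two per iteration because ART's IfTable interleaves each implemented interface with its method array. A plain C++ sketch of the walk the generated code performs (SketchClass is a stand-in for mirror::Class; on a miss the real code branches to the slow path instead of returning false):

#include <cstdint>

struct SketchClass {};

// iftable_data points at the first slot of the IfTable's backing object array;
// length is the raw array length, i.e. two slots per implemented interface.
bool ImplementsInterface(SketchClass* const* iftable_data,
                         int32_t length,
                         const SketchClass* cls) {
  for (int32_t remaining = length; remaining != 0; remaining -= 2) {
    if (*iftable_data == cls) {
      return true;             // Matching interface found: fall through to `done`.
    }
    iftable_data += 2;         // Skip the interface's method-array slot as well.
  }
  return false;
}

int main() {
  SketchClass iface_a, iface_b, methods;
  SketchClass* iftable[] = { &iface_a, &methods, &iface_b, &methods };
  return ImplementsInterface(iftable, 4, &iface_b) ? 0 : 1;
}
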
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 26cc7dc..3056f7f 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -492,8 +492,6 @@
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
-  // Deduplication map for patchable boot image addresses.
-  Uint32ToLiteralMap boot_image_address_patches_;
   // Patches for string root accesses in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
   // Patches for class root accesses in JIT compiled code.
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index b779aed..0b50619 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1015,7 +1015,6 @@
       assembler_(graph->GetArena()),
       isa_features_(isa_features),
       pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
@@ -4603,13 +4602,6 @@
       temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value()));
 }
 
-void CodeGeneratorX86::RecordSimplePatch() {
-  if (GetCompilerOptions().GetIncludePatchInformation()) {
-    simple_patches_.emplace_back();
-    __ Bind(&simple_patches_.back());
-  }
-}
-
 void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) {
   DCHECK(GetCompilerOptions().IsBootImage());
   HX86ComputeBaseMethodAddress* address = nullptr;
@@ -4682,17 +4674,12 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      simple_patches_.size() +
       string_patches_.size() +
       boot_image_type_patches_.size() +
       type_bss_entry_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  for (const Label& label : simple_patches_) {
-    uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   if (!GetCompilerOptions().IsBootImage()) {
     DCHECK(boot_image_type_patches_.empty());
     EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
@@ -6154,7 +6141,6 @@
           reinterpret_cast<uintptr_t>(cls->GetClass().Get()));
       DCHECK_NE(address, 0u);
       __ movl(out, Immediate(address));
-      codegen_->RecordSimplePatch();
       break;
     }
     case HLoadClass::LoadKind::kBssEntry: {
@@ -6311,7 +6297,6 @@
           reinterpret_cast<uintptr_t>(load->GetString().Get()));
       DCHECK_NE(address, 0u);
       __ movl(out, Immediate(address));
-      codegen_->RecordSimplePatch();
       return;  // No dex cache slow path.
     }
     case HLoadString::LoadKind::kBssEntry: {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 5360dc9..65ee383 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -412,7 +412,6 @@
   // Generate a call to a virtual method.
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
-  void RecordSimplePatch();
   void RecordBootStringPatch(HLoadString* load_string);
   void RecordBootTypePatch(HLoadClass* load_class);
   Label* NewTypeBssEntryPatch(HLoadClass* load_class);
@@ -633,8 +632,6 @@
 
   // PC-relative DexCache access info.
   ArenaDeque<X86PcRelativePatchInfo> pc_relative_dex_cache_patches_;
-  // Patch locations for patchoat where the linker doesn't do any other work.
-  ArenaDeque<Label> simple_patches_;
   // String patch locations; type depends on configuration (app .bss or boot image PIC/non-PIC).
   ArenaDeque<X86PcRelativePatchInfo> string_patches_;
   // Type patch locations for boot image; type depends on configuration (boot image PIC/non-PIC).
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 179bf6d..644fcee 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1070,13 +1070,6 @@
       kX86_64PointerSize).SizeValue()));
 }
 
-void CodeGeneratorX86_64::RecordSimplePatch() {
-  if (GetCompilerOptions().GetIncludePatchInformation()) {
-    simple_patches_.emplace_back();
-    __ Bind(&simple_patches_.back());
-  }
-}
-
 void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) {
   DCHECK(GetCompilerOptions().IsBootImage());
   string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_);
@@ -1126,17 +1119,12 @@
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_dex_cache_patches_.size() +
-      simple_patches_.size() +
       string_patches_.size() +
       boot_image_type_patches_.size() +
       type_bss_entry_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
-  for (const Label& label : simple_patches_) {
-    uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-    linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset));
-  }
   if (!GetCompilerOptions().IsBootImage()) {
     DCHECK(boot_image_type_patches_.empty());
     EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches);
@@ -1227,7 +1215,6 @@
         isa_features_(isa_features),
         constant_area_start_(0),
         pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
@@ -5545,7 +5532,6 @@
           reinterpret_cast<uintptr_t>(cls->GetClass().Get()));
       DCHECK_NE(address, 0u);
       __ movl(out, Immediate(address));  // Zero-extended.
-      codegen_->RecordSimplePatch();
       break;
     }
     case HLoadClass::LoadKind::kBssEntry: {
@@ -5681,7 +5667,6 @@
           reinterpret_cast<uintptr_t>(load->GetString().Get()));
       DCHECK_NE(address, 0u);
       __ movl(out, Immediate(address));  // Zero-extended.
-      codegen_->RecordSimplePatch();
       return;  // No dex cache slow path.
     }
     case HLoadString::LoadKind::kBssEntry: {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3a83731..376c3ce 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -406,7 +406,6 @@
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE;
   void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE;
 
-  void RecordSimplePatch();
   void RecordBootStringPatch(HLoadString* load_string);
   void RecordBootTypePatch(HLoadClass* load_class);
   Label* NewTypeBssEntryPatch(HLoadClass* load_class);
@@ -602,8 +601,6 @@
 
   // PC-relative DexCache access info.
   ArenaDeque<PatchInfo<Label>> pc_relative_dex_cache_patches_;
-  // Patch locations for patchoat where the linker doesn't do any other work.
-  ArenaDeque<Label> simple_patches_;
   // String patch locations; type depends on configuration (app .bss or boot image PIC).
   ArenaDeque<PatchInfo<Label>> string_patches_;
   // Type patch locations for boot image (always PIC).
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 1cd65c1..d6513c8 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -57,21 +57,27 @@
   return false;
 }
 
-/** Returns b^e for b,e >= 1. Sets overflow if arithmetic wrap-around occurred. */
+/** Computes a * b for a,b > 0. Sets overflow if the exact result does not fit in int64_t. */
+static int64_t SafeMul(int64_t a, int64_t b, /*out*/ bool* overflow) {
+  if (a > 0 && b > 0 && a > (std::numeric_limits<int64_t>::max() / b)) {
+    *overflow = true;
+  }
+  return a * b;
+}
+
+/** Returns b^e for b,e > 0. Sets overflow if the result does not fit in int64_t. */
 static int64_t IntPow(int64_t b, int64_t e, /*out*/ bool* overflow) {
-  DCHECK_GE(b, 1);
-  DCHECK_GE(e, 1);
+  DCHECK_LT(0, b);
+  DCHECK_LT(0, e);
   int64_t pow = 1;
   while (e) {
     if (e & 1) {
-      int64_t oldpow = pow;
-      pow *= b;
-      if (pow < oldpow) {
-        *overflow = true;
-      }
+      pow = SafeMul(pow, b, overflow);
     }
     e >>= 1;
-    b *= b;
+    if (e) {
+      b = SafeMul(b, b, overflow);
+    }
   }
   return pow;
 }
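
The rewrite above checks for overflow before each multiplication against std::numeric_limits instead of relying on the old wrap-around test (`pow < oldpow`), and it squares the base only while exponent bits remain, so the final iteration cannot flag a spurious overflow. A standalone sketch with a tiny self-check; the one deliberate difference from the code above is that SafeMul returns 0 once overflow is detected, to keep the example free of signed-overflow behaviour:

#include <cassert>
#include <cstdint>
#include <limits>

static int64_t SafeMul(int64_t a, int64_t b, /*out*/ bool* overflow) {
  if (a > 0 && b > 0 && a > (std::numeric_limits<int64_t>::max() / b)) {
    *overflow = true;
    return 0;  // The patch returns a * b regardless; the result is unused on overflow.
  }
  return a * b;
}

static int64_t IntPow(int64_t b, int64_t e, /*out*/ bool* overflow) {
  int64_t pow = 1;
  while (e) {
    if (e & 1) {
      pow = SafeMul(pow, b, overflow);
    }
    e >>= 1;
    if (e) {
      b = SafeMul(b, b, overflow);  // Only square when more exponent bits remain.
    }
  }
  return pow;
}

int main() {
  bool overflow = false;
  assert(IntPow(3, 4, &overflow) == 81 && !overflow);
  IntPow(2, 63, &overflow);  // 2^63 does not fit in int64_t; must be flagged.
  assert(overflow);
  return 0;
}
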
@@ -384,7 +390,8 @@
   HInductionVarAnalysis::InductionInfo* trip = nullptr;
   if (HasInductionInfo(instruction, instruction, &loop, &info, &trip)) {
     if (info->induction_class == HInductionVarAnalysis::kLinear &&
-        info->op_b->operation == HInductionVarAnalysis::kFetch) {
+        info->op_b->operation == HInductionVarAnalysis::kFetch &&
+        !HInductionVarAnalysis::IsNarrowingLinear(info)) {
       int64_t stride_value = 0;
       if (IsConstant(info->op_a, kExact, &stride_value) && stride_value == 1) {
         int64_t off_value = 0;
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 3e34090..583008b 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -249,20 +249,25 @@
   ProfilingInfo* const profiling_info_;
 };
 
-static bool IsMonomorphic(Handle<mirror::ObjectArray<mirror::Class>> classes)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  DCHECK_GE(InlineCache::kIndividualCacheSize, 2);
-  return classes->Get(0) != nullptr && classes->Get(1) == nullptr;
-}
-
-static bool IsMegamorphic(Handle<mirror::ObjectArray<mirror::Class>> classes)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  for (size_t i = 0; i < InlineCache::kIndividualCacheSize; ++i) {
-    if (classes->Get(i) == nullptr) {
-      return false;
+HInliner::InlineCacheType HInliner::GetInlineCacheType(
+    const Handle<mirror::ObjectArray<mirror::Class>>& classes)
+  REQUIRES_SHARED(Locks::mutator_lock_) {
+  uint8_t number_of_types = 0;
+  for (; number_of_types < InlineCache::kIndividualCacheSize; ++number_of_types) {
+    if (classes->Get(number_of_types) == nullptr) {
+      break;
     }
   }
-  return true;
+
+  if (number_of_types == 0) {
+    return kInlineCacheUninitialized;
+  } else if (number_of_types == 1) {
+    return kInlineCacheMonomorphic;
+  } else if (number_of_types == InlineCache::kIndividualCacheSize) {
+    return kInlineCacheMegamorphic;
+  } else {
+    return kInlineCachePolymorphic;
+  }
 }
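
As a worked example of the classification above, here it is reduced to a plain array. The cache size is fixed to 5 purely for this sketch (the real bound is InlineCache::kIndividualCacheSize); kInlineCacheNoData and kInlineCacheMissingTypes are produced by the callers (no profile data, unresolvable types), not by this function:

#include <array>
#include <cassert>
#include <cstddef>

enum SketchCacheType { kUninitialized, kMonomorphic, kPolymorphic, kMegamorphic };

constexpr std::size_t kIndividualCacheSize = 5;  // Assumed value, for illustration only.

SketchCacheType Classify(const std::array<const void*, kIndividualCacheSize>& classes) {
  std::size_t number_of_types = 0;
  while (number_of_types < kIndividualCacheSize && classes[number_of_types] != nullptr) {
    ++number_of_types;  // Count the leading non-null entries, as the inliner does.
  }
  if (number_of_types == 0) return kUninitialized;
  if (number_of_types == 1) return kMonomorphic;
  if (number_of_types == kIndividualCacheSize) return kMegamorphic;
  return kPolymorphic;
}

int main() {
  int a, b;
  assert(Classify({nullptr, nullptr, nullptr, nullptr, nullptr}) == kUninitialized);
  assert(Classify({&a, nullptr, nullptr, nullptr, nullptr}) == kMonomorphic);
  assert(Classify({&a, &b, nullptr, nullptr, nullptr}) == kPolymorphic);
  assert(Classify({&a, &b, &a, &b, &a}) == kMegamorphic);
  return 0;
}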
 
 static mirror::Class* GetMonomorphicType(Handle<mirror::ObjectArray<mirror::Class>> classes)
@@ -271,18 +276,6 @@
   return classes->Get(0);
 }
 
-static bool IsUninitialized(Handle<mirror::ObjectArray<mirror::Class>> classes)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  return classes->Get(0) == nullptr;
-}
-
-static bool IsPolymorphic(Handle<mirror::ObjectArray<mirror::Class>> classes)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  DCHECK_GE(InlineCache::kIndividualCacheSize, 3);
-  return classes->Get(1) != nullptr &&
-      classes->Get(InlineCache::kIndividualCacheSize - 1) == nullptr;
-}
-
 ArtMethod* HInliner::TryCHADevirtualization(ArtMethod* resolved_method) {
   if (!resolved_method->HasSingleImplementation()) {
     return nullptr;
@@ -353,67 +346,209 @@
     }
     return result;
   }
-
   DCHECK(!invoke_instruction->IsInvokeStaticOrDirect());
 
-  // Check if we can use an inline cache.
-  ArtMethod* caller = graph_->GetArtMethod();
-  if (Runtime::Current()->UseJitCompilation()) {
-    // Under JIT, we should always know the caller.
-    DCHECK(caller != nullptr);
-    ScopedProfilingInfoInlineUse spiis(caller, soa.Self());
-    ProfilingInfo* profiling_info = spiis.GetProfilingInfo();
-    if (profiling_info != nullptr) {
-      StackHandleScope<1> hs(soa.Self());
-      ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker();
-      Handle<mirror::ObjectArray<mirror::Class>> inline_cache = hs.NewHandle(
-          mirror::ObjectArray<mirror::Class>::Alloc(
-              soa.Self(),
-              class_linker->GetClassRoot(ClassLinker::kClassArrayClass),
-              InlineCache::kIndividualCacheSize));
-      if (inline_cache == nullptr) {
-        // We got an OOME. Just clear the exception, and don't inline.
-        DCHECK(soa.Self()->IsExceptionPending());
-        soa.Self()->ClearException();
-        VLOG(compiler) << "Out of memory in the compiler when trying to inline";
-        return false;
+  // Try using inline caches.
+  return TryInlineFromInlineCache(caller_dex_file, invoke_instruction, resolved_method);
+}
+
+static Handle<mirror::ObjectArray<mirror::Class>> AllocateInlineCacheHolder(
+    const DexCompilationUnit& compilation_unit,
+    StackHandleScope<1>* hs)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  Thread* self = Thread::Current();
+  ClassLinker* class_linker = compilation_unit.GetClassLinker();
+  Handle<mirror::ObjectArray<mirror::Class>> inline_cache = hs->NewHandle(
+      mirror::ObjectArray<mirror::Class>::Alloc(
+          self,
+          class_linker->GetClassRoot(ClassLinker::kClassArrayClass),
+          InlineCache::kIndividualCacheSize));
+  if (inline_cache == nullptr) {
+    // We got an OOME. Just clear the exception, and don't inline.
+    DCHECK(self->IsExceptionPending());
+    self->ClearException();
+    VLOG(compiler) << "Out of memory in the compiler when trying to inline";
+  }
+  return inline_cache;
+}
+
+bool HInliner::TryInlineFromInlineCache(const DexFile& caller_dex_file,
+                                        HInvoke* invoke_instruction,
+                                        ArtMethod* resolved_method)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  StackHandleScope<1> hs(Thread::Current());
+  Handle<mirror::ObjectArray<mirror::Class>> inline_cache;
+  InlineCacheType inline_cache_type = Runtime::Current()->IsAotCompiler()
+      ? GetInlineCacheAOT(caller_dex_file, invoke_instruction, &hs, &inline_cache)
+      : GetInlineCacheJIT(invoke_instruction, &hs, &inline_cache);
+
+  switch (inline_cache_type) {
+    case kInlineCacheNoData:
+      // No inline cache data is available (e.g. no profiling info); there is
+      // nothing to inline from, so simply bail out.
+      return false;
+
+    case kInlineCacheUninitialized:
+      VLOG(compiler) << "Interface or virtual call to "
+                     << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex())
+                     << " is not hit and not inlined";
+      return false;
+
+    case kInlineCacheMonomorphic:
+      MaybeRecordStat(kMonomorphicCall);
+      if (outermost_graph_->IsCompilingOsr()) {
+        // If we are compiling OSR, we pretend this call is polymorphic, as we may come from the
+        // interpreter and it may have seen different receiver types.
+        return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache);
       } else {
-        Runtime::Current()->GetJit()->GetCodeCache()->CopyInlineCacheInto(
-            *profiling_info->GetInlineCache(invoke_instruction->GetDexPc()),
-            inline_cache);
-        if (IsUninitialized(inline_cache)) {
-          VLOG(compiler) << "Interface or virtual call to "
-                         << caller_dex_file.PrettyMethod(method_index)
-                         << " is not hit and not inlined";
-          return false;
-        } else if (IsMonomorphic(inline_cache)) {
-          MaybeRecordStat(kMonomorphicCall);
-          if (outermost_graph_->IsCompilingOsr()) {
-            // If we are compiling OSR, we pretend this call is polymorphic, as we may come from the
-            // interpreter and it may have seen different receiver types.
-            return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache);
-          } else {
-            return TryInlineMonomorphicCall(invoke_instruction, resolved_method, inline_cache);
-          }
-        } else if (IsPolymorphic(inline_cache)) {
-          MaybeRecordStat(kPolymorphicCall);
-          return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache);
-        } else {
-          DCHECK(IsMegamorphic(inline_cache));
-          VLOG(compiler) << "Interface or virtual call to "
-                         << caller_dex_file.PrettyMethod(method_index)
-                         << " is megamorphic and not inlined";
-          MaybeRecordStat(kMegamorphicCall);
-          return false;
-        }
+        return TryInlineMonomorphicCall(invoke_instruction, resolved_method, inline_cache);
       }
+
+    case kInlineCachePolymorphic:
+      MaybeRecordStat(kPolymorphicCall);
+      return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache);
+
+    case kInlineCacheMegamorphic:
+      VLOG(compiler) << "Interface or virtual call to "
+                     << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex())
+                     << " is megamorphic and not inlined";
+      MaybeRecordStat(kMegamorphicCall);
+      return false;
+
+    case kInlineCacheMissingTypes:
+      VLOG(compiler) << "Interface or virtual call to "
+                     << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex())
+                     << " is missing types and not inlined";
+      return false;
+  }
+  UNREACHABLE();
+}
+
+HInliner::InlineCacheType HInliner::GetInlineCacheJIT(
+    HInvoke* invoke_instruction,
+    StackHandleScope<1>* hs,
+    /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK(Runtime::Current()->UseJitCompilation());
+
+  ArtMethod* caller = graph_->GetArtMethod();
+  // Under JIT, we should always know the caller.
+  DCHECK(caller != nullptr);
+  ScopedProfilingInfoInlineUse spiis(caller, Thread::Current());
+  ProfilingInfo* profiling_info = spiis.GetProfilingInfo();
+
+  if (profiling_info == nullptr) {
+    return kInlineCacheNoData;
+  }
+
+  *inline_cache = AllocateInlineCacheHolder(caller_compilation_unit_, hs);
+  if (inline_cache->Get() == nullptr) {
+    // We can't extract any data if we failed to allocate.
+    return kInlineCacheNoData;
+  } else {
+    Runtime::Current()->GetJit()->GetCodeCache()->CopyInlineCacheInto(
+        *profiling_info->GetInlineCache(invoke_instruction->GetDexPc()),
+        *inline_cache);
+    return GetInlineCacheType(*inline_cache);
+  }
+}
+
+HInliner::InlineCacheType HInliner::GetInlineCacheAOT(
+    const DexFile& caller_dex_file,
+    HInvoke* invoke_instruction,
+    StackHandleScope<1>* hs,
+    /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK(Runtime::Current()->IsAotCompiler());
+  const ProfileCompilationInfo* pci = compiler_driver_->GetProfileCompilationInfo();
+  if (pci == nullptr) {
+    return kInlineCacheNoData;
+  }
+
+  ProfileCompilationInfo::OfflineProfileMethodInfo offline_profile;
+  bool found = pci->GetMethod(caller_dex_file.GetLocation(),
+                              caller_dex_file.GetLocationChecksum(),
+                              caller_compilation_unit_.GetDexMethodIndex(),
+                              &offline_profile);
+  if (!found) {
+    return kInlineCacheNoData;  // No profile information for this invocation.
+  }
+
+  *inline_cache = AllocateInlineCacheHolder(caller_compilation_unit_, hs);
+  if (inline_cache->Get() == nullptr) {
+    // We can't extract any data if we failed to allocate.
+    return kInlineCacheNoData;
+  } else {
+    return ExtractClassesFromOfflineProfile(invoke_instruction,
+                                            offline_profile,
+                                            *inline_cache);
+  }
+}
+
+HInliner::InlineCacheType HInliner::ExtractClassesFromOfflineProfile(
+    const HInvoke* invoke_instruction,
+    const ProfileCompilationInfo::OfflineProfileMethodInfo& offline_profile,
+    /*out*/Handle<mirror::ObjectArray<mirror::Class>> inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const auto it = offline_profile.inline_caches.find(invoke_instruction->GetDexPc());
+  if (it == offline_profile.inline_caches.end()) {
+    return kInlineCacheUninitialized;
+  }
+
+  const ProfileCompilationInfo::DexPcData& dex_pc_data = it->second;
+
+  if (dex_pc_data.is_missing_types) {
+    return kInlineCacheMissingTypes;
+  }
+  if (dex_pc_data.is_megamorphic) {
+    return kInlineCacheMegamorphic;
+  }
+
+  DCHECK_LE(dex_pc_data.classes.size(), InlineCache::kIndividualCacheSize);
+  Thread* self = Thread::Current();
+  // We need to resolve the class relative to the containing dex file.
+  // So first, build a mapping from the index of dex file in the profile to
+  // its dex cache. This will avoid repeating the lookup when walking over
+  // the inline cache types.
+  std::vector<ObjPtr<mirror::DexCache>> dex_profile_index_to_dex_cache(
+        offline_profile.dex_references.size());
+  for (size_t i = 0; i < offline_profile.dex_references.size(); i++) {
+    bool found = false;
+    for (const DexFile* dex_file : compiler_driver_->GetDexFilesForOatFile()) {
+      if (offline_profile.dex_references[i].MatchesDex(dex_file)) {
+        dex_profile_index_to_dex_cache[i] =
+            caller_compilation_unit_.GetClassLinker()->FindDexCache(self, *dex_file);
+        found = true;
+      }
+    }
+    if (!found) {
+      VLOG(compiler) << "Could not find profiled dex file: "
+          << offline_profile.dex_references[i].dex_location;
+      return kInlineCacheMissingTypes;
     }
   }
 
-  VLOG(compiler) << "Interface or virtual call to "
-                 << caller_dex_file.PrettyMethod(method_index)
-                 << " could not be statically determined";
-  return false;
+  // Walk over the classes and resolve them. If we cannot find a type, we return
+  // kInlineCacheMissingTypes.
+  int ic_index = 0;
+  for (const ProfileCompilationInfo::ClassReference& class_ref : dex_pc_data.classes) {
+    ObjPtr<mirror::DexCache> dex_cache =
+        dex_profile_index_to_dex_cache[class_ref.dex_profile_index];
+    DCHECK(dex_cache != nullptr);
+    ObjPtr<mirror::Class> clazz = ClassLinker::LookupResolvedType(
+          class_ref.type_index,
+          dex_cache,
+          caller_compilation_unit_.GetClassLoader().Get());
+    if (clazz != nullptr) {
+      inline_cache->Set(ic_index++, clazz);
+    } else {
+      VLOG(compiler) << "Could not resolve class from inline cache in AOT mode "
+          << caller_compilation_unit_.GetDexFile()->PrettyMethod(
+              invoke_instruction->GetDexMethodIndex()) << " : "
+          << caller_compilation_unit_
+              .GetDexFile()->StringByTypeIdx(class_ref.type_index);
+      return kInlineCacheMissingTypes;
+    }
+  }
+  return GetInlineCacheType(inline_cache);
 }
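
For reference, the shape of the offline profile data that ExtractClassesFromOfflineProfile() consumes, reduced to plain structs. The field names (dex_references, inline_caches, classes, dex_profile_index, type_index, is_missing_types, is_megamorphic) follow the ones used above; everything else is a simplified stand-in rather than the real ProfileCompilationInfo types:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct SketchDexReference {
  std::string dex_location;   // Which dex file the type indices below refer to.
  uint32_t checksum;
};

struct SketchClassReference {
  uint8_t dex_profile_index;  // Index into dex_references.
  uint16_t type_index;        // Type index inside that dex file.
};

struct SketchDexPcData {
  bool is_missing_types = false;
  bool is_megamorphic = false;
  std::vector<SketchClassReference> classes;
};

struct SketchOfflineProfile {
  std::vector<SketchDexReference> dex_references;
  // Keyed by the dex pc of the invoke instruction, as in the lookup above.
  std::map<uint32_t, SketchDexPcData> inline_caches;
};

int main() {
  SketchOfflineProfile profile;
  profile.dex_references.push_back({"core.dex", 0x1234u});
  profile.inline_caches[/* dex_pc */ 42u].classes.push_back({0u, 7u});
  // Mirrors the find() at the top of ExtractClassesFromOfflineProfile().
  return profile.inline_caches.find(42u) != profile.inline_caches.end() ? 0 : 1;
}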
 
 HInstanceFieldGet* HInliner::BuildGetReceiverClass(ClassLinker* class_linker,
@@ -549,13 +684,20 @@
                                                                is_referrer,
                                                                invoke_instruction->GetDexPc(),
                                                                /* needs_access_check */ false);
-  HLoadClass::LoadKind kind = HSharpening::SharpenClass(
+  HLoadClass::LoadKind kind = HSharpening::ComputeLoadClassKind(
       load_class, codegen_, compiler_driver_, caller_compilation_unit_);
   DCHECK(kind != HLoadClass::LoadKind::kInvalid)
       << "We should always be able to reference a class for inline caches";
   // Insert before setting the kind, as setting the kind affects the inputs.
   bb_cursor->InsertInstructionAfter(load_class, receiver_class);
   load_class->SetLoadKind(kind);
+  // In AOT mode, we will most likely load the class from BSS, which will involve a call
+  // to the runtime. In this case, the load instruction needs an environment, so we copy
+  // it from the invoke instruction.
+  if (load_class->NeedsEnvironment()) {
+    DCHECK(Runtime::Current()->IsAotCompiler());
+    load_class->CopyEnvironmentFrom(invoke_instruction->GetEnvironment());
+  }
 
   HNotEqual* compare = new (graph_->GetArena()) HNotEqual(load_class, receiver_class);
   bb_cursor->InsertInstructionAfter(compare, load_class);
@@ -746,7 +888,10 @@
     ArtMethod* resolved_method,
     Handle<mirror::ObjectArray<mirror::Class>> classes) {
   // This optimization only works under JIT for now.
-  DCHECK(Runtime::Current()->UseJitCompilation());
+  if (!Runtime::Current()->UseJitCompilation()) {
+    return false;
+  }
+
   if (graph_->GetInstructionSet() == kMips64) {
     // TODO: Support HClassTableGet for mips64.
     return false;
@@ -1064,9 +1209,8 @@
         // TODO: Needs null check.
         return false;
       }
-      Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache()));
       HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, data.object_arg);
-      HInstanceFieldGet* iget = CreateInstanceFieldGet(dex_cache, data.field_idx, obj);
+      HInstanceFieldGet* iget = CreateInstanceFieldGet(data.field_idx, resolved_method, obj);
       DCHECK_EQ(iget->GetFieldOffset().Uint32Value(), data.field_offset);
       DCHECK_EQ(iget->IsVolatile() ? 1u : 0u, data.is_volatile);
       invoke_instruction->GetBlock()->InsertInstructionBefore(iget, invoke_instruction);
@@ -1079,10 +1223,9 @@
         // TODO: Needs null check.
         return false;
       }
-      Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache()));
       HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, data.object_arg);
       HInstruction* value = GetInvokeInputForArgVRegIndex(invoke_instruction, data.src_arg);
-      HInstanceFieldSet* iput = CreateInstanceFieldSet(dex_cache, data.field_idx, obj, value);
+      HInstanceFieldSet* iput = CreateInstanceFieldSet(data.field_idx, resolved_method, obj, value);
       DCHECK_EQ(iput->GetFieldOffset().Uint32Value(), data.field_offset);
       DCHECK_EQ(iput->IsVolatile() ? 1u : 0u, data.is_volatile);
       invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction);
@@ -1116,24 +1259,19 @@
                                  [](uint16_t index) { return index != DexFile::kDexNoIndex16; }));
 
       // Create HInstanceFieldSet for each IPUT that stores non-zero data.
-      Handle<mirror::DexCache> dex_cache;
       HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, /* this */ 0u);
       bool needs_constructor_barrier = false;
       for (size_t i = 0; i != number_of_iputs; ++i) {
         HInstruction* value = GetInvokeInputForArgVRegIndex(invoke_instruction, iput_args[i]);
         if (!value->IsConstant() || !value->AsConstant()->IsZeroBitPattern()) {
-          if (dex_cache.GetReference() == nullptr) {
-            dex_cache = handles_->NewHandle(resolved_method->GetDexCache());
-          }
           uint16_t field_index = iput_field_indexes[i];
-          HInstanceFieldSet* iput = CreateInstanceFieldSet(dex_cache, field_index, obj, value);
+          bool is_final;
+          HInstanceFieldSet* iput =
+              CreateInstanceFieldSet(field_index, resolved_method, obj, value, &is_final);
           invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction);
 
           // Check whether the field is final. If it is, we need to add a barrier.
-          PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet());
-          ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size);
-          DCHECK(resolved_field != nullptr);
-          if (resolved_field->IsFinal()) {
+          if (is_final) {
             needs_constructor_barrier = true;
           }
         }
@@ -1152,12 +1290,13 @@
   return true;
 }
 
-HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex_cache,
-                                                    uint32_t field_index,
+HInstanceFieldGet* HInliner::CreateInstanceFieldGet(uint32_t field_index,
+                                                    ArtMethod* referrer,
                                                     HInstruction* obj)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet());
-  ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size);
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* resolved_field =
+      class_linker->LookupResolvedField(field_index, referrer, /* is_static */ false);
   DCHECK(resolved_field != nullptr);
   HInstanceFieldGet* iget = new (graph_->GetArena()) HInstanceFieldGet(
       obj,
@@ -1167,12 +1306,13 @@
       resolved_field->IsVolatile(),
       field_index,
       resolved_field->GetDeclaringClass()->GetDexClassDefIndex(),
-      *dex_cache->GetDexFile(),
+      *referrer->GetDexFile(),
       // Read barrier generates a runtime call in slow path and we need a valid
       // dex pc for the associated stack map. 0 is bogus but valid. Bug: 26854537.
       /* dex_pc */ 0);
   if (iget->GetType() == Primitive::kPrimNot) {
     // Use the referrer's dex cache as the hint_dex_cache.
+    Handle<mirror::DexCache> dex_cache = handles_->NewHandle(referrer->GetDexCache());
     ReferenceTypePropagation rtp(graph_,
                                  outer_compilation_unit_.GetClassLoader(),
                                  dex_cache,
@@ -1183,14 +1323,21 @@
   return iget;
 }
 
-HInstanceFieldSet* HInliner::CreateInstanceFieldSet(Handle<mirror::DexCache> dex_cache,
-                                                    uint32_t field_index,
+HInstanceFieldSet* HInliner::CreateInstanceFieldSet(uint32_t field_index,
+                                                    ArtMethod* referrer,
                                                     HInstruction* obj,
-                                                    HInstruction* value)
+                                                    HInstruction* value,
+                                                    bool* is_final)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet());
-  ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size);
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* resolved_field =
+      class_linker->LookupResolvedField(field_index, referrer, /* is_static */ false);
   DCHECK(resolved_field != nullptr);
+  if (is_final != nullptr) {
+    // This information is needed only for constructors.
+    DCHECK(referrer->IsConstructor());
+    *is_final = resolved_field->IsFinal();
+  }
   HInstanceFieldSet* iput = new (graph_->GetArena()) HInstanceFieldSet(
       obj,
       value,
@@ -1200,7 +1347,7 @@
       resolved_field->IsVolatile(),
       field_index,
       resolved_field->GetDeclaringClass()->GetDexClassDefIndex(),
-      *dex_cache->GetDexFile(),
+      *referrer->GetDexFile(),
       // Read barrier generates a runtime call in slow path and we need a valid
       // dex pc for the associated stack map. 0 is bogus but valid. Bug: 26854537.
       /* dex_pc */ 0);
@@ -1380,6 +1527,13 @@
                        << " could not be inlined because one branch always throws and"
                        << " caller does not have an exit block";
         return false;
+      } else if (graph_->HasIrreducibleLoops()) {
+        // TODO(ngeoffray): Support re-computing loop information for graphs with
+        // irreducible loops?
+        VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index)
+                       << " could not be inlined because one branch always throws and"
+                       << " caller has irreducible loops";
+        return false;
       }
     } else {
       has_one_return = true;
@@ -1491,7 +1645,7 @@
   HDeadCodeElimination dce(callee_graph, inline_stats_, "dead_code_elimination$inliner");
   HConstantFolding fold(callee_graph, "constant_folding$inliner");
   HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_);
-  InstructionSimplifier simplify(callee_graph, inline_stats_);
+  InstructionSimplifier simplify(callee_graph, codegen_, inline_stats_);
   IntrinsicsRecognizer intrinsics(callee_graph, inline_stats_);
 
   HOptimization* optimizations[] = {
diff --git a/compiler/optimizing/inliner.h b/compiler/optimizing/inliner.h
index 75d025a..a032042 100644
--- a/compiler/optimizing/inliner.h
+++ b/compiler/optimizing/inliner.h
@@ -20,6 +20,7 @@
 #include "dex_file_types.h"
 #include "invoke_type.h"
 #include "optimization.h"
+#include "jit/profile_compilation_info.h"
 
 namespace art {
 
@@ -59,6 +60,15 @@
   static constexpr const char* kInlinerPassName = "inliner";
 
  private:
+  enum InlineCacheType {
+    kInlineCacheNoData = 0,
+    kInlineCacheUninitialized = 1,
+    kInlineCacheMonomorphic = 2,
+    kInlineCachePolymorphic = 3,
+    kInlineCacheMegamorphic = 4,
+    kInlineCacheMissingTypes = 5
+  };
+
   bool TryInline(HInvoke* invoke_instruction);
 
   // Try to inline `resolved_method` in place of `invoke_instruction`. `do_rtp` is whether
@@ -97,14 +107,54 @@
     REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Create a new HInstanceFieldGet.
-  HInstanceFieldGet* CreateInstanceFieldGet(Handle<mirror::DexCache> dex_cache,
-                                            uint32_t field_index,
+  HInstanceFieldGet* CreateInstanceFieldGet(uint32_t field_index,
+                                            ArtMethod* referrer,
                                             HInstruction* obj);
   // Create a new HInstanceFieldSet.
-  HInstanceFieldSet* CreateInstanceFieldSet(Handle<mirror::DexCache> dex_cache,
-                                            uint32_t field_index,
+  HInstanceFieldSet* CreateInstanceFieldSet(uint32_t field_index,
+                                            ArtMethod* referrer,
                                             HInstruction* obj,
-                                            HInstruction* value);
+                                            HInstruction* value,
+                                            bool* is_final = nullptr);
+
+  // Try inlining the invoke instruction using inline caches.
+  bool TryInlineFromInlineCache(
+      const DexFile& caller_dex_file,
+      HInvoke* invoke_instruction,
+      ArtMethod* resolved_method)
+    REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Try getting the inline cache from the JIT code cache.
+  // Return the cache state, or kInlineCacheNoData if the inline cache could
+  // not be allocated or no profiling info was available.
+  InlineCacheType GetInlineCacheJIT(
+      HInvoke* invoke_instruction,
+      StackHandleScope<1>* hs,
+      /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Try getting the inline cache from the AOT offline profile.
+  // Return the cache state, or kInlineCacheNoData if the inline cache could
+  // not be allocated or the profile has no data for this invoke.
+  InlineCacheType GetInlineCacheAOT(
+      const DexFile& caller_dex_file,
+      HInvoke* invoke_instruction,
+      StackHandleScope<1>* hs,
+      /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Extract the mirror classes from the offline profile and add them to the `inline_cache`.
+  // Note that even if we have profile data for the invoke, the inline_cache might contain
+  // only null entries if the types cannot be resolved.
+  InlineCacheType ExtractClassesFromOfflineProfile(
+      const HInvoke* invoke_instruction,
+      const ProfileCompilationInfo::OfflineProfileMethodInfo& offline_profile,
+      /*out*/Handle<mirror::ObjectArray<mirror::Class>> inline_cache)
+    REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Compute the inline cache type.
+  InlineCacheType GetInlineCacheType(
+      const Handle<mirror::ObjectArray<mirror::Class>>& classes)
+    REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Try to inline the target of a monomorphic call. If successful, the code
   // in the graph will look like:
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index c60f6e5..88f67fa 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -37,37 +37,45 @@
   return block_builder_->GetBlockAt(dex_pc);
 }
 
-ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) {
+inline ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) {
   ArenaVector<HInstruction*>* locals = &locals_for_[block->GetBlockId()];
   const size_t vregs = graph_->GetNumberOfVRegs();
-  if (locals->size() != vregs) {
-    locals->resize(vregs, nullptr);
+  if (locals->size() == vregs) {
+    return locals;
+  }
+  return GetLocalsForWithAllocation(block, locals, vregs);
+}
 
-    if (block->IsCatchBlock()) {
-      // We record incoming inputs of catch phis at throwing instructions and
-      // must therefore eagerly create the phis. Phis for undefined vregs will
-      // be deleted when the first throwing instruction with the vreg undefined
-      // is encountered. Unused phis will be removed by dead phi analysis.
-      for (size_t i = 0; i < vregs; ++i) {
-        // No point in creating the catch phi if it is already undefined at
-        // the first throwing instruction.
-        HInstruction* current_local_value = (*current_locals_)[i];
-        if (current_local_value != nullptr) {
-          HPhi* phi = new (arena_) HPhi(
-              arena_,
-              i,
-              0,
-              current_local_value->GetType());
-          block->AddPhi(phi);
-          (*locals)[i] = phi;
-        }
+ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsForWithAllocation(
+    HBasicBlock* block,
+    ArenaVector<HInstruction*>* locals,
+    const size_t vregs) {
+  DCHECK_NE(locals->size(), vregs);
+  locals->resize(vregs, nullptr);
+  if (block->IsCatchBlock()) {
+    // We record incoming inputs of catch phis at throwing instructions and
+    // must therefore eagerly create the phis. Phis for undefined vregs will
+    // be deleted when the first throwing instruction with the vreg undefined
+    // is encountered. Unused phis will be removed by dead phi analysis.
+    for (size_t i = 0; i < vregs; ++i) {
+      // No point in creating the catch phi if it is already undefined at
+      // the first throwing instruction.
+      HInstruction* current_local_value = (*current_locals_)[i];
+      if (current_local_value != nullptr) {
+        HPhi* phi = new (arena_) HPhi(
+            arena_,
+            i,
+            0,
+            current_local_value->GetType());
+        block->AddPhi(phi);
+        (*locals)[i] = phi;
       }
     }
   }
   return locals;
 }
 
-HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) {
+inline HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) {
   ArenaVector<HInstruction*>* locals = GetLocalsFor(block);
   return (*locals)[local];
 }
@@ -1676,10 +1684,10 @@
       dex_pc,
       needs_access_check);
 
-  HLoadClass::LoadKind load_kind = HSharpening::SharpenClass(load_class,
-                                                             code_generator_,
-                                                             compiler_driver_,
-                                                             *dex_compilation_unit_);
+  HLoadClass::LoadKind load_kind = HSharpening::ComputeLoadClassKind(load_class,
+                                                                     code_generator_,
+                                                                     compiler_driver_,
+                                                                     *dex_compilation_unit_);
 
   if (load_kind == HLoadClass::LoadKind::kInvalid) {
     // We actually cannot reference this class, we're forced to bail.
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index e735a0c..7fdc188 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -93,6 +93,10 @@
   HBasicBlock* FindBlockStartingAt(uint32_t dex_pc) const;
 
   ArenaVector<HInstruction*>* GetLocalsFor(HBasicBlock* block);
+  // Out-of-line slow path of GetLocalsFor(), split out so that the fast path
+  // of GetLocalsFor() stays small enough to be inlined by callers.
+  ArenaVector<HInstruction*>* GetLocalsForWithAllocation(
+      HBasicBlock* block, ArenaVector<HInstruction*>* locals, const size_t vregs);
   HInstruction* ValueOfLocalAt(HBasicBlock* block, size_t local);
   HInstruction* LoadLocal(uint32_t register_index, Primitive::Type type) const;
   HInstruction* LoadNullCheckedLocal(uint32_t register_index, uint32_t dex_pc);
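The GetLocalsFor() change above is a classic fast-path/slow-path split: the cheap "already sized" check stays in a small inline function, while the rare resize-and-catch-phi work moves to the out-of-line GetLocalsForWithAllocation(). A minimal sketch of the pattern, with hypothetical names and none of ART's types, might look like this:

// Illustrative sketch only: keep the common case tiny and inlinable, push the
// rare allocation work out of line. Names are hypothetical stand-ins.
#include <vector>

std::vector<int*>* GetSlotsSlowPath(std::vector<int*>* slots, size_t vregs);  // Not inlined.

inline std::vector<int*>* GetSlots(std::vector<int*>* slots, size_t vregs) {
  if (slots->size() == vregs) {
    return slots;                           // Fast path: already allocated.
  }
  return GetSlotsSlowPath(slots, vregs);    // Rare path: out-of-line allocation.
}

std::vector<int*>* GetSlotsSlowPath(std::vector<int*>* slots, size_t vregs) {
  slots->resize(vregs, nullptr);            // Expensive work kept out of the inline body.
  return slots;
}

Keeping the slow path out of line reduces the inlined code size at every call site, which is the stated motivation for the DCHECK_NE guard in the allocation version.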
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 35f59cb..17421fc 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -19,14 +19,18 @@
 #include "escape.h"
 #include "intrinsics.h"
 #include "mirror/class-inl.h"
+#include "sharpening.h"
 #include "scoped_thread_state_change-inl.h"
 
 namespace art {
 
 class InstructionSimplifierVisitor : public HGraphDelegateVisitor {
  public:
-  InstructionSimplifierVisitor(HGraph* graph, OptimizingCompilerStats* stats)
+  InstructionSimplifierVisitor(HGraph* graph,
+                               CodeGenerator* codegen,
+                               OptimizingCompilerStats* stats)
       : HGraphDelegateVisitor(graph),
+        codegen_(codegen),
         stats_(stats) {}
 
   void Run();
@@ -112,6 +116,7 @@
   void SimplifyAllocationIntrinsic(HInvoke* invoke);
   void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind);
 
+  CodeGenerator* codegen_;
   OptimizingCompilerStats* stats_;
   bool simplification_occurred_ = false;
   int simplifications_at_current_position_ = 0;
@@ -123,7 +128,7 @@
 };
 
 void InstructionSimplifier::Run() {
-  InstructionSimplifierVisitor visitor(graph_, stats_);
+  InstructionSimplifierVisitor visitor(graph_, codegen_, stats_);
   visitor.Run();
 }
 
@@ -1805,6 +1810,8 @@
 
   {
     ScopedObjectAccess soa(Thread::Current());
+    Primitive::Type source_component_type = Primitive::kPrimVoid;
+    Primitive::Type destination_component_type = Primitive::kPrimVoid;
     ReferenceTypeInfo destination_rti = destination->GetReferenceTypeInfo();
     if (destination_rti.IsValid()) {
       if (destination_rti.IsObjectArray()) {
@@ -1814,6 +1821,8 @@
         optimizations.SetDestinationIsTypedObjectArray();
       }
       if (destination_rti.IsPrimitiveArrayClass()) {
+        destination_component_type =
+            destination_rti.GetTypeHandle()->GetComponentType()->GetPrimitiveType();
         optimizations.SetDestinationIsPrimitiveArray();
       } else if (destination_rti.IsNonPrimitiveArrayClass()) {
         optimizations.SetDestinationIsNonPrimitiveArray();
@@ -1826,10 +1835,55 @@
       }
       if (source_rti.IsPrimitiveArrayClass()) {
         optimizations.SetSourceIsPrimitiveArray();
+        source_component_type = source_rti.GetTypeHandle()->GetComponentType()->GetPrimitiveType();
       } else if (source_rti.IsNonPrimitiveArrayClass()) {
         optimizations.SetSourceIsNonPrimitiveArray();
       }
     }
+    // For primitive arrays, use their optimized ArtMethod implementations.
+    if ((source_component_type != Primitive::kPrimVoid) &&
+        (source_component_type == destination_component_type)) {
+      ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+      PointerSize image_size = class_linker->GetImagePointerSize();
+      HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect();
+      mirror::Class* system = invoke->GetResolvedMethod()->GetDeclaringClass();
+      ArtMethod* method = nullptr;
+      switch (source_component_type) {
+        case Primitive::kPrimBoolean:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([ZI[ZII)V", image_size);
+          break;
+        case Primitive::kPrimByte:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([BI[BII)V", image_size);
+          break;
+        case Primitive::kPrimChar:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([CI[CII)V", image_size);
+          break;
+        case Primitive::kPrimShort:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([SI[SII)V", image_size);
+          break;
+        case Primitive::kPrimInt:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([II[III)V", image_size);
+          break;
+        case Primitive::kPrimFloat:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([FI[FII)V", image_size);
+          break;
+        case Primitive::kPrimLong:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([JI[JII)V", image_size);
+          break;
+        case Primitive::kPrimDouble:
+          method = system->FindDeclaredDirectMethod("arraycopy", "([DI[DII)V", image_size);
+          break;
+        default:
+          LOG(FATAL) << "Unreachable";
+      }
+      DCHECK(method != nullptr);
+      invoke->SetResolvedMethod(method);
+      // Sharpen the new invoke. Note that we do not update the invoke's dex method
+      // index, as that would require looking it up in the current dex file, which
+      // is unlikely to contain it. The most common dispatch for such typed
+      // arraycopy methods is a direct pointer into the boot image.
+      HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_);
+    }
   }
 }
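The switch above retargets a System.arraycopy call whose source and destination share a primitive component type to the typed overload, looked up by descriptor on the declaring class. As a small, self-contained illustration of just the descriptor mapping, here is a sketch; the enum below is a stand-in for ART's Primitive::Type, not the real type:

// Illustrative sketch only: maps a primitive component type to the descriptor
// of the matching typed System.arraycopy overload used in the lookup above.
enum class ComponentType { kBoolean, kByte, kChar, kShort, kInt, kFloat, kLong, kDouble };

const char* TypedArraycopySignature(ComponentType type) {
  switch (type) {
    case ComponentType::kBoolean: return "([ZI[ZII)V";
    case ComponentType::kByte:    return "([BI[BII)V";
    case ComponentType::kChar:    return "([CI[CII)V";
    case ComponentType::kShort:   return "([SI[SII)V";
    case ComponentType::kInt:     return "([II[III)V";
    case ComponentType::kFloat:   return "([FI[FII)V";
    case ComponentType::kLong:    return "([JI[JII)V";
    case ComponentType::kDouble:  return "([DI[DII)V";
  }
  return nullptr;  // Unreachable for valid primitive component types.
}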
 
diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h
index 7fe1067..f7329a4 100644
--- a/compiler/optimizing/instruction_simplifier.h
+++ b/compiler/optimizing/instruction_simplifier.h
@@ -23,6 +23,8 @@
 
 namespace art {
 
+class CodeGenerator;
+
 /**
  * Implements optimizations specific to each instruction.
  *
@@ -36,15 +38,19 @@
 class InstructionSimplifier : public HOptimization {
  public:
   explicit InstructionSimplifier(HGraph* graph,
+                                 CodeGenerator* codegen,
                                  OptimizingCompilerStats* stats = nullptr,
                                  const char* name = kInstructionSimplifierPassName)
-      : HOptimization(graph, name, stats) {}
+      : HOptimization(graph, name, stats),
+        codegen_(codegen) {}
 
   static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier";
 
   void Run() OVERRIDE;
 
  private:
+  CodeGenerator* codegen_;
+
   DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier);
 };
 
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 86000e9..28095c4 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1947,6 +1947,8 @@
   }
 
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
     // The base destination address is computed later, as `temp2` is
     // used for intermediate computations.
 
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 6c3938c..934ba1b 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -853,7 +853,6 @@
   DCHECK((type == Primitive::kPrimInt) ||
          (type == Primitive::kPrimLong) ||
          (type == Primitive::kPrimNot));
-  MacroAssembler* masm = codegen->GetVIXLAssembler();
   Location base_loc = locations->InAt(1);
   Register base = WRegisterFrom(base_loc);      // Object pointer.
   Location offset_loc = locations->InAt(2);
@@ -863,8 +862,7 @@
 
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
-    UseScratchRegisterScope temps(masm);
-    Register temp = temps.AcquireW();
+    Register temp = WRegisterFrom(locations->GetTemp(0));
     codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke,
                                                        trg_loc,
                                                        base,
@@ -901,6 +899,9 @@
                                                            kIntrinsified);
   if (can_call && kUseBakerReadBarrier) {
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier.
+    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -2381,9 +2382,14 @@
     // Temporary register IP0, obtained from the VIXL scratch register
     // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
     // (because that register is clobbered by ReadBarrierMarkRegX
-    // entry points). Get an extra temporary register from the
-    // register allocator.
+    // entry points). It cannot be used in calls to
+    // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
+    // either. For these reasons, get a third extra temporary register
+    // from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
+  } else {
+    // Cases other than Baker read barriers: the third temporary will
+    // be acquired from the VIXL scratch register pool.
   }
 }
 
@@ -2494,11 +2500,12 @@
     // We use a block to end the scratch scope before the write barrier, thus
     // freeing the temporary registers so they can be used in `MarkGCCard`.
     UseScratchRegisterScope temps(masm);
-    // Note: Because it is acquired from VIXL's scratch register pool,
-    // `temp3` might be IP0, and thus cannot be used as `ref` argument
-    // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
-    // calls below (see ReadBarrierMarkSlowPathARM64 for more details).
-    Register temp3 = temps.AcquireW();
+    Register temp3;
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      temp3 = WRegisterFrom(locations->GetTemp(2));
+    } else {
+      temp3 = temps.AcquireW();
+    }
 
     if (!optimizations.GetDoesNotNeedTypeCheck()) {
       // Check whether all elements of the source array are assignable to the component
@@ -2704,19 +2711,7 @@
 
     Register src_curr_addr = temp1.X();
     Register dst_curr_addr = temp2.X();
-    Register src_stop_addr;
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-      // Temporary register IP0, obtained from the VIXL scratch
-      // register pool as `temp3`, cannot be used in
-      // ReadBarrierSystemArrayCopySlowPathARM64 (because that
-      // register is clobbered by ReadBarrierMarkRegX entry points).
-      // So another temporary register allocated by the register
-      // allocator instead.
-      DCHECK_EQ(LocationFrom(temp3).reg(), IP0);
-      src_stop_addr = XRegisterFrom(locations->GetTemp(2));
-    } else {
-      src_stop_addr = temp3.X();
-    }
+    Register src_stop_addr = temp3.X();
 
     GenSystemArrayCopyAddresses(masm,
                                 Primitive::kPrimNot,
@@ -2732,6 +2727,8 @@
     const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
 
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
       // SystemArrayCopy implementation for Baker read barriers (see
       // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
       //
@@ -2758,10 +2755,11 @@
       __ Cmp(src_curr_addr, src_stop_addr);
       __ B(&done, eq);
 
-      Register tmp = temps.AcquireW();
       // Make sure `tmp` is not IP0, as it is clobbered by
       // ReadBarrierMarkRegX entry points in
       // ReadBarrierSystemArrayCopySlowPathARM64.
+      temps.Exclude(ip0);
+      Register tmp = temps.AcquireW();
       DCHECK_NE(LocationFrom(tmp).reg(), IP0);
 
       // /* int32_t */ monitor = src->monitor_
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index aa89dea..60bcf2c 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -2265,6 +2265,8 @@
   }
 
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
     // The base destination address is computed later, as `temp2` is
     // used for intermediate computations.
 
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 46ba048..48699b3 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -69,6 +69,13 @@
     return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_;
   }
 
+  // Returns true if reference_ is a singleton that is returned to the caller or
+  // used as an environment local of an HDeoptimize instruction.
+  bool IsSingletonAndNonRemovable() const {
+    return is_singleton_ &&
+           (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_);
+  }
+
   bool HasIndexAliasing() {
     return has_index_aliasing_;
   }
@@ -339,11 +346,7 @@
         return false;
       }
       ReferenceInfo* ref_info = loc1->GetReferenceInfo();
-      if (ref_info->IsSingleton()) {
-        // This is guaranteed by the CanReferencesAlias() test above.
-        DCHECK_EQ(ref_info, loc2->GetReferenceInfo());
-        ref_info->SetHasIndexAliasing(true);
-      }
+      ref_info->SetHasIndexAliasing(true);
     }
     return true;
   }
@@ -628,14 +631,17 @@
       for (size_t i = 0; i < heap_values.size(); i++) {
         HeapLocation* location = heap_location_collector_.GetHeapLocation(i);
         ReferenceInfo* ref_info = location->GetReferenceInfo();
-        if (!ref_info->IsSingleton() || location->IsValueKilledByLoopSideEffects()) {
-          // heap value is killed by loop side effects (stored into directly, or due to
-          // aliasing).
+        if (ref_info->IsSingletonAndRemovable() &&
+            !location->IsValueKilledByLoopSideEffects()) {
+          // A removable singleton's field that's not stored into inside a loop is
+          // invariant throughout the loop. Nothing to do.
+          DCHECK(ref_info->IsSingletonAndRemovable());
+        } else {
+          // The heap value is killed by loop side effects (stored into directly, or
+          // due to aliasing), or it may be needed after method return or
+          // deoptimization.
           KeepIfIsStore(pre_header_heap_values[i]);
           heap_values[i] = kUnknownHeapValue;
-        } else {
-          // A singleton's field that's not stored into inside a loop is invariant throughout
-          // the loop.
         }
       }
     }
@@ -654,7 +660,7 @@
       bool from_all_predecessors = true;
       ReferenceInfo* ref_info = heap_location_collector_.GetHeapLocation(i)->GetReferenceInfo();
       HInstruction* singleton_ref = nullptr;
-      if (ref_info->IsSingletonAndRemovable()) {
+      if (ref_info->IsSingleton()) {
         // We do more analysis of liveness when merging heap values for such
         // cases since stores into such references may potentially be eliminated.
         singleton_ref = ref_info->GetReference();
@@ -680,8 +686,9 @@
         }
       }
 
-      if (merged_value == kUnknownHeapValue) {
-        // There are conflicting heap values from different predecessors.
+      if (merged_value == kUnknownHeapValue || ref_info->IsSingletonAndNonRemovable()) {
+        // There are conflicting heap values from different predecessors,
+        // or the heap value may be needed after method return or deoptimization.
         // Keep the last store in each predecessor since future loads cannot be eliminated.
         for (HBasicBlock* predecessor : predecessors) {
           ArenaVector<HInstruction*>& pred_values = heap_values_for_[predecessor->GetBlockId()];
@@ -828,15 +835,15 @@
       // Store into the heap location with the same value.
       same_value = true;
     } else if (index != nullptr && ref_info->HasIndexAliasing()) {
-      // For array element, don't eliminate stores if the index can be
-      // aliased.
-    } else if (ref_info->IsSingletonAndRemovable()) {
-      // Store into a field/element of a singleton instance/array that's not returned.
-      // The value cannot be killed due to aliasing/invocation. It can be redundant since
-      // future loads can directly get the value set by this instruction. The value can
-      // still be killed due to merging or loop side effects. Stores whose values are
-      // killed due to merging/loop side effects later will be removed from
-      // possibly_removed_stores_ when that is detected.
+      // For array element, don't eliminate stores if the index can be aliased.
+    } else if (ref_info->IsSingleton()) {
+      // Store into a field of a singleton. The value cannot be killed due to
+      // aliasing/invocation. It can be redundant since future loads can
+      // directly get the value set by this instruction. The value can still be killed due to
+      // merging or loop side effects. Stores whose values are killed due to merging/loop side
+      // effects later will be removed from possibly_removed_stores_ when that is detected.
+      // Stores whose values may be needed after method return or deoptimization
+      // are also removed from possibly_removed_stores_ when that is detected.
       possibly_redundant = true;
       HNewInstance* new_instance = ref_info->GetReference()->AsNewInstance();
       if (new_instance != nullptr && new_instance->IsFinalizable()) {
@@ -945,6 +952,33 @@
                      value);
   }
 
+  void VisitDeoptimize(HDeoptimize* instruction) {
+    const ArenaVector<HInstruction*>& heap_values =
+        heap_values_for_[instruction->GetBlock()->GetBlockId()];
+    for (HInstruction* heap_value : heap_values) {
+      // Filter out fake instructions before checking instruction kind below.
+      if (heap_value == kUnknownHeapValue || heap_value == kDefaultHeapValue) {
+        continue;
+      }
+      // A store is kept as the heap value for possibly removed stores.
+      if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) {
+        // Check whether the reference for a store is used by an environment local of
+        // HDeoptimize.
+        HInstruction* reference = heap_value->InputAt(0);
+        DCHECK(heap_location_collector_.FindReferenceInfoOf(reference)->IsSingleton());
+        for (const HUseListNode<HEnvironment*>& use : reference->GetEnvUses()) {
+          HEnvironment* user = use.GetUser();
+          if (user->GetHolder() == instruction) {
+            // The singleton for the store is visible at this deoptimization
+            // point. Need to keep the store so that the heap value is
+            // seen by the interpreter.
+            KeepIfIsStore(heap_value);
+          }
+        }
+      }
+    }
+  }
+
   void HandleInvoke(HInstruction* invoke) {
     ArenaVector<HInstruction*>& heap_values =
         heap_values_for_[invoke->GetBlock()->GetBlockId()];
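The new VisitDeoptimize hook above keeps a pending store when the stored-into singleton is used as an environment local of the HDeoptimize instruction, so the interpreter still observes the value after deoptimization. A reduced sketch of that visibility check, using simplified stand-in types rather than ART's HInstruction/HEnvironment classes, could look like:

// Illustrative sketch only: decide whether a buffered store must be kept
// because its target object is visible at a deoptimization point.
#include <vector>

struct EnvUse { const void* holder; };            // Instruction holding the environment.
struct Object { std::vector<EnvUse> env_uses; };  // The (singleton) reference being stored into.

bool MustKeepStoreAtDeopt(const Object& stored_into, const void* deopt_instruction) {
  for (const EnvUse& use : stored_into.env_uses) {
    if (use.holder == deopt_instruction) {
      // The interpreter could observe this object after deoptimization,
      // so the store cannot be eliminated.
      return true;
    }
  }
  return false;
}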
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 26c9ab8..8df513f 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -16,6 +16,7 @@
 
 #include "loop_optimization.h"
 
+#include "driver/compiler_driver.h"
 #include "linear_order.h"
 
 namespace art {
@@ -57,8 +58,10 @@
 //
 
 HLoopOptimization::HLoopOptimization(HGraph* graph,
+                                     CompilerDriver* compiler_driver,
                                      HInductionVarAnalysis* induction_analysis)
     : HOptimization(graph, kLoopOptimizationPassName),
+      compiler_driver_(compiler_driver),
       induction_range_(induction_analysis),
       loop_allocator_(nullptr),
       top_loop_(nullptr),
@@ -69,7 +72,7 @@
 }
 
 void HLoopOptimization::Run() {
-  // Well-behaved loops only.
+  // Skip if there is no loop or the graph has try-catch/irreducible loops.
   // TODO: make this less of a sledgehammer.
   if (!graph_->HasLoops() || graph_->HasTryCatch() || graph_->HasIrreducibleLoops()) {
     return;
@@ -85,6 +88,7 @@
   LocalRun();
 
   if (top_loop_ == nullptr) {
+    // All loops have been eliminated.
     graph_->SetHasLoops(false);
   }
 
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 9ddab41..0b798fc 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -23,13 +23,17 @@
 
 namespace art {
 
+class CompilerDriver;
+
 /**
  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
  * the detected nested loops, such as removal of dead induction and empty loops.
  */
 class HLoopOptimization : public HOptimization {
  public:
-  HLoopOptimization(HGraph* graph, HInductionVarAnalysis* induction_analysis);
+  HLoopOptimization(HGraph* graph,
+                    CompilerDriver* compiler_driver,
+                    HInductionVarAnalysis* induction_analysis);
 
   void Run() OVERRIDE;
 
@@ -76,6 +80,9 @@
   bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block);
   void RemoveDeadInstructions(const HInstructionList& list);
 
+  // Compiler driver (to query ISA features).
+  const CompilerDriver* compiler_driver_;
+
   // Range information based on prior induction variable analysis.
   InductionVarRange induction_range_;
 
diff --git a/compiler/optimizing/loop_optimization_test.cc b/compiler/optimizing/loop_optimization_test.cc
index 9a6b493..5b93506 100644
--- a/compiler/optimizing/loop_optimization_test.cc
+++ b/compiler/optimizing/loop_optimization_test.cc
@@ -31,7 +31,7 @@
         allocator_(&pool_),
         graph_(CreateGraph(&allocator_)),
         iva_(new (&allocator_) HInductionVarAnalysis(graph_)),
-        loop_opt_(new (&allocator_) HLoopOptimization(graph_, iva_)) {
+        loop_opt_(new (&allocator_) HLoopOptimization(graph_, nullptr, iva_)) {
     BuildGraph();
   }
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 62c8910..020e446 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2179,6 +2179,9 @@
       }
     }
     if (rerun_loop_analysis) {
+      DCHECK(!outer_graph->HasIrreducibleLoops())
+          << "Recomputing loop information in graphs with irreducible loops "
+          << "is unsupported, as it could lead to loop header changes";
       outer_graph->ClearLoopInformation();
       outer_graph->ClearDominanceInformation();
       outer_graph->BuildDominatorTree();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index c39aed2..542b218 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1734,11 +1734,11 @@
 // A HEnvironment object contains the values of virtual registers at a given location.
 class HEnvironment : public ArenaObject<kArenaAllocEnvironment> {
  public:
-  HEnvironment(ArenaAllocator* arena,
-               size_t number_of_vregs,
-               ArtMethod* method,
-               uint32_t dex_pc,
-               HInstruction* holder)
+  ALWAYS_INLINE HEnvironment(ArenaAllocator* arena,
+                             size_t number_of_vregs,
+                             ArtMethod* method,
+                             uint32_t dex_pc,
+                             HInstruction* holder)
      : vregs_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentVRegs)),
        locations_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentLocations)),
        parent_(nullptr),
@@ -1747,7 +1747,7 @@
        holder_(holder) {
   }
 
-  HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder)
+  ALWAYS_INLINE HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder)
       : HEnvironment(arena,
                      to_copy.Size(),
                      to_copy.GetMethod(),
@@ -3915,6 +3915,7 @@
   bool IsIntrinsic() const { return intrinsic_ != Intrinsics::kNone; }
 
   ArtMethod* GetResolvedMethod() const { return resolved_method_; }
+  void SetResolvedMethod(ArtMethod* method) { resolved_method_ = method; }
 
   DECLARE_ABSTRACT_INSTRUCTION(Invoke);
 
@@ -3957,7 +3958,7 @@
   }
 
   uint32_t number_of_arguments_;
-  ArtMethod* const resolved_method_;
+  ArtMethod* resolved_method_;
   const uint32_t dex_method_index_;
   Intrinsics intrinsic_;
 
@@ -5544,8 +5545,6 @@
 
     // Use a known boot image Class* address, embedded in the code by the codegen.
     // Used for boot image classes referenced by apps in AOT- and JIT-compiled code.
-    // Note: codegen needs to emit a linker patch if indicated by compiler options'
-    // GetIncludePatchInformation().
     kBootImageAddress,
 
     // Load from an entry in the .bss section using a PC-relative load.
@@ -5749,8 +5748,6 @@
 
     // Use a known boot image String* address, embedded in the code by the codegen.
     // Used for boot image strings referenced by apps in AOT- and JIT-compiled code.
-    // Note: codegen needs to emit a linker patch if indicated by compiler options'
-    // GetIncludePatchInformation().
     kBootImageAddress,
 
     // Load from an entry in the .bss section using a PC-relative load.
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 3842ef9..d6153b0 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -507,7 +507,7 @@
   } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
     return new (arena) HInductionVarAnalysis(graph);
   } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
-    return new (arena) InstructionSimplifier(graph, stats, pass_name.c_str());
+    return new (arena) InstructionSimplifier(graph, codegen, stats, pass_name.c_str());
   } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
     return new (arena) IntrinsicsRecognizer(graph, stats);
   } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
@@ -519,7 +519,7 @@
   } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
     return new (arena) SideEffectsAnalysis(graph);
   } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) {
-    return new (arena) HLoopOptimization(graph, most_recent_induction);
+    return new (arena) HLoopOptimization(graph, driver, most_recent_induction);
   } else if (opt_name == CHAGuardOptimization::kCHAGuardOptimizationPassName) {
     return new (arena) CHAGuardOptimization(graph);
   } else if (opt_name == CodeSinking::kCodeSinkingPassName) {
@@ -768,26 +768,29 @@
   HDeadCodeElimination* dce3 = new (arena) HDeadCodeElimination(
       graph, stats, "dead_code_elimination$final");
   HConstantFolding* fold1 = new (arena) HConstantFolding(graph, "constant_folding");
-  InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats);
+  InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, codegen, stats);
   HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats);
   HConstantFolding* fold2 = new (arena) HConstantFolding(
       graph, "constant_folding$after_inlining");
   HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding$after_bce");
-  SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
-  GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects);
-  LICM* licm = new (arena) LICM(graph, *side_effects, stats);
-  LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects);
+  SideEffectsAnalysis* side_effects1 = new (arena) SideEffectsAnalysis(
+      graph, "side_effects$before_gvn");
+  SideEffectsAnalysis* side_effects2 = new (arena) SideEffectsAnalysis(
+      graph, "side_effects$before_lse");
+  GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects1);
+  LICM* licm = new (arena) LICM(graph, *side_effects1, stats);
   HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph);
-  BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects, induction);
-  HLoopOptimization* loop = new (arena) HLoopOptimization(graph, induction);
+  BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects1, induction);
+  HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction);
+  LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2);
   HSharpening* sharpening = new (arena) HSharpening(
       graph, codegen, dex_compilation_unit, driver, handles);
   InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier(
-      graph, stats, "instruction_simplifier$after_inlining");
+      graph, codegen, stats, "instruction_simplifier$after_inlining");
   InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier(
-      graph, stats, "instruction_simplifier$after_bce");
+      graph, codegen, stats, "instruction_simplifier$after_bce");
   InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier(
-      graph, stats, "instruction_simplifier$before_codegen");
+      graph, codegen, stats, "instruction_simplifier$before_codegen");
   IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats);
   CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph);
   CodeSinking* code_sinking = new (arena) CodeSinking(graph, stats);
@@ -810,7 +813,7 @@
     fold2,  // TODO: if we don't inline we can also skip fold2.
     simplify2,
     dce2,
-    side_effects,
+    side_effects1,
     gvn,
     licm,
     induction,
@@ -818,6 +821,7 @@
     loop,
     fold3,  // evaluates code generated by dynamic bce
     simplify3,
+    side_effects2,
     lse,
     cha_guard,
     dce3,
diff --git a/compiler/optimizing/register_allocator_linear_scan.cc b/compiler/optimizing/register_allocator_linear_scan.cc
index 1a391ce..6354e76 100644
--- a/compiler/optimizing/register_allocator_linear_scan.cc
+++ b/compiler/optimizing/register_allocator_linear_scan.cc
@@ -629,21 +629,21 @@
     if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) {
       HInputsRef inputs = defined_by->GetInputs();
       for (size_t i = 0; i < inputs.size(); ++i) {
-        // Take the last interval of the input. It is the location of that interval
-        // that will be used at `defined_by`.
-        LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling();
-        // Note that interval may have not been processed yet.
-        // TODO: Handle non-split intervals last in the work list.
-        if (locations->InAt(i).IsValid()
-            && interval->HasRegister()
-            && interval->SameRegisterKind(*current)) {
-          // The input must be live until the end of `defined_by`, to comply to
-          // the linear scan algorithm. So we use `defined_by`'s end lifetime
-          // position to check whether the input is dead or is inactive after
-          // `defined_by`.
-          DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition()));
-          size_t position = defined_by->GetLifetimePosition() + 1;
-          FreeIfNotCoverAt(interval, position, free_until);
+        if (locations->InAt(i).IsValid()) {
+          // Take the last interval of the input. It is the location of that interval
+          // that will be used at `defined_by`.
+          LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling();
+          // Note that the interval may not have been processed yet.
+          // TODO: Handle non-split intervals last in the work list.
+          if (interval->HasRegister() && interval->SameRegisterKind(*current)) {
+            // The input must be live until the end of `defined_by`, to comply with
+            // the linear scan algorithm. So we use `defined_by`'s end lifetime
+            // position to check whether the input is dead or is inactive after
+            // `defined_by`.
+            DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition()));
+            size_t position = defined_by->GetLifetimePosition() + 1;
+            FreeIfNotCoverAt(interval, position, free_until);
+          }
         }
       }
     }
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index be40092..7bd38c7 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -41,7 +41,7 @@
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       HInstruction* instruction = it.Current();
       if (instruction->IsInvokeStaticOrDirect()) {
-        ProcessInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect());
+        SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(), codegen_);
       } else if (instruction->IsLoadString()) {
         ProcessLoadString(instruction->AsLoadString());
       }
@@ -65,12 +65,12 @@
 }
 
 static bool AOTCanEmbedMethod(ArtMethod* method, const CompilerOptions& options) {
-  // Including patch information means the AOT code will be patched, which we don't
-  // support in the compiler, and is anyways moving away b/33192586.
-  return IsInBootImage(method) && !options.GetCompilePic() && !options.GetIncludePatchInformation();
+  return IsInBootImage(method) && !options.GetCompilePic();
 }
 
-void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+
+void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
+                                              CodeGenerator* codegen) {
   if (invoke->IsStringInit()) {
     // Not using the dex cache arrays. But we could still try to use a better dispatch...
     // TODO: Use direct_method and direct_code for the appropriate StringFactory method.
@@ -97,12 +97,12 @@
 
   // We don't optimize for debuggable as it would prevent us from obsoleting the method in some
   // situations.
-  if (callee == codegen_->GetGraph()->GetArtMethod() && !codegen_->GetGraph()->IsDebuggable()) {
+  if (callee == codegen->GetGraph()->GetArtMethod() && !codegen->GetGraph()->IsDebuggable()) {
     // Recursive call.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kRecursive;
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallSelf;
   } else if (Runtime::Current()->UseJitCompilation() ||
-      AOTCanEmbedMethod(callee, codegen_->GetCompilerOptions())) {
+      AOTCanEmbedMethod(callee, codegen->GetCompilerOptions())) {
     // JIT or on-device AOT compilation referencing a boot image method.
     // Use the method address directly.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress;
@@ -111,13 +111,17 @@
   } else {
     // Use PC-relative access to the dex cache arrays.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative;
-    DexCacheArraysLayout layout(GetInstructionSetPointerSize(codegen_->GetInstructionSet()),
-                                &graph_->GetDexFile());
+    // Note: we use the invoke's graph instead of the codegen graph, which can
+    // differ when inlining (the codegen graph is the outermost graph). The
+    // invoke's dex method index is relative to the dex file from which the
+    // invoke's graph was built.
+    DexCacheArraysLayout layout(GetInstructionSetPointerSize(codegen->GetInstructionSet()),
+                                &invoke->GetBlock()->GetGraph()->GetDexFile());
     method_load_data = layout.MethodOffset(invoke->GetDexMethodIndex());
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   }
 
-  if (graph_->IsDebuggable()) {
+  if (codegen->GetGraph()->IsDebuggable()) {
     // For debuggable apps always use the code pointer from ArtMethod
     // so that we don't circumvent instrumentation stubs if installed.
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
@@ -127,14 +131,14 @@
       method_load_kind, code_ptr_location, method_load_data
   };
   HInvokeStaticOrDirect::DispatchInfo dispatch_info =
-      codegen_->GetSupportedInvokeStaticOrDirectDispatch(desired_dispatch_info, invoke);
+      codegen->GetSupportedInvokeStaticOrDirectDispatch(desired_dispatch_info, invoke);
   invoke->SetDispatchInfo(dispatch_info);
 }
 
-HLoadClass::LoadKind HSharpening::SharpenClass(HLoadClass* load_class,
-                                               CodeGenerator* codegen,
-                                               CompilerDriver* compiler_driver,
-                                               const DexCompilationUnit& dex_compilation_unit) {
+HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
+                                                       CodeGenerator* codegen,
+                                                       CompilerDriver* compiler_driver,
+                                                       const DexCompilationUnit& dex_compilation_unit) {
   Handle<mirror::Class> klass = load_class->GetClass();
   DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCacheViaMethod ||
          load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass)
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index 4240b2f..10707c7 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -48,14 +48,16 @@
   static constexpr const char* kSharpeningPassName = "sharpening";
 
   // Used by the builder and the inliner.
-  static HLoadClass::LoadKind SharpenClass(HLoadClass* load_class,
-                                           CodeGenerator* codegen,
-                                           CompilerDriver* compiler_driver,
-                                           const DexCompilationUnit& dex_compilation_unit)
+  static HLoadClass::LoadKind ComputeLoadClassKind(HLoadClass* load_class,
+                                                   CodeGenerator* codegen,
+                                                   CompilerDriver* compiler_driver,
+                                                   const DexCompilationUnit& dex_compilation_unit)
     REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Used by Sharpening and InstructionSimplifier.
+  static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, CodeGenerator* codegen);
+
  private:
-  void ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke);
   void ProcessLoadString(HLoadString* load_string);
 
   CodeGenerator* codegen_;
diff --git a/compiler/optimizing/side_effects_analysis.h b/compiler/optimizing/side_effects_analysis.h
index bac6088..fea47e6 100644
--- a/compiler/optimizing/side_effects_analysis.h
+++ b/compiler/optimizing/side_effects_analysis.h
@@ -25,8 +25,8 @@
 
 class SideEffectsAnalysis : public HOptimization {
  public:
-  explicit SideEffectsAnalysis(HGraph* graph)
-      : HOptimization(graph, kSideEffectsAnalysisPassName),
+  SideEffectsAnalysis(HGraph* graph, const char* pass_name = kSideEffectsAnalysisPassName)
+      : HOptimization(graph, pass_name),
         graph_(graph),
         block_effects_(graph->GetBlocks().size(),
                        graph->GetArena()->Adapter(kArenaAllocSideEffectsAnalysis)),
@@ -41,7 +41,7 @@
 
   bool HasRun() const { return has_run_; }
 
-  static constexpr const char* kSideEffectsAnalysisPassName = "SideEffects";
+  static constexpr const char* kSideEffectsAnalysisPassName = "side_effects";
 
  private:
   void UpdateLoopEffects(HLoopInformation* info, SideEffects effects);
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index b62bf4e..a239bd5 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -331,7 +331,7 @@
         instruction, /* environment */ nullptr, input_index, block->GetLifetimeEnd(), first_use_);
   }
 
-  void AddRange(size_t start, size_t end) {
+  ALWAYS_INLINE void AddRange(size_t start, size_t end) {
     if (first_range_ == nullptr) {
       first_range_ = last_range_ = range_search_start_ =
           new (allocator_) LiveRange(start, end, first_range_);
diff --git a/compiler/verifier_deps_test.cc b/compiler/verifier_deps_test.cc
index c892b25..01c3359 100644
--- a/compiler/verifier_deps_test.cc
+++ b/compiler/verifier_deps_test.cc
@@ -246,9 +246,13 @@
   }
 
   bool HasUnverifiedClass(const std::string& cls) {
-    const DexFile::TypeId* type_id = primary_dex_file_->FindTypeId(cls.c_str());
+    return HasUnverifiedClass(cls, *primary_dex_file_);
+  }
+
+  bool HasUnverifiedClass(const std::string& cls, const DexFile& dex_file) {
+    const DexFile::TypeId* type_id = dex_file.FindTypeId(cls.c_str());
     DCHECK(type_id != nullptr);
-    dex::TypeIndex index = primary_dex_file_->GetIndexForTypeId(*type_id);
+    dex::TypeIndex index = dex_file.GetIndexForTypeId(*type_id);
     for (const auto& dex_dep : verifier_deps_->dex_deps_) {
       for (dex::TypeIndex entry : dex_dep.second->unverified_classes_) {
         if (index == entry) {
@@ -1141,7 +1145,7 @@
   // Test that a class with hard failure is recorded.
   ASSERT_TRUE(HasUnverifiedClass("LMyVerificationFailure;"));
   // Test that a class with unresolved super is recorded.
-  ASSERT_FALSE(HasUnverifiedClass("LMyClassWithNoSuper;"));
+  ASSERT_TRUE(HasUnverifiedClass("LMyClassWithNoSuper;"));
   // Test that a class with unresolved super and hard failure is recorded.
   ASSERT_TRUE(HasUnverifiedClass("LMyClassWithNoSuperButFailures;"));
 }
@@ -1511,5 +1515,18 @@
   }
 }
 
+TEST_F(VerifierDepsTest, MultiDexVerification) {
+  VerifyDexFile("VerifierDepsMulti");
+  ASSERT_EQ(NumberOfCompiledDexFiles(), 2u);
+
+  ASSERT_TRUE(HasUnverifiedClass("LMySoftVerificationFailure;", *dex_files_[1]));
+  ASSERT_TRUE(HasUnverifiedClass("LMySub1SoftVerificationFailure;", *dex_files_[0]));
+  ASSERT_TRUE(HasUnverifiedClass("LMySub2SoftVerificationFailure;", *dex_files_[0]));
+
+  std::vector<uint8_t> buffer;
+  verifier_deps_->Encode(dex_files_, &buffer);
+  ASSERT_FALSE(buffer.empty());
+}
+
 }  // namespace verifier
 }  // namespace art
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index f535557..3fa30fa 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -328,11 +328,6 @@
   UsageError("");
   UsageError("  --dump-timing: display a breakdown of where time was spent");
   UsageError("");
-  UsageError("  --include-patch-information: Include patching information so the generated code");
-  UsageError("      can have its base address moved without full recompilation.");
-  UsageError("");
-  UsageError("  --no-include-patch-information: Do not include patching information.");
-  UsageError("");
   UsageError("  -g");
   UsageError("  --generate-debug-info: Generate debug information for native debugging,");
   UsageError("      such as stack unwinding information, ELF symbols and DWARF sections.");
@@ -1958,7 +1953,6 @@
 
         elf_writer->WriteDynamicSection();
         elf_writer->WriteDebugInfo(oat_writer->GetMethodDebugInfo());
-        elf_writer->WritePatchLocations(oat_writer->GetAbsolutePatchLocations());
 
         if (!elf_writer->End()) {
           LOG(ERROR) << "Failed to write ELF file " << oat_file->GetPath();
@@ -2106,6 +2100,10 @@
     return DoProfileGuidedOptimizations();
   }
 
+  bool HasInputVdexFile() const {
+    return input_vdex_file_ != nullptr || input_vdex_fd_ != -1 || !input_vdex_.empty();
+  }
+
   bool LoadProfile() {
     DCHECK(UseProfile());
 
@@ -2891,6 +2889,13 @@
     }
   }
 
+  if (dex2oat->DoDexLayoutOptimizations()) {
+    if (dex2oat->HasInputVdexFile()) {
+      LOG(ERROR) << "Dexlayout is incompatible with an input VDEX";
+      return EXIT_FAILURE;
+    }
+  }
+
   art::MemMap::Init();  // For ZipEntry::ExtractToMemMap, and vdex.
 
   // Check early that the result of compilation can be written
diff --git a/dex2oat/dex2oat_test.cc b/dex2oat/dex2oat_test.cc
index e7277bc..289b8ab 100644
--- a/dex2oat/dex2oat_test.cc
+++ b/dex2oat/dex2oat_test.cc
@@ -604,7 +604,8 @@
                           const std::string& app_image_file_name,
                           bool use_fd,
                           size_t num_profile_classes,
-                          const std::vector<std::string>& extra_args = {}) {
+                          const std::vector<std::string>& extra_args = {},
+                          bool expect_success = true) {
     const std::string profile_location = GetScratchDir() + "/primary.prof";
     const char* location = dex_location.c_str();
     std::string error_msg;
@@ -631,7 +632,7 @@
                         odex_location,
                         CompilerFilter::kSpeedProfile,
                         copy,
-                        /* expect_success */ true,
+                        expect_success,
                         use_fd);
     if (app_image_file != nullptr) {
       ASSERT_EQ(app_image_file->FlushCloseOrErase(), 0) << "Could not flush and close art file";
@@ -709,6 +710,7 @@
       EXPECT_GT(vdex_file1->GetLength(), 0u);
     }
     {
+      // Test that dexlayout with an input vdex fails gracefully.
       std::string input_vdex = StringPrintf("--input-vdex-fd=%d", vdex_file1->Fd());
       std::string output_vdex = StringPrintf("--output-vdex-fd=%d", vdex_file2.GetFd());
       CompileProfileOdex(dex_location,
@@ -716,13 +718,13 @@
                          app_image_file_name,
                          /* use_fd */ true,
                          /* num_profile_classes */ 1,
-                         { input_vdex, output_vdex });
-      EXPECT_GT(vdex_file2.GetFile()->GetLength(), 0u);
+                         { input_vdex, output_vdex },
+                         /* expect_success */ false);
+      EXPECT_EQ(vdex_file2.GetFile()->GetLength(), 0u);
     }
     ASSERT_EQ(vdex_file1->FlushCloseOrErase(), 0) << "Could not flush and close vdex file";
     CheckValidity();
-    ASSERT_TRUE(success_);
-    CheckResult(dex_location, odex_location, app_image_file_name);
+    ASSERT_FALSE(success_);
   }
 
   void CheckResult(const std::string& dex_location,
diff --git a/dexlayout/dex_ir.cc b/dexlayout/dex_ir.cc
index a694099..34983cf 100644
--- a/dexlayout/dex_ir.cc
+++ b/dexlayout/dex_ir.cc
@@ -64,23 +64,18 @@
     uintptr_t insns_end = reinterpret_cast<uintptr_t>(&disk_code_item.insns_[insns_size]);
     return insns_end - code_item_start;
   } else {
-    uint32_t last_handler_off = 0;
-    for (uint32_t i = 0; i < tries_size; ++i) {
-      // Iterate over the try items to find the last catch handler.
-      const DexFile::TryItem* disk_try_item = dex_file.GetTryItems(disk_code_item, i);
-      uint16_t handler_off = disk_try_item->handler_off_;
-      if (handler_off > last_handler_off) {
-        last_handler_off = handler_off;
+    // Get the start of the handler data.
+    const uint8_t* handler_data = DexFile::GetCatchHandlerData(disk_code_item, 0);
+    uint32_t handlers_size = DecodeUnsignedLeb128(&handler_data);
+    // Manually read each handler.
+    for (uint32_t i = 0; i < handlers_size; ++i) {
+      int32_t uleb128_count = DecodeSignedLeb128(&handler_data) * 2;
+      if (uleb128_count <= 0) {
+        uleb128_count = -uleb128_count + 1;
       }
-    }
-    // Decode the final handler to see where it ends.
-    const uint8_t* handler_data = DexFile::GetCatchHandlerData(disk_code_item, last_handler_off);
-    int32_t uleb128_count = DecodeSignedLeb128(&handler_data) * 2;
-    if (uleb128_count <= 0) {
-      uleb128_count = -uleb128_count + 1;
-    }
-    for (int32_t i = 0; i < uleb128_count; ++i) {
-      DecodeUnsignedLeb128(&handler_data);
+      for (int32_t j = 0; j < uleb128_count; ++j) {
+        DecodeUnsignedLeb128(&handler_data);
+      }
     }
     return reinterpret_cast<uintptr_t>(handler_data) - code_item_start;
   }
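The rewritten size computation above walks the whole encoded_catch_handler_list rather than only the last handler: a ULEB128 handler count, then for each handler a signed LEB128 pair count, where a non-positive count means a catch-all address follows the type/address pairs. A self-contained sketch of that walk, using hand-rolled LEB128 readers as stand-ins for ART's DecodeUnsignedLeb128/DecodeSignedLeb128 helpers, might look like:

// Illustrative sketch only: find where a dex encoded_catch_handler_list ends.
#include <cstdint>

static uint32_t ReadUleb128(const uint8_t** data) {
  uint32_t result = 0;
  int shift = 0;
  uint8_t byte;
  do {
    byte = *(*data)++;
    result |= static_cast<uint32_t>(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  return result;
}

static int32_t ReadSleb128(const uint8_t** data) {
  int32_t result = 0;
  int shift = 0;
  uint8_t byte;
  do {
    byte = *(*data)++;
    result |= static_cast<int32_t>(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  if (shift < 32 && (byte & 0x40)) {
    result |= -(1 << shift);  // Sign-extend negative counts.
  }
  return result;
}

// Returns a pointer just past the encoded_catch_handler_list starting at `data`.
const uint8_t* SkipCatchHandlerList(const uint8_t* data) {
  uint32_t handlers_size = ReadUleb128(&data);
  for (uint32_t i = 0; i < handlers_size; ++i) {
    int32_t size = ReadSleb128(&data);           // Non-positive: a catch-all follows the pairs.
    uint32_t pairs = (size <= 0) ? -size : size;
    for (uint32_t j = 0; j < pairs; ++j) {
      ReadUleb128(&data);                        // type_idx
      ReadUleb128(&data);                        // handler address
    }
    if (size <= 0) {
      ReadUleb128(&data);                        // catch_all_addr
    }
  }
  return data;
}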
diff --git a/dexlayout/dex_visualize.cc b/dexlayout/dex_visualize.cc
index 75d47e4..8997146 100644
--- a/dexlayout/dex_visualize.cc
+++ b/dexlayout/dex_visualize.cc
@@ -35,6 +35,12 @@
 
 namespace art {
 
+std::string MultidexName(const std::string& prefix,
+                         size_t dex_file_index,
+                         const std::string& suffix) {
+  return prefix + ((dex_file_index > 0) ? std::to_string(dex_file_index + 1) : "") + suffix;
+}
+
 struct FileSection {
  public:
   std::string name_;
@@ -43,8 +49,22 @@
   std::function<uint32_t(const dex_ir::Collections&)> offset_fn_;
 };
 
+static uint32_t HeaderOffset(const dex_ir::Collections& collections ATTRIBUTE_UNUSED) {
+  return 0;
+}
+
+static uint32_t HeaderSize(const dex_ir::Collections& collections ATTRIBUTE_UNUSED) {
+  // Size is in elements, so there is only one header.
+  return 1;
+}
+
 static const std::vector<FileSection> kFileSections = {
   {
+    "Header",
+    DexFile::kDexTypeHeaderItem,
+    &HeaderSize,
+    &HeaderOffset,
+  }, {
     "StringId",
     DexFile::kDexTypeStringIdItem,
     &dex_ir::Collections::StringIdsSize,
@@ -127,58 +147,71 @@
   }
 };
 
+static constexpr bool kSortAscending = false;
+static constexpr bool kSortDescending = true;
+
+static std::vector<const FileSection*> GetSortedSections(
+    const dex_ir::Collections& collections,
+    bool sort_descending) {
+  std::vector<const FileSection*> sorted_sections;
+  // Collect the sections so they can be sorted by offset.
+  for (const FileSection& s : kFileSections) {
+    sorted_sections.push_back(&s);
+  }
+  // Sort by offset.
+  std::sort(sorted_sections.begin(),
+            sorted_sections.end(),
+            [&](const FileSection* a, const FileSection* b) {
+              if (sort_descending) {
+                return a->offset_fn_(collections) > b->offset_fn_(collections);
+              } else {
+                return a->offset_fn_(collections) < b->offset_fn_(collections);
+              }
+            });
+  return sorted_sections;
+}
+
 class Dumper {
  public:
   // Colors are based on the type of the section in MapList.
-  Dumper(const dex_ir::Collections& collections, size_t dex_file_index) {
-    // Build the table that will map from offset to color
-    table_.emplace_back(DexFile::kDexTypeHeaderItem, 0u);
-    for (const FileSection& s : kFileSections) {
-      table_.emplace_back(s.type_, s.offset_fn_(collections));
-    }
-    // Sort into descending order by offset.
-    std::sort(table_.begin(),
-              table_.end(),
-              [](const SectionColor& a, const SectionColor& b) { return a.offset_ > b.offset_; });
+  explicit Dumper(const dex_ir::Collections& collections)
+      : collections_(collections), out_file_(nullptr),
+        sorted_sections_(GetSortedSections(collections, kSortDescending)) { }
+
+  bool OpenAndPrintHeader(size_t dex_index) {
     // Open the file and emit the gnuplot prologue.
-    std::string dex_file_name("classes");
-    std::string out_file_base_name("layout");
-    if (dex_file_index > 0) {
-      out_file_base_name += std::to_string(dex_file_index + 1);
-      dex_file_name += std::to_string(dex_file_index + 1);
+    out_file_ = fopen(MultidexName("layout", dex_index, ".gnuplot").c_str(), "w");
+    if (out_file_ == nullptr) {
+      return false;
     }
-    dex_file_name += ".dex";
-    std::string out_file_name(out_file_base_name + ".gnuplot");
-    std::string png_file_name(out_file_base_name + ".png");
-    out_file_ = fopen(out_file_name.c_str(), "w");
     fprintf(out_file_, "set terminal png size 1920,1080\n");
-    fprintf(out_file_, "set output \"%s\"\n", png_file_name.c_str());
-    fprintf(out_file_, "set title \"%s\"\n", dex_file_name.c_str());
+    fprintf(out_file_, "set output \"%s\"\n", MultidexName("layout", dex_index, ".png").c_str());
+    fprintf(out_file_, "set title \"%s\"\n", MultidexName("classes", dex_index, ".dex").c_str());
     fprintf(out_file_, "set xlabel \"Page offset into dex\"\n");
     fprintf(out_file_, "set ylabel \"ClassDef index\"\n");
     fprintf(out_file_, "set xtics rotate out (");
-    fprintf(out_file_, "\"Header\" %d, ", 0);
     bool printed_one = false;
     for (const FileSection& s : kFileSections) {
-      if (s.size_fn_(collections) > 0) {
+      if (s.size_fn_(collections_) > 0) {
         if (printed_one) {
           fprintf(out_file_, ", ");
         }
-        fprintf(out_file_, "\"%s\" %d", s.name_.c_str(), s.offset_fn_(collections) / kPageSize);
+        fprintf(out_file_, "\"%s\" %d", s.name_.c_str(), s.offset_fn_(collections_) / kPageSize);
         printed_one = true;
       }
     }
     fprintf(out_file_, ")\n");
     fprintf(out_file_,
             "plot \"-\" using 1:2:3:4:5 with vector nohead linewidth 1 lc variable notitle\n");
+    return true;
   }
 
   int GetColor(uint32_t offset) const {
     // The dread linear search to find the right section for the reference.
     uint16_t section = 0;
-    for (uint16_t i = 0; i < table_.size(); ++i) {
-      if (table_[i].offset_ < offset) {
-        section = table_[i].type_;
+    for (const FileSection* file_section : sorted_sections_) {
+      if (file_section->offset_fn_(collections_) < offset) {
+        section = file_section->type_;
         break;
       }
     }
@@ -308,13 +341,6 @@
   }
 
  private:
-  struct SectionColor {
-   public:
-    SectionColor(uint16_t type, uint32_t offset) : type_(type), offset_(offset) { }
-    uint16_t type_;
-    uint32_t offset_;
-  };
-
   using ColorMapType = std::map<uint16_t, int>;
   const ColorMapType kColorMap = {
     { DexFile::kDexTypeHeaderItem, 1 },
@@ -336,8 +362,9 @@
     { DexFile::kDexTypeAnnotationsDirectoryItem, 16 }
   };
 
-  std::vector<SectionColor> table_;
+  const dex_ir::Collections& collections_;
   FILE* out_file_;
+  std::vector<const FileSection*> sorted_sections_;
 
   DISALLOW_COPY_AND_ASSIGN(Dumper);
 };
@@ -350,7 +377,11 @@
                         const DexFile* dex_file,
                         size_t dex_file_index,
                         ProfileCompilationInfo* profile_info) {
-  std::unique_ptr<Dumper> dumper(new Dumper(header->GetCollections(), dex_file_index));
+  std::unique_ptr<Dumper> dumper(new Dumper(header->GetCollections()));
+  if (!dumper->OpenAndPrintHeader(dex_file_index)) {
+    fprintf(stderr, "Could not open output file.\n");
+    return;
+  }
 
   const uint32_t class_defs_size = header->GetCollections().ClassDefsSize();
   for (uint32_t class_index = 0; class_index < class_defs_size; class_index++) {
@@ -401,4 +432,22 @@
   }  // for
 }
 
+/*
+ * Dumps the offset and size of sections within the file.
+ */
+void ShowDexSectionStatistics(dex_ir::Header* header, size_t dex_file_index) {
+  // Compute the (multidex) class file name.
+  fprintf(stdout, "%s\n", MultidexName("classes", dex_file_index, ".dex").c_str());
+  fprintf(stdout, "section    offset     items\n");
+  const dex_ir::Collections& collections = header->GetCollections();
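+  // Sort sections by ascending offset so the output follows the physical file layout.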
+  std::vector<const FileSection*> sorted_sections(GetSortedSections(collections, kSortAscending));
+  for (const FileSection* file_section : sorted_sections) {
+    fprintf(stdout, "%-10s 0x%08x 0x%08x\n",
+            file_section->name_.c_str(),
+            file_section->offset_fn_(collections),
+            file_section->size_fn_(collections));
+  }
+  fprintf(stdout, "\n");
+}
+
 }  // namespace art
diff --git a/dexlayout/dex_visualize.h b/dexlayout/dex_visualize.h
index 09f8306..a1aa2cd 100644
--- a/dexlayout/dex_visualize.h
+++ b/dexlayout/dex_visualize.h
@@ -38,6 +38,8 @@
                         size_t dex_file_index,
                         ProfileCompilationInfo* profile_info);
 
+void ShowDexSectionStatistics(dex_ir::Header* header, size_t dex_file_index);
+
 }  // namespace art
 
 #endif  // ART_DEXLAYOUT_DEX_VISUALIZE_H_
diff --git a/dexlayout/dexlayout.cc b/dexlayout/dexlayout.cc
index 4aa8b82..dbe6988 100644
--- a/dexlayout/dexlayout.cc
+++ b/dexlayout/dexlayout.cc
@@ -34,6 +34,7 @@
 
 #include "dex_ir_builder.h"
 #include "dex_file-inl.h"
+#include "dex_file_verifier.h"
 #include "dex_instruction-inl.h"
 #include "dex_visualize.h"
 #include "dex_writer.h"
@@ -1745,6 +1746,11 @@
     return;
   }
 
+  if (options_.show_section_statistics_) {
+    ShowDexSectionStatistics(header_, dex_file_index);
+    return;
+  }
+
   // Dump dex file.
   if (options_.dump_) {
     DumpDexFile();
@@ -1756,6 +1762,16 @@
       LayoutOutputFile(dex_file);
     }
     OutputDexFile(dex_file->GetLocation());
+    // Verify the output dex file is ok on debug builds.
+    if (kIsDebugBuild) {
+      std::string error_msg;
+      DCHECK(DexFileVerifier::Verify(dex_file,
+                                     dex_file->Begin(),
+                                     dex_file->Size(),
+                                     dex_file->GetLocation().c_str(),
+                                     false,
+                                     &error_msg));
+    }
   }
 }
 
diff --git a/dexlayout/dexlayout.h b/dexlayout/dexlayout.h
index 3918706..74b5253 100644
--- a/dexlayout/dexlayout.h
+++ b/dexlayout/dexlayout.h
@@ -56,6 +56,7 @@
   bool show_annotations_ = false;
   bool show_file_headers_ = false;
   bool show_section_headers_ = false;
+  bool show_section_statistics_ = false;
   bool verbose_ = false;
   bool visualize_pattern_ = false;
   OutputFormat output_format_ = kOutputPlain;
diff --git a/dexlayout/dexlayout_main.cc b/dexlayout/dexlayout_main.cc
index ad599ae..3eac660 100644
--- a/dexlayout/dexlayout_main.cc
+++ b/dexlayout/dexlayout_main.cc
@@ -57,6 +57,7 @@
   fprintf(stderr, " -o : output file name (defaults to stdout)\n");
   fprintf(stderr, " -p : profile file name (defaults to no profile)\n");
   fprintf(stderr, " -s : visualize reference pattern\n");
+  fprintf(stderr, " -t : display file section sizes\n");
   fprintf(stderr, " -w : output dex directory \n");
 }
 
@@ -75,7 +76,7 @@
 
   // Parse all arguments.
   while (1) {
-    const int ic = getopt(argc, argv, "abcdefghil:mo:p:sw:");
+    const int ic = getopt(argc, argv, "abcdefghil:mo:p:stw:");
     if (ic < 0) {
       break;  // done
     }
@@ -127,6 +128,10 @@
         options.visualize_pattern_ = true;
         options.verbose_ = false;
         break;
+      case 't':  // display section statistics
+        options.show_section_statistics_ = true;
+        options.verbose_ = false;
+        break;
       case 'w':  // output dex files directory
         options.output_dex_directory_ = optarg;
         break;
diff --git a/dexlayout/dexlayout_test.cc b/dexlayout/dexlayout_test.cc
index 952909c..bd6548e 100644
--- a/dexlayout/dexlayout_test.cc
+++ b/dexlayout/dexlayout_test.cc
@@ -95,6 +95,29 @@
     "AAABIAAAAgAAADwBAAABEAAAAgAAANwBAAACIAAAEQAAAOoBAAADIAAAAgAAAKYCAAAAIAAAAQAA"
     "ALkCAAAAEAAAAQAAAMgCAAA=";
 
+// Dex file with an unreferenced catch handler at end of code item.
+// Constructed by building a dex file with try/catch blocks and hex editing.
+static const char kUnreferencedEndingCatchHandlerInputDex[] =
+    "ZGV4CjAzNQCEflufI6xGTDDRmLpbfYi6ujPrDLIwvYcEBAAAcAAAAHhWNBIAAAAAAAAAAGQDAAAT"
+    "AAAAcAAAAAgAAAC8AAAAAwAAANwAAAABAAAAAAEAAAUAAAAIAQAAAQAAADABAAC0AgAAUAEAAE4C"
+    "AABWAgAAXgIAAGYCAAB4AgAAhwIAAJ4CAAC1AgAAyQIAAN0CAADxAgAA9wIAAP0CAAAAAwAABAMA"
+    "ABkDAAAcAwAAIgMAACcDAAAEAAAABQAAAAYAAAAHAAAACAAAAAkAAAAMAAAADgAAAAwAAAAGAAAA"
+    "AAAAAA0AAAAGAAAAQAIAAA0AAAAGAAAASAIAAAUAAQARAAAAAAAAAAAAAAAAAAAADwAAAAAAAgAQ"
+    "AAAAAQABABIAAAADAAAAAAAAAAAAAAABAAAAAwAAAAAAAAADAAAAAAAAAFADAAAAAAAAAQABAAEA"
+    "AAAwAwAABAAAAHAQBAAAAA4AAgAAAAIAAgA1AwAAIQAAAGIAAAAaAQoAbiADABAAYgAAABoBCwBu"
+    "IAMAEAAOAA0AYgAAABoBAQBuIAMAEAAo8A0AYgAAABoBAgBuIAMAEAAo7gAAAAAAAAcAAQAHAAAA"
+    "BwABAAIBAg8BAhgAAwABAAIAAgBCAwAAIQAAAGIAAAAaAQoAbiADABAAYgAAABoBCwBuIAMAEAAO"
+    "AA0AYgAAABoBAQBuIAMAEAAo8A0AYgAAABoBAgBuIAMAEAAo7gAAAAAAAAcAAQAHAAAABwABAAIB"
+    "Ag8BAhgAAQAAAAQAAAABAAAABwAGPGluaXQ+AAZDYXRjaDEABkNhdGNoMgAQSGFuZGxlclRlc3Qu"
+    "amF2YQANTEhhbmRsZXJUZXN0OwAVTGphdmEvaW8vUHJpbnRTdHJlYW07ABVMamF2YS9sYW5nL0V4"
+    "Y2VwdGlvbjsAEkxqYXZhL2xhbmcvT2JqZWN0OwASTGphdmEvbGFuZy9TdHJpbmc7ABJMamF2YS9s"
+    "YW5nL1N5c3RlbTsABFRyeTEABFRyeTIAAVYAAlZMABNbTGphdmEvbGFuZy9TdHJpbmc7AAFhAARt"
+    "YWluAANvdXQAB3ByaW50bG4AAQAHDgAEAAcOfHsCeB0eih4AEQEABw59ewJ3HR6LHgAAAAMAAIGA"
+    "BNACAQnoAgEJ1AMAAA0AAAAAAAAAAQAAAAAAAAABAAAAEwAAAHAAAAACAAAACAAAALwAAAADAAAA"
+    "AwAAANwAAAAEAAAAAQAAAAABAAAFAAAABQAAAAgBAAAGAAAAAQAAADABAAABIAAAAwAAAFABAAAB"
+    "EAAAAgAAAEACAAACIAAAEwAAAE4CAAADIAAAAwAAADADAAAAIAAAAQAAAFADAAAAEAAAAQAAAGQD"
+    "AAA=";
+
 // Dex file with multiple code items that have the same debug_info_off_. Constructed by a modified
 // dexlayout on XandY.
 static const char kDexFileDuplicateOffset[] =
@@ -390,6 +413,14 @@
                                            kUnreferenced0SizeCatchHandlerInputDex)) << error_msg;
 }
 
+TEST_F(DexLayoutTest, UnreferencedEndingCatchHandler) {
+  // Disable test on target.
+  TEST_DISABLED_FOR_TARGET();
+  std::string error_msg;
+  ASSERT_TRUE(UnreferencedCatchHandlerExec(&error_msg,
+                                           kUnreferencedEndingCatchHandlerInputDex)) << error_msg;
+}
+
 TEST_F(DexLayoutTest, DuplicateOffset) {
   ScratchFile temp;
   WriteBase64ToFile(kDexFileDuplicateOffset, temp.GetFile());
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index becb827..e767023 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -2210,13 +2210,13 @@
           ScopedIndentation indent2(&state->vios_);
           auto* resolved_fields = dex_cache->GetResolvedFields();
           for (size_t i = 0, length = dex_cache->NumResolvedFields(); i < length; ++i) {
-            auto* elem = mirror::DexCache::GetElementPtrSize(
-                resolved_fields, i, image_pointer_size);
+            auto* elem = mirror::DexCache::GetNativePairPtrSize(
+                resolved_fields, i, image_pointer_size).object;
             size_t run = 0;
             for (size_t j = i + 1;
-                 j != length && elem == mirror::DexCache::GetElementPtrSize(resolved_fields,
-                                                                            j,
-                                                                            image_pointer_size);
+                 j != length &&
+                 elem == mirror::DexCache::GetNativePairPtrSize(
+                     resolved_fields, j, image_pointer_size).object;
                  ++j) {
               ++run;
             }
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 18a6670..dfaae7d 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -534,17 +534,18 @@
         mirror::DexCache::SetElementPtrSize(copy_methods, j, copy, pointer_size);
       }
     }
-    ArtField** orig_fields = orig_dex_cache->GetResolvedFields();
-    ArtField** relocated_fields = RelocatedAddressOfPointer(orig_fields);
+    mirror::FieldDexCacheType* orig_fields = orig_dex_cache->GetResolvedFields();
+    mirror::FieldDexCacheType* relocated_fields = RelocatedAddressOfPointer(orig_fields);
     copy_dex_cache->SetField64<false>(
         mirror::DexCache::ResolvedFieldsOffset(),
         static_cast<int64_t>(reinterpret_cast<uintptr_t>(relocated_fields)));
     if (orig_fields != nullptr) {
-      ArtField** copy_fields = RelocatedCopyOf(orig_fields);
+      mirror::FieldDexCacheType* copy_fields = RelocatedCopyOf(orig_fields);
       for (size_t j = 0, num = orig_dex_cache->NumResolvedFields(); j != num; ++j) {
-        ArtField* orig = mirror::DexCache::GetElementPtrSize(orig_fields, j, pointer_size);
-        ArtField* copy = RelocatedAddressOfPointer(orig);
-        mirror::DexCache::SetElementPtrSize(copy_fields, j, copy, pointer_size);
+        mirror::FieldDexCachePair orig =
+            mirror::DexCache::GetNativePairPtrSize(orig_fields, j, pointer_size);
+        mirror::FieldDexCachePair copy(RelocatedAddressOfPointer(orig.object), orig.index);
+        mirror::DexCache::SetNativePairPtrSize(copy_fields, j, copy, pointer_size);
       }
     }
     mirror::MethodTypeDexCacheType* orig_method_types = orig_dex_cache->GetResolvedMethodTypes();
diff --git a/profman/profile_assistant_test.cc b/profman/profile_assistant_test.cc
index 5a758ae..52f3b52 100644
--- a/profman/profile_assistant_test.cc
+++ b/profman/profile_assistant_test.cc
@@ -182,7 +182,8 @@
   void AssertInlineCaches(ArtMethod* method,
                           const std::set<mirror::Class*>& expected_clases,
                           const ProfileCompilationInfo& info,
-                          bool megamorphic)
+                          bool is_megamorphic,
+                          bool is_missing_types)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     ProfileCompilationInfo::OfflineProfileMethodInfo pmi;
     ASSERT_TRUE(info.GetMethod(method->GetDexFile()->GetLocation(),
@@ -192,7 +193,8 @@
     ASSERT_EQ(pmi.inline_caches.size(), 1u);
     ProfileCompilationInfo::DexPcData dex_pc_data = pmi.inline_caches.begin()->second;
 
-    ASSERT_EQ(dex_pc_data.is_megamorphic, megamorphic);
+    ASSERT_EQ(dex_pc_data.is_megamorphic, is_megamorphic);
+    ASSERT_EQ(dex_pc_data.is_missing_types, is_missing_types);
     ASSERT_EQ(expected_clases.size(), dex_pc_data.classes.size());
     size_t found = 0;
     for (mirror::Class* it : expected_clases) {
@@ -482,6 +484,7 @@
     "LTestInline;->inlineMonomorphic(LSuper;)I+LSubA;",
     "LTestInline;->inlinePolymorphic(LSuper;)I+LSubA;,LSubB;,LSubC;",
     "LTestInline;->inlineMegamorphic(LSuper;)I+LSubA;,LSubB;,LSubC;,LSubD;,LSubE;",
+    "LTestInline;->inlineMissingTypes(LSuper;)I+missing_types",
     "LTestInline;->noInlineCache(LSuper;)I"
   };
   std::string input_file_contents;
@@ -521,7 +524,11 @@
     ASSERT_TRUE(inline_monomorphic != nullptr);
     std::set<mirror::Class*> expected_monomorphic;
     expected_monomorphic.insert(sub_a);
-    AssertInlineCaches(inline_monomorphic, expected_monomorphic, info, /*megamorphic*/ false);
+    AssertInlineCaches(inline_monomorphic,
+                       expected_monomorphic,
+                       info,
+                       /*megamorphic*/false,
+                       /*missing_types*/false);
   }
 
   {
@@ -534,7 +541,11 @@
     expected_polymorphic.insert(sub_a);
     expected_polymorphic.insert(sub_b);
     expected_polymorphic.insert(sub_c);
-    AssertInlineCaches(inline_polymorhic, expected_polymorphic, info, /*megamorphic*/ false);
+    AssertInlineCaches(inline_polymorhic,
+                       expected_polymorphic,
+                       info,
+                       /*megamorphic*/false,
+                       /*missing_types*/false);
   }
 
   {
@@ -544,7 +555,25 @@
                                                      "inlineMegamorphic");
     ASSERT_TRUE(inline_megamorphic != nullptr);
     std::set<mirror::Class*> expected_megamorphic;
-    AssertInlineCaches(inline_megamorphic, expected_megamorphic, info, /*megamorphic*/ true);
+    AssertInlineCaches(inline_megamorphic,
+                       expected_megamorphic,
+                       info,
+                       /*megamorphic*/true,
+                       /*missing_types*/false);
+  }
+
+  {
+    // Verify that method inlineMissingTypes has the expected inline caches and nothing else.
+    ArtMethod* inline_missing_types = GetVirtualMethod(class_loader,
+                                                       "LTestInline;",
+                                                       "inlineMissingTypes");
+    ASSERT_TRUE(inline_missing_types != nullptr);
+    std::set<mirror::Class*> expected_missing_types;
+    AssertInlineCaches(inline_missing_types,
+                       expected_missing_types,
+                       info,
+                       /*megamorphic*/false,
+                       /*missing_types*/true);
   }
 
   {
diff --git a/profman/profman.cc b/profman/profman.cc
index a99a0ea..f7316cc 100644
--- a/profman/profman.cc
+++ b/profman/profman.cc
@@ -139,6 +139,7 @@
 
 // Separators used when parsing human friendly representation of profiles.
 static const std::string kMethodSep = "->";
+static const std::string kMissingTypesMarker = "missing_types";
 static constexpr char kProfileParsingInlineChacheSep = '+';
 static constexpr char kProfileParsingTypeSep = ',';
 static constexpr char kProfileParsingFirstCharInSignature = '(';
@@ -624,8 +625,11 @@
 
   // Process a line defining a class or a method and its inline caches.
   // Upon success return true and add the class or the method info to profile.
-  // The format of the method line is:
+  // The possible line formats are:
+  // "LJustTheClass;".
   // "LTestInline;->inlinePolymorphic(LSuper;)I+LSubA;,LSubB;,LSubC;".
+  // "LTestInline;->inlineMissingTypes(LSuper;)I+missing_types".
+  // "LTestInline;->inlineNoInlineCaches(LSuper;)I".
   // The method and classes are searched only in the given dex files.
   bool ProcessLine(const std::vector<std::unique_ptr<const DexFile>>& dex_files,
                    const std::string& line,
@@ -642,7 +646,7 @@
 
     ProfileMethodInfo::ProfileClassReference class_ref;
     if (!FindClass(dex_files, klass, &class_ref)) {
-      LOG(ERROR) << "Could not find class: " << klass;
+      LOG(WARNING) << "Could not find class: " << klass;
       return false;
     }
 
@@ -664,10 +668,14 @@
     std::vector<std::string> inline_cache_elems;
 
     std::vector<std::string> method_elems;
+    bool is_missing_types = false;
     Split(method_str, kProfileParsingInlineChacheSep, &method_elems);
     if (method_elems.size() == 2) {
       method_spec = method_elems[0];
-      Split(method_elems[1], kProfileParsingTypeSep, &inline_cache_elems);
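+      // The inline cache part is either the missing_types marker or a comma-separated class list.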
+      is_missing_types = method_elems[1] == kMissingTypesMarker;
+      if (!is_missing_types) {
+        Split(method_elems[1], kProfileParsingTypeSep, &inline_cache_elems);
+      }
     } else if (method_elems.size() == 1) {
       method_spec = method_elems[0];
     } else {
@@ -689,7 +697,7 @@
       }
     }
     std::vector<ProfileMethodInfo::ProfileInlineCache> inline_caches;
-    inline_caches.emplace_back(dex_pc, classes);
+    inline_caches.emplace_back(dex_pc, is_missing_types, classes);
     std::vector<ProfileMethodInfo> pmi;
     pmi.emplace_back(class_ref.dex_file, method_index, inline_caches);
 
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 9958814..d075c58 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -99,6 +99,7 @@
         "intern_table.cc",
         "interpreter/interpreter.cc",
         "interpreter/interpreter_common.cc",
+        "interpreter/interpreter_intrinsics.cc",
         "interpreter/interpreter_switch_impl.cc",
         "interpreter/unstarted_runtime.cc",
         "java_vm_ext.cc",
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index daa2dff..923ff4f 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -47,24 +47,6 @@
   return instr_size;
 }
 
-void FaultManager::HandleNestedSignal(int sig ATTRIBUTE_UNUSED, siginfo_t* info ATTRIBUTE_UNUSED,
-                                      void* context) {
-  // Note that in this handler we set up the registers and return to
-  // longjmp directly rather than going through an assembly language stub.  The
-  // reason for this is that longjmp is (currently) in ARM mode and that would
-  // require switching modes in the stub - incurring an unwanted relocation.
-
-  struct ucontext *uc = reinterpret_cast<struct ucontext*>(context);
-  struct sigcontext *sc = reinterpret_cast<struct sigcontext*>(&uc->uc_mcontext);
-  Thread* self = Thread::Current();
-  CHECK(self != nullptr);  // This will cause a SIGABRT if self is null.
-
-  sc->arm_r0 = reinterpret_cast<uintptr_t>(*self->GetNestedSignalState());
-  sc->arm_r1 = 1;
-  sc->arm_pc = reinterpret_cast<uintptr_t>(longjmp);
-  VLOG(signals) << "longjmp address: " << reinterpret_cast<void*>(sc->arm_pc);
-}
-
 void FaultManager::GetMethodAndReturnPcAndSp(siginfo_t* siginfo ATTRIBUTE_UNUSED, void* context,
                                              ArtMethod** out_method,
                                              uintptr_t* out_return_pc, uintptr_t* out_sp) {
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 8531091..72aa785 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1487,6 +1487,7 @@
 .Lconflict_trampoline:
     // Call the runtime stub to populate the ImtConflictTable and jump to the
     // resolved method.
+    mov r0, r12  // Load interface method
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
diff --git a/runtime/arch/arm64/fault_handler_arm64.cc b/runtime/arch/arm64/fault_handler_arm64.cc
index c02be87..193af58 100644
--- a/runtime/arch/arm64/fault_handler_arm64.cc
+++ b/runtime/arch/arm64/fault_handler_arm64.cc
@@ -39,21 +39,6 @@
 
 namespace art {
 
-void FaultManager::HandleNestedSignal(int sig ATTRIBUTE_UNUSED, siginfo_t* info ATTRIBUTE_UNUSED,
-                                      void* context) {
-  // To match the case used in ARM we return directly to the longjmp function
-  // rather than through a trivial assembly language stub.
-
-  struct ucontext *uc = reinterpret_cast<struct ucontext*>(context);
-  struct sigcontext *sc = reinterpret_cast<struct sigcontext*>(&uc->uc_mcontext);
-  Thread* self = Thread::Current();
-  CHECK(self != nullptr);       // This will cause a SIGABRT if self is null.
-
-  sc->regs[0] = reinterpret_cast<uintptr_t>(*self->GetNestedSignalState());
-  sc->regs[1] = 1;
-  sc->pc = reinterpret_cast<uintptr_t>(longjmp);
-}
-
 void FaultManager::GetMethodAndReturnPcAndSp(siginfo_t* siginfo ATTRIBUTE_UNUSED, void* context,
                                              ArtMethod** out_method,
                                              uintptr_t* out_return_pc, uintptr_t* out_sp) {
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 7cb50b7..5b5d2ef 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1966,6 +1966,7 @@
 .Lconflict_trampoline:
     // Call the runtime stub to populate the ImtConflictTable and jump to the
     // resolved method.
+    mov x0, xIP0  // Load interface method
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
diff --git a/runtime/arch/mips/fault_handler_mips.cc b/runtime/arch/mips/fault_handler_mips.cc
index 1792f31..f9c19e8 100644
--- a/runtime/arch/mips/fault_handler_mips.cc
+++ b/runtime/arch/mips/fault_handler_mips.cc
@@ -35,10 +35,6 @@
 
 namespace art {
 
-void FaultManager::HandleNestedSignal(int sig ATTRIBUTE_UNUSED, siginfo_t* info ATTRIBUTE_UNUSED,
-                                      void* context ATTRIBUTE_UNUSED) {
-}
-
 void FaultManager::GetMethodAndReturnPcAndSp(siginfo_t* siginfo, void* context,
                                              ArtMethod** out_method,
                                              uintptr_t* out_return_pc, uintptr_t* out_sp) {
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 4f7b495..5d61539 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1763,6 +1763,7 @@
 
 .Lconflict_trampoline:
     # Call the runtime stub to populate the ImtConflictTable and jump to the resolved method.
+    move    $a0, $t7                                         # Load interface method.
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
diff --git a/runtime/arch/mips64/fault_handler_mips64.cc b/runtime/arch/mips64/fault_handler_mips64.cc
index 709cab5..d668d3a 100644
--- a/runtime/arch/mips64/fault_handler_mips64.cc
+++ b/runtime/arch/mips64/fault_handler_mips64.cc
@@ -35,10 +35,6 @@
 
 namespace art {
 
-void FaultManager::HandleNestedSignal(int sig ATTRIBUTE_UNUSED, siginfo_t* info ATTRIBUTE_UNUSED,
-                                      void* context ATTRIBUTE_UNUSED) {
-}
-
 void FaultManager::GetMethodAndReturnPcAndSp(siginfo_t* siginfo, void* context,
                                              ArtMethod** out_method,
                                              uintptr_t* out_return_pc, uintptr_t* out_sp) {
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 28d7c77..3ee9c4a 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1715,6 +1715,7 @@
 
 .Lconflict_trampoline:
     # Call the runtime stub to populate the ImtConflictTable and jump to the resolved method.
+    move   $a0, $t0                                          # Load interface method.
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END art_quick_imt_conflict_trampoline
 
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index a4d6bb4..f407ebf 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -75,12 +75,6 @@
 extern "C" void art_quick_throw_stack_overflow();
 extern "C" void art_quick_test_suspend();
 
-// Note this is different from the others (no underscore on 64 bit mac) due to
-// the way the symbol is defined in the .S file.
-// TODO: fix the symbols for 64 bit mac - there is a double underscore prefix for some
-// of them.
-extern "C" void art_nested_signal_return();
-
 // Get the size of an instruction in bytes.
 // Return 0 if the instruction is not handled.
 static uint32_t GetInstructionSize(const uint8_t* pc) {
@@ -247,21 +241,6 @@
   return pc - startpc;
 }
 
-void FaultManager::HandleNestedSignal(int, siginfo_t*, void* context) {
-  // For the Intel architectures we need to go to an assembly language
-  // stub.  This is because the 32 bit call to longjmp is much different
-  // from the 64 bit ABI call and pushing things onto the stack inside this
-  // handler was unwieldy and ugly.  The use of the stub means we can keep
-  // this code the same for both 32 and 64 bit.
-
-  Thread* self = Thread::Current();
-  CHECK(self != nullptr);  // This will cause a SIGABRT if self is null.
-
-  struct ucontext* uc = reinterpret_cast<struct ucontext*>(context);
-  uc->CTX_JMP_BUF = reinterpret_cast<uintptr_t>(*self->GetNestedSignalState());
-  uc->CTX_EIP = reinterpret_cast<uintptr_t>(art_nested_signal_return);
-}
-
 void FaultManager::GetMethodAndReturnPcAndSp(siginfo_t* siginfo, void* context,
                                              ArtMethod** out_method,
                                              uintptr_t* out_return_pc, uintptr_t* out_sp) {
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 8c907e0..5f38dc8 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1806,6 +1806,7 @@
 .Lconflict_trampoline:
     // Call the runtime stub to populate the ImtConflictTable and jump to the
     // resolved method.
+    movl %edi, %eax  // Load interface method
     POP EDI
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 END_FUNCTION art_quick_imt_conflict_trampoline
@@ -2136,19 +2137,6 @@
     ret
 END_FUNCTION art_quick_string_compareto
 
-// Return from a nested signal:
-// Entry:
-//  eax: address of jmp_buf in TLS
-
-DEFINE_FUNCTION art_nested_signal_return
-    SETUP_GOT_NOSAVE ebx            // sets %ebx for call into PLT
-    movl LITERAL(1), %ecx
-    PUSH ecx                        // second arg to longjmp (1)
-    PUSH eax                        // first arg to longjmp (jmp_buf)
-    call PLT_SYMBOL(longjmp)
-    UNREACHABLE
-END_FUNCTION art_nested_signal_return
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
 // `reg`, saving and restoring all caller-save registers.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index f1be52e..e87b165 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1662,6 +1662,7 @@
 .Lconflict_trampoline:
     // Call the runtime stub to populate the ImtConflictTable and jump to the
     // resolved method.
+    movq %r10, %rdi  // Load interface method
     INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
 #endif  // __APPLE__
 END_FUNCTION art_quick_imt_conflict_trampoline
@@ -2099,18 +2100,6 @@
     ret
 END_FUNCTION art_quick_instance_of
 
-
-// Return from a nested signal:
-// Entry:
-//  rdi: address of jmp_buf in TLS
-
-DEFINE_FUNCTION art_nested_signal_return
-                                    // first arg to longjmp is already in correct register
-    movq LITERAL(1), %rsi           // second arg to longjmp (1)
-    call PLT_SYMBOL(longjmp)
-    UNREACHABLE
-END_FUNCTION art_nested_signal_return
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
 // `reg`, saving and restoring all caller-save registers.
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index db43319..5aede38 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -15,6 +15,7 @@
  */
 
 #include <algorithm>
+#include <cstddef>
 #include <iomanip>
 #include <numeric>
 
@@ -27,7 +28,7 @@
 
 namespace art {
 
-static constexpr size_t kMemoryToolRedZoneBytes = 8;
+constexpr size_t kMemoryToolRedZoneBytes = 8;
 constexpr size_t Arena::kDefaultSize;
 
 template <bool kCount>
@@ -168,23 +169,75 @@
 Arena::Arena() : bytes_allocated_(0), next_(nullptr) {
 }
 
+class MallocArena FINAL : public Arena {
+ public:
+  explicit MallocArena(size_t size = Arena::kDefaultSize);
+  virtual ~MallocArena();
+ private:
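+  // calloc() only guarantees alignment suitable for std::max_align_t; if the arena needs
+  // stronger alignment, over-allocate by the difference so the start can be aligned up manually.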
+  static constexpr size_t RequiredOverallocation() {
+    return (alignof(std::max_align_t) < ArenaAllocator::kArenaAlignment)
+        ? ArenaAllocator::kArenaAlignment - alignof(std::max_align_t)
+        : 0u;
+  }
+
+  uint8_t* unaligned_memory_;
+};
+
 MallocArena::MallocArena(size_t size) {
-  memory_ = reinterpret_cast<uint8_t*>(calloc(1, size));
-  CHECK(memory_ != nullptr);  // Abort on OOM.
-  DCHECK_ALIGNED(memory_, ArenaAllocator::kAlignment);
+  // We need to guarantee kArenaAlignment aligned allocation for the new arena.
+  // TODO: Use std::aligned_alloc() when it becomes available with C++17.
+  constexpr size_t overallocation = RequiredOverallocation();
+  unaligned_memory_ = reinterpret_cast<uint8_t*>(calloc(1, size + overallocation));
+  CHECK(unaligned_memory_ != nullptr);  // Abort on OOM.
+  DCHECK_ALIGNED(unaligned_memory_, alignof(std::max_align_t));
+  if (overallocation == 0u) {
+    memory_ = unaligned_memory_;
+  } else {
+    memory_ = AlignUp(unaligned_memory_, ArenaAllocator::kArenaAlignment);
+    if (UNLIKELY(RUNNING_ON_MEMORY_TOOL > 0)) {
+      size_t head = memory_ - unaligned_memory_;
+      size_t tail = overallocation - head;
+      MEMORY_TOOL_MAKE_NOACCESS(unaligned_memory_, head);
+      MEMORY_TOOL_MAKE_NOACCESS(memory_ + size, tail);
+    }
+  }
+  DCHECK_ALIGNED(memory_, ArenaAllocator::kArenaAlignment);
   size_ = size;
 }
 
 MallocArena::~MallocArena() {
-  free(reinterpret_cast<void*>(memory_));
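+  // Unpoison any head/tail padding before releasing the original unaligned allocation.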
+  constexpr size_t overallocation = RequiredOverallocation();
+  if (overallocation != 0u && UNLIKELY(RUNNING_ON_MEMORY_TOOL > 0)) {
+    size_t head = memory_ - unaligned_memory_;
+    size_t tail = overallocation - head;
+    MEMORY_TOOL_MAKE_UNDEFINED(unaligned_memory_, head);
+    MEMORY_TOOL_MAKE_UNDEFINED(memory_ + size_, tail);
+  }
+  free(reinterpret_cast<void*>(unaligned_memory_));
 }
 
+class MemMapArena FINAL : public Arena {
+ public:
+  MemMapArena(size_t size, bool low_4gb, const char* name);
+  virtual ~MemMapArena();
+  void Release() OVERRIDE;
+
+ private:
+  std::unique_ptr<MemMap> map_;
+};
+
 MemMapArena::MemMapArena(size_t size, bool low_4gb, const char* name) {
+  // Round up to a full page as that's the smallest unit of allocation for mmap()
+  // and we want to be able to use all memory that we actually allocate.
+  size = RoundUp(size, kPageSize);
   std::string error_msg;
   map_.reset(MemMap::MapAnonymous(
       name, nullptr, size, PROT_READ | PROT_WRITE, low_4gb, false, &error_msg));
   CHECK(map_.get() != nullptr) << error_msg;
   memory_ = map_->Begin();
+  static_assert(ArenaAllocator::kArenaAlignment <= kPageSize,
+                "Arena should not need stronger alignment than kPageSize.");
+  DCHECK_ALIGNED(memory_, ArenaAllocator::kArenaAlignment);
   size_ = map_->Size();
 }
 
@@ -332,20 +385,7 @@
   ArenaAllocatorStats::RecordAlloc(rounded_bytes, kind);
   uint8_t* ret;
   if (UNLIKELY(rounded_bytes > static_cast<size_t>(end_ - ptr_))) {
-    ret = AllocFromNewArena(rounded_bytes);
-    uint8_t* noaccess_begin = ret + bytes;
-    uint8_t* noaccess_end;
-    if (ret == arena_head_->Begin()) {
-      DCHECK(ptr_ - rounded_bytes == ret);
-      noaccess_end = end_;
-    } else {
-      // We're still using the old arena but `ret` comes from a new one just after it.
-      DCHECK(arena_head_->next_ != nullptr);
-      DCHECK(ret == arena_head_->next_->Begin());
-      DCHECK_EQ(rounded_bytes, arena_head_->next_->GetBytesAllocated());
-      noaccess_end = arena_head_->next_->End();
-    }
-    MEMORY_TOOL_MAKE_NOACCESS(noaccess_begin, noaccess_end - noaccess_begin);
+    ret = AllocFromNewArenaWithMemoryTool(rounded_bytes);
   } else {
     ret = ptr_;
     ptr_ += rounded_bytes;
@@ -356,6 +396,30 @@
   return ret;
 }
 
+void* ArenaAllocator::AllocWithMemoryToolAlign16(size_t bytes, ArenaAllocKind kind) {
+  // We mark all memory for a newly retrieved arena as inaccessible and then
+  // mark only the actually allocated memory as defined. That leaves red zones
+  // and padding between allocations marked as inaccessible.
+  size_t rounded_bytes = bytes + kMemoryToolRedZoneBytes;
+  DCHECK_ALIGNED(rounded_bytes, 8);  // `bytes` is 16-byte aligned, red zone is 8-byte aligned.
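+  // Skip ahead to the next 16-byte boundary; the padding bytes remain inaccessible.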
+  uintptr_t padding =
+      RoundUp(reinterpret_cast<uintptr_t>(ptr_), 16) - reinterpret_cast<uintptr_t>(ptr_);
+  ArenaAllocatorStats::RecordAlloc(rounded_bytes, kind);
+  uint8_t* ret;
+  if (UNLIKELY(padding + rounded_bytes > static_cast<size_t>(end_ - ptr_))) {
+    static_assert(kArenaAlignment >= 16, "Expecting sufficient alignment for new Arena.");
+    ret = AllocFromNewArenaWithMemoryTool(rounded_bytes);
+  } else {
+    ptr_ += padding;  // Leave padding inaccessible.
+    ret = ptr_;
+    ptr_ += rounded_bytes;
+  }
+  MEMORY_TOOL_MAKE_DEFINED(ret, bytes);
+  // Check that the memory is already zeroed out.
+  DCHECK(std::all_of(ret, ret + bytes, [](uint8_t val) { return val == 0u; }));
+  return ret;
+}
+
 ArenaAllocator::~ArenaAllocator() {
   // Reclaim all the arenas by giving them back to the thread pool.
   UpdateBytesAllocated();
@@ -386,6 +450,24 @@
   return new_arena->Begin();
 }
 
+uint8_t* ArenaAllocator::AllocFromNewArenaWithMemoryTool(size_t bytes) {
+  uint8_t* ret = AllocFromNewArena(bytes);
+  uint8_t* noaccess_begin = ret + bytes;
+  uint8_t* noaccess_end;
+  if (ret == arena_head_->Begin()) {
+    DCHECK(ptr_ - bytes == ret);
+    noaccess_end = end_;
+  } else {
+    // We're still using the old arena but `ret` comes from a new one just after it.
+    DCHECK(arena_head_->next_ != nullptr);
+    DCHECK(ret == arena_head_->next_->Begin());
+    DCHECK_EQ(bytes, arena_head_->next_->GetBytesAllocated());
+    noaccess_end = arena_head_->next_->End();
+  }
+  MEMORY_TOOL_MAKE_NOACCESS(noaccess_begin, noaccess_end - noaccess_begin);
+  return ret;
+}
+
 bool ArenaAllocator::Contains(const void* ptr) const {
   if (ptr >= begin_ && ptr < end_) {
     return true;
@@ -398,7 +480,9 @@
   return false;
 }
 
-MemStats::MemStats(const char* name, const ArenaAllocatorStats* stats, const Arena* first_arena,
+MemStats::MemStats(const char* name,
+                   const ArenaAllocatorStats* stats,
+                   const Arena* first_arena,
                    ssize_t lost_bytes_adjustment)
     : name_(name),
       stats_(stats),
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index 245ab3b..c39429c 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -34,7 +34,6 @@
 class ArenaAllocator;
 class ArenaStack;
 class ScopedArenaAllocator;
-class MemMap;
 class MemStats;
 
 template <typename T>
@@ -89,6 +88,7 @@
   kArenaAllocRegisterAllocator,
   kArenaAllocRegisterAllocatorValidate,
   kArenaAllocStackMapStream,
+  kArenaAllocVectorNode,
   kArenaAllocCodeGenerator,
   kArenaAllocAssembler,
   kArenaAllocParallelMoveResolver,
@@ -243,22 +243,6 @@
   DISALLOW_COPY_AND_ASSIGN(Arena);
 };
 
-class MallocArena FINAL : public Arena {
- public:
-  explicit MallocArena(size_t size = Arena::kDefaultSize);
-  virtual ~MallocArena();
-};
-
-class MemMapArena FINAL : public Arena {
- public:
-  MemMapArena(size_t size, bool low_4gb, const char* name);
-  virtual ~MemMapArena();
-  void Release() OVERRIDE;
-
- private:
-  std::unique_ptr<MemMap> map_;
-};
-
 class ArenaPool {
  public:
   explicit ArenaPool(bool use_malloc = true,
@@ -318,8 +302,31 @@
     return ret;
   }
 
+  // Returns zeroed memory.
+  void* AllocAlign16(size_t bytes, ArenaAllocKind kind = kArenaAllocMisc) ALWAYS_INLINE {
+    // It is an error to request 16-byte aligned allocation of unaligned size.
+    DCHECK_ALIGNED(bytes, 16);
+    if (UNLIKELY(IsRunningOnMemoryTool())) {
+      return AllocWithMemoryToolAlign16(bytes, kind);
+    }
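+    // Compute the padding needed to bring ptr_ up to the next 16-byte boundary.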
+    uintptr_t padding =
+        RoundUp(reinterpret_cast<uintptr_t>(ptr_), 16) - reinterpret_cast<uintptr_t>(ptr_);
+    ArenaAllocatorStats::RecordAlloc(bytes, kind);
+    if (UNLIKELY(padding + bytes > static_cast<size_t>(end_ - ptr_))) {
+      static_assert(kArenaAlignment >= 16, "Expecting sufficient alignment for new Arena.");
+      return AllocFromNewArena(bytes);
+    }
+    ptr_ += padding;
+    uint8_t* ret = ptr_;
+    DCHECK_ALIGNED(ret, 16);
+    ptr_ += bytes;
+    return ret;
+  }
+
   // Realloc never frees the input pointer, it is the caller's job to do this if necessary.
-  void* Realloc(void* ptr, size_t ptr_size, size_t new_size,
+  void* Realloc(void* ptr,
+                size_t ptr_size,
+                size_t new_size,
                 ArenaAllocKind kind = kArenaAllocMisc) ALWAYS_INLINE {
     DCHECK_GE(new_size, ptr_size);
     DCHECK_EQ(ptr == nullptr, ptr_size == 0u);
@@ -370,12 +377,17 @@
 
   bool Contains(const void* ptr) const;
 
-  static constexpr size_t kAlignment = 8;
+  // The alignment guaranteed for individual allocations.
+  static constexpr size_t kAlignment = 8u;
+
+  // The alignment required for the whole Arena rather than individual allocations.
+  static constexpr size_t kArenaAlignment = 16u;
 
  private:
   void* AllocWithMemoryTool(size_t bytes, ArenaAllocKind kind);
+  void* AllocWithMemoryToolAlign16(size_t bytes, ArenaAllocKind kind);
   uint8_t* AllocFromNewArena(size_t bytes);
-
+  uint8_t* AllocFromNewArenaWithMemoryTool(size_t bytes);
 
   void UpdateBytesAllocated();
 
@@ -395,7 +407,9 @@
 
 class MemStats {
  public:
-  MemStats(const char* name, const ArenaAllocatorStats* stats, const Arena* first_arena,
+  MemStats(const char* name,
+           const ArenaAllocatorStats* stats,
+           const Arena* first_arena,
            ssize_t lost_bytes_adjustment = 0);
   void Dump(std::ostream& os) const;
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 24846e5..b0394a5 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -46,7 +46,6 @@
 ReaderWriterMutex* Locks::heap_bitmap_lock_ = nullptr;
 Mutex* Locks::instrument_entrypoints_lock_ = nullptr;
 Mutex* Locks::intern_table_lock_ = nullptr;
-Mutex* Locks::jdwp_event_list_lock_ = nullptr;
 Mutex* Locks::jni_function_table_lock_ = nullptr;
 Mutex* Locks::jni_libraries_lock_ = nullptr;
 Mutex* Locks::logging_lock_ = nullptr;
@@ -74,6 +73,7 @@
 Mutex* Locks::jni_weak_globals_lock_ = nullptr;
 ReaderWriterMutex* Locks::dex_lock_ = nullptr;
 std::vector<BaseMutex*> Locks::expected_mutexes_on_weak_ref_access_;
+Atomic<const BaseMutex*> Locks::expected_mutexes_on_weak_ref_access_guard_;
 
 struct AllMutexData {
   // A guard for all_mutexes_ that's not a mutex (Mutexes must CAS to acquire and busy wait).
@@ -118,6 +118,26 @@
   const BaseMutex* const mutex_;
 };
 
+class Locks::ScopedExpectedMutexesOnWeakRefAccessLock FINAL {
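+  // Spin lock on the atomic guard variable: acquires it with a CAS and busy-waits with
+  // NanoSleep() instead of using another Mutex, so it is safe while the list of mutexes
+  // expected on weak-ref access is being modified.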
+ public:
+  explicit ScopedExpectedMutexesOnWeakRefAccessLock(const BaseMutex* mutex) : mutex_(mutex) {
+    while (!Locks::expected_mutexes_on_weak_ref_access_guard_.CompareExchangeWeakAcquire(0,
+                                                                                         mutex)) {
+      NanoSleep(100);
+    }
+  }
+
+  ~ScopedExpectedMutexesOnWeakRefAccessLock() {
+    while (!Locks::expected_mutexes_on_weak_ref_access_guard_.CompareExchangeWeakRelease(mutex_,
+                                                                                         0)) {
+      NanoSleep(100);
+    }
+  }
+
+ private:
+  const BaseMutex* const mutex_;
+};
+
 // Scoped class that generates events at the beginning and end of lock contention.
 class ScopedContentionRecorder FINAL : public ValueObject {
  public:
@@ -999,7 +1019,6 @@
     DCHECK(verifier_deps_lock_ != nullptr);
     DCHECK(host_dlopen_handles_lock_ != nullptr);
     DCHECK(intern_table_lock_ != nullptr);
-    DCHECK(jdwp_event_list_lock_ != nullptr);
     DCHECK(jni_function_table_lock_ != nullptr);
     DCHECK(jni_libraries_lock_ != nullptr);
     DCHECK(logging_lock_ != nullptr);
@@ -1042,10 +1061,6 @@
     DCHECK(runtime_shutdown_lock_ == nullptr);
     runtime_shutdown_lock_ = new Mutex("runtime shutdown lock", current_lock_level);
 
-    UPDATE_CURRENT_LOCK_LEVEL(kJdwpEventListLock);
-    DCHECK(jdwp_event_list_lock_ == nullptr);
-    jdwp_event_list_lock_ = new Mutex("JDWP event list lock", current_lock_level);
-
     UPDATE_CURRENT_LOCK_LEVEL(kProfilerLock);
     DCHECK(profiler_lock_ == nullptr);
     profiler_lock_ = new Mutex("profiler lock", current_lock_level);
@@ -1169,14 +1184,9 @@
     #undef UPDATE_CURRENT_LOCK_LEVEL
 
     // List of mutexes that we may hold when accessing a weak ref.
-    dex_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
-    expected_mutexes_on_weak_ref_access_.push_back(dex_lock_);
-    classlinker_classes_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
-    expected_mutexes_on_weak_ref_access_.push_back(classlinker_classes_lock_);
-    jdwp_event_list_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
-    expected_mutexes_on_weak_ref_access_.push_back(jdwp_event_list_lock_);
-    jni_libraries_lock_->SetShouldRespondToEmptyCheckpointRequest(true);
-    expected_mutexes_on_weak_ref_access_.push_back(jni_libraries_lock_);
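+    // Initialization is single-threaded here, so the guard lock is not needed (need_lock = false).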
+    AddToExpectedMutexesOnWeakRefAccess(dex_lock_, /*need_lock*/ false);
+    AddToExpectedMutexesOnWeakRefAccess(classlinker_classes_lock_, /*need_lock*/ false);
+    AddToExpectedMutexesOnWeakRefAccess(jni_libraries_lock_, /*need_lock*/ false);
 
     InitConditions();
   }
@@ -1196,4 +1206,38 @@
   return safe_to_call_abort_cb != nullptr && safe_to_call_abort_cb();
 }
 
+void Locks::AddToExpectedMutexesOnWeakRefAccess(BaseMutex* mutex, bool need_lock) {
+  if (need_lock) {
+    ScopedExpectedMutexesOnWeakRefAccessLock mu(mutex);
+    mutex->SetShouldRespondToEmptyCheckpointRequest(true);
+    expected_mutexes_on_weak_ref_access_.push_back(mutex);
+  } else {
+    mutex->SetShouldRespondToEmptyCheckpointRequest(true);
+    expected_mutexes_on_weak_ref_access_.push_back(mutex);
+  }
+}
+
+void Locks::RemoveFromExpectedMutexesOnWeakRefAccess(BaseMutex* mutex, bool need_lock) {
+  if (need_lock) {
+    ScopedExpectedMutexesOnWeakRefAccessLock mu(mutex);
+    mutex->SetShouldRespondToEmptyCheckpointRequest(false);
+    std::vector<BaseMutex*>& list = expected_mutexes_on_weak_ref_access_;
+    auto it = std::find(list.begin(), list.end(), mutex);
+    DCHECK(it != list.end());
+    list.erase(it);
+  } else {
+    mutex->SetShouldRespondToEmptyCheckpointRequest(false);
+    std::vector<BaseMutex*>& list = expected_mutexes_on_weak_ref_access_;
+    auto it = std::find(list.begin(), list.end(), mutex);
+    DCHECK(it != list.end());
+    list.erase(it);
+  }
+}
+
+bool Locks::IsExpectedOnWeakRefAccess(BaseMutex* mutex) {
+  ScopedExpectedMutexesOnWeakRefAccessLock mu(mutex);
+  std::vector<BaseMutex*>& list = expected_mutexes_on_weak_ref_access_;
+  return std::find(list.begin(), list.end(), mutex) != list.end();
+}
+
 }  // namespace art
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index c59664b..038aeb3 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -516,12 +516,12 @@
 // construction and releases it upon destruction.
 class SCOPED_CAPABILITY ReaderMutexLock {
  public:
-  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) :
+  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) ALWAYS_INLINE :
       self_(self), mu_(mu) {
     mu_.SharedLock(self_);
   }
 
-  ~ReaderMutexLock() RELEASE() {
+  ~ReaderMutexLock() RELEASE() ALWAYS_INLINE {
     mu_.SharedUnlock(self_);
   }
 
@@ -583,6 +583,12 @@
   // Checks for whether it is safe to call Abort() without using locks.
   static bool IsSafeToCallAbortRacy() NO_THREAD_SAFETY_ANALYSIS;
 
+  // Add a mutex to expected_mutexes_on_weak_ref_access_.
+  static void AddToExpectedMutexesOnWeakRefAccess(BaseMutex* mutex, bool need_lock = true);
+  // Remove a mutex from expected_mutexes_on_weak_ref_access_.
+  static void RemoveFromExpectedMutexesOnWeakRefAccess(BaseMutex* mutex, bool need_lock = true);
+  // Check if the given mutex is in expected_mutexes_on_weak_ref_access_.
+  static bool IsExpectedOnWeakRefAccess(BaseMutex* mutex);
 
   // Guards allocation entrypoint instrumenting.
   static Mutex* instrument_entrypoints_lock_;
@@ -630,12 +636,8 @@
   // Guards shutdown of the runtime.
   static Mutex* runtime_shutdown_lock_ ACQUIRED_AFTER(heap_bitmap_lock_);
 
-  static Mutex* jdwp_event_list_lock_
-      ACQUIRED_AFTER(runtime_shutdown_lock_)
-      ACQUIRED_BEFORE(breakpoint_lock_);
-
   // Guards background profiler global state.
-  static Mutex* profiler_lock_ ACQUIRED_AFTER(jdwp_event_list_lock_);
+  static Mutex* profiler_lock_ ACQUIRED_AFTER(runtime_shutdown_lock_);
 
   // Guards trace (ie traceview) requests.
   static Mutex* trace_lock_ ACQUIRED_AFTER(profiler_lock_);
@@ -738,6 +740,8 @@
   // encounter an unexpected mutex on accessing weak refs,
   // Thread::CheckEmptyCheckpointFromWeakRefAccess will detect it.
   static std::vector<BaseMutex*> expected_mutexes_on_weak_ref_access_;
+  static Atomic<const BaseMutex*> expected_mutexes_on_weak_ref_access_guard_;
+  class ScopedExpectedMutexesOnWeakRefAccessLock;
 };
 
 class Roles {
diff --git a/runtime/base/scoped_arena_allocator.h b/runtime/base/scoped_arena_allocator.h
index 55044b3..1a0eb5e 100644
--- a/runtime/base/scoped_arena_allocator.h
+++ b/runtime/base/scoped_arena_allocator.h
@@ -39,8 +39,6 @@
   kFree,
 };
 
-static constexpr size_t kArenaAlignment = 8;
-
 // Holds a list of Arenas for use by ScopedArenaAllocator stack.
 // The memory is returned to the ArenaPool when the ArenaStack is destroyed.
 class ArenaStack : private DebugStackRefCounter, private ArenaAllocatorMemoryTool {
@@ -67,6 +65,9 @@
     return *(reinterpret_cast<ArenaFreeTag*>(ptr) - 1);
   }
 
+  // The alignment guaranteed for individual allocations.
+  static constexpr size_t kAlignment = 8u;
+
  private:
   struct Peak;
   struct Current;
@@ -89,8 +90,8 @@
     if (UNLIKELY(IsRunningOnMemoryTool())) {
       return AllocWithMemoryTool(bytes, kind);
     }
-    // Add kArenaAlignment for the free or used tag. Required to preserve alignment.
-    size_t rounded_bytes = RoundUp(bytes + (kIsDebugBuild ? kArenaAlignment : 0u), kArenaAlignment);
+    // Add kAlignment for the free or used tag. Required to preserve alignment.
+    size_t rounded_bytes = RoundUp(bytes + (kIsDebugBuild ? kAlignment : 0u), kAlignment);
     uint8_t* ptr = top_ptr_;
     if (UNLIKELY(static_cast<size_t>(top_end_ - ptr) < rounded_bytes)) {
       ptr = AllocateFromNextArena(rounded_bytes);
@@ -98,7 +99,7 @@
     CurrentStats()->RecordAlloc(bytes, kind);
     top_ptr_ = ptr + rounded_bytes;
     if (kIsDebugBuild) {
-      ptr += kArenaAlignment;
+      ptr += kAlignment;
       ArenaTagForAllocation(ptr) = ArenaFreeTag::kUsed;
     }
     return ptr;
diff --git a/runtime/class_linker-inl.h b/runtime/class_linker-inl.h
index bd510ca..9ddc6cf 100644
--- a/runtime/class_linker-inl.h
+++ b/runtime/class_linker-inl.h
@@ -161,9 +161,15 @@
   return resolved_method;
 }
 
-inline ArtField* ClassLinker::GetResolvedField(uint32_t field_idx,
-                                               ObjPtr<mirror::DexCache> dex_cache) {
-  return dex_cache->GetResolvedField(field_idx, image_pointer_size_);
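+// Returns the field if it is already in the dex cache, otherwise performs a lookup that does
+// not trigger resolution and may populate the cache; returns null if the field cannot be found.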
+inline ArtField* ClassLinker::LookupResolvedField(uint32_t field_idx,
+                                                  ArtMethod* referrer,
+                                                  bool is_static) {
+  ObjPtr<mirror::DexCache> dex_cache = referrer->GetDexCache();
+  ArtField* field = dex_cache->GetResolvedField(field_idx, image_pointer_size_);
+  if (field == nullptr) {
+    field = LookupResolvedField(field_idx, dex_cache, referrer->GetClassLoader(), is_static);
+  }
+  return field;
 }
 
 inline ArtField* ClassLinker::ResolveField(uint32_t field_idx,
@@ -171,7 +177,8 @@
                                            bool is_static) {
   Thread::PoisonObjectPointersIfDebug();
   ObjPtr<mirror::Class> declaring_class = referrer->GetDeclaringClass();
-  ArtField* resolved_field = GetResolvedField(field_idx, referrer->GetDexCache());
+  ArtField* resolved_field =
+      referrer->GetDexCache()->GetResolvedField(field_idx, image_pointer_size_);
   if (UNLIKELY(resolved_field == nullptr)) {
     StackHandleScope<2> hs(Thread::Current());
     Handle<mirror::DexCache> dex_cache(hs.NewHandle(referrer->GetDexCache()));
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index d232714..ab2b395 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1282,7 +1282,10 @@
           num_types = dex_file->NumTypeIds();
         }
         const size_t num_methods = dex_file->NumMethodIds();
-        const size_t num_fields = dex_file->NumFieldIds();
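+        // The resolved-field cache is now a fixed-size array; cache at most
+        // kDexCacheFieldCacheSize entries.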
+        size_t num_fields = mirror::DexCache::kDexCacheFieldCacheSize;
+        if (dex_file->NumFieldIds() < num_fields) {
+          num_fields = dex_file->NumFieldIds();
+        }
         size_t num_method_types = mirror::DexCache::kDexCacheMethodTypeCacheSize;
         if (dex_file->NumProtoIds() < num_method_types) {
           num_method_types = dex_file->NumProtoIds();
@@ -1326,17 +1329,22 @@
           dex_cache->SetResolvedMethods(methods);
         }
         if (num_fields != 0u) {
-          ArtField** const fields =
-              reinterpret_cast<ArtField**>(raw_arrays + layout.FieldsOffset());
-          for (size_t j = 0; kIsDebugBuild && j < num_fields; ++j) {
-            DCHECK(fields[j] == nullptr);
+          mirror::FieldDexCacheType* const image_resolved_fields = dex_cache->GetResolvedFields();
+          mirror::FieldDexCacheType* const fields =
+              reinterpret_cast<mirror::FieldDexCacheType*>(raw_arrays + layout.FieldsOffset());
+          for (size_t j = 0; j < num_fields; ++j) {
+            DCHECK_EQ(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size_).index,
+                      0u);
+            DCHECK(mirror::DexCache::GetNativePairPtrSize(fields, j, image_pointer_size_).object ==
+                   nullptr);
+            mirror::DexCache::SetNativePairPtrSize(
+                fields,
+                j,
+                mirror::DexCache::GetNativePairPtrSize(image_resolved_fields,
+                                                       j,
+                                                       image_pointer_size_),
+                image_pointer_size_);
           }
-          CopyNonNull(dex_cache->GetResolvedFields(),
-                      num_fields,
-                      fields,
-                      [] (const ArtField* field) {
-                          return field == nullptr;
-                      });
           dex_cache->SetResolvedFields(fields);
         }
         if (num_method_types != 0u) {
@@ -3900,8 +3908,10 @@
   if (!supertype->IsVerified() && !supertype->IsErroneous()) {
     VerifyClass(self, supertype);
   }
-  if (supertype->IsCompileTimeVerified()) {
-    // Either we are verified or we soft failed and need to retry at runtime.
+
+  if (supertype->IsVerified() || supertype->ShouldVerifyAtRuntime()) {
+    // The supertype is either verified, or we soft failed at AOT time.
+    DCHECK(supertype->IsVerified() || Runtime::Current()->IsAotCompiler());
     return true;
   }
   // If we got this far then we have a hard failure.
@@ -3967,13 +3977,16 @@
       return verifier::MethodVerifier::kHardFailure;
     }
 
-    // Don't attempt to re-verify if already sufficiently verified.
+    // Don't attempt to re-verify if already verified.
     if (klass->IsVerified()) {
       EnsureSkipAccessChecksMethods(klass, image_pointer_size_);
       return verifier::MethodVerifier::kNoFailure;
     }
-    if (klass->IsCompileTimeVerified() && Runtime::Current()->IsAotCompiler()) {
-      return verifier::MethodVerifier::kNoFailure;
+
+    // For AOT, don't attempt to re-verify if we have already found we should
+    // verify at runtime.
+    if (Runtime::Current()->IsAotCompiler() && klass->ShouldVerifyAtRuntime()) {
+      return verifier::MethodVerifier::kSoftFailure;
     }
 
     if (klass->GetStatus() == mirror::Class::kStatusResolved) {
@@ -4926,7 +4939,15 @@
     // First we initialize all of iface's super-interfaces recursively.
     for (size_t i = 0; i < num_direct_ifaces; i++) {
       ObjPtr<mirror::Class> super_iface = mirror::Class::GetDirectInterface(self, iface.Get(), i);
-      DCHECK(super_iface != nullptr);
+      if (UNLIKELY(super_iface == nullptr)) {
+        const char* iface_descriptor =
+            iface->GetDexFile().StringByTypeIdx(iface->GetDirectInterfaceTypeIdx(i));
+        LOG(FATAL) << "Check failed: super_iface != nullptr "
+            << "Debug data for bug 34839984: "
+            << iface->PrettyDescriptor() << " iface #" << i << " " << iface_descriptor
+            << " space: " << DescribeSpace(iface.Get())
+            << " loaders: " << DescribeLoaders(iface.Get(), iface_descriptor);
+      }
       if (!super_iface->HasBeenRecursivelyInitialized()) {
         // Recursive step
         handle_super_iface.Assign(super_iface);
@@ -8247,6 +8268,43 @@
   return resolved;
 }
 
+ArtField* ClassLinker::LookupResolvedField(uint32_t field_idx,
+                                           ObjPtr<mirror::DexCache> dex_cache,
+                                           ObjPtr<mirror::ClassLoader> class_loader,
+                                           bool is_static) {
+  const DexFile& dex_file = *dex_cache->GetDexFile();
+  const DexFile::FieldId& field_id = dex_file.GetFieldId(field_idx);
+  ObjPtr<mirror::Class> klass = dex_cache->GetResolvedType(field_id.class_idx_);
+  if (klass == nullptr) {
+    klass = LookupResolvedType(dex_file, field_id.class_idx_, dex_cache, class_loader);
+  }
+  if (klass == nullptr) {
+    // The class has not been resolved yet, so the field is also unresolved.
+    return nullptr;
+  }
+  DCHECK(klass->IsResolved());
+  Thread* self = is_static ? Thread::Current() : nullptr;
+
+  // First try to find a field declared directly by `klass` by the field index.
+  ArtField* resolved_field = is_static
+      ? mirror::Class::FindStaticField(self, klass, dex_cache, field_idx)
+      : klass->FindInstanceField(dex_cache, field_idx);
+
+  if (resolved_field == nullptr) {
+    // If not found in `klass` by field index, search the class hierarchy using the name and type.
+    const char* name = dex_file.GetFieldName(field_id);
+    const char* type = dex_file.GetFieldTypeDescriptor(field_id);
+    resolved_field = is_static
+        ? mirror::Class::FindStaticField(self, klass, name, type)
+        : klass->FindInstanceField(name, type);
+  }
+
+  if (resolved_field != nullptr) {
+    dex_cache->SetResolvedField(field_idx, resolved_field, image_pointer_size_);
+  }
+  return resolved_field;
+}
+
 ArtField* ClassLinker::ResolveField(const DexFile& dex_file,
                                     uint32_t field_idx,
                                     Handle<mirror::DexCache> dex_cache,
@@ -8307,9 +8365,8 @@
     return nullptr;
   }
 
-  StringPiece name(dex_file.StringDataByIdx(field_id.name_idx_));
-  StringPiece type(dex_file.StringDataByIdx(
-      dex_file.GetTypeId(field_id.type_idx_).descriptor_idx_));
+  StringPiece name(dex_file.GetFieldName(field_id));
+  StringPiece type(dex_file.GetFieldTypeDescriptor(field_id));
   resolved = mirror::Class::FindField(self, klass, name, type);
   if (resolved != nullptr) {
     dex_cache->SetResolvedField(field_idx, resolved, image_pointer_size_);
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index a5d26c7..6254acb 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -333,7 +333,7 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::dex_lock_, !Roles::uninterruptible_);
 
-  ArtField* GetResolvedField(uint32_t field_idx, ObjPtr<mirror::DexCache> dex_cache)
+  ArtField* LookupResolvedField(uint32_t field_idx, ArtMethod* referrer, bool is_static)
       REQUIRES_SHARED(Locks::mutator_lock_);
   ArtField* ResolveField(uint32_t field_idx, ArtMethod* referrer, bool is_static)
       REQUIRES_SHARED(Locks::mutator_lock_)
@@ -842,6 +842,13 @@
       REQUIRES(!Locks::classlinker_classes_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Find a field by its field index.
+  ArtField* LookupResolvedField(uint32_t field_idx,
+                                ObjPtr<mirror::DexCache> dex_cache,
+                                ObjPtr<mirror::ClassLoader> class_loader,
+                                bool is_static)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   void RegisterDexFileLocked(const DexFile& dex_file,
                              ObjPtr<mirror::DexCache> dex_cache,
                              ObjPtr<mirror::ClassLoader> class_loader)
diff --git a/runtime/entrypoints/quick/quick_throw_entrypoints.cc b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
index c8ee99a..1520e13 100644
--- a/runtime/entrypoints/quick/quick_throw_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_throw_entrypoints.cc
@@ -62,9 +62,7 @@
 extern "C" NO_RETURN void artThrowNullPointerExceptionFromSignal(uintptr_t addr, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
-  self->NoteSignalBeingHandled();
   ThrowNullPointerExceptionFromDexPC(/* check_address */ true, addr);
-  self->NoteSignalHandlerDone();
   self->QuickDeliverException();
 }
 
@@ -95,9 +93,7 @@
 extern "C" NO_RETURN void artThrowStackOverflowFromCode(Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
-  self->NoteSignalBeingHandled();
   ThrowStackOverflowError(self);
-  self->NoteSignalHandlerDone();
   self->QuickDeliverException();
 }
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 4c3990a..3fd20a6 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -2323,48 +2323,26 @@
   return artInvokeCommon<kVirtual, true>(method_idx, this_object, self, sp);
 }
 
-// Determine target of interface dispatch. This object is known non-null. First argument
-// is there for consistency but should not be used, as some architectures overwrite it
-// in the assembly trampoline.
-extern "C" TwoWordReturn artInvokeInterfaceTrampoline(uint32_t deadbeef ATTRIBUTE_UNUSED,
+// Determine target of interface dispatch. The interface method and this object are known non-null.
+// The interface method is the method returned by the dex cache in the conflict trampoline.
+extern "C" TwoWordReturn artInvokeInterfaceTrampoline(ArtMethod* interface_method,
                                                       mirror::Object* raw_this_object,
                                                       Thread* self,
                                                       ArtMethod** sp)
     REQUIRES_SHARED(Locks::mutator_lock_) {
+  CHECK(interface_method != nullptr);
   ObjPtr<mirror::Object> this_object(raw_this_object);
   ScopedQuickEntrypointChecks sqec(self);
   StackHandleScope<1> hs(self);
   Handle<mirror::Class> cls(hs.NewHandle(this_object->GetClass()));
 
   ArtMethod* caller_method = QuickArgumentVisitor::GetCallingMethod(sp);
-
-  // Fetch the dex_method_idx of the target interface method from the caller.
-  uint32_t dex_pc = QuickArgumentVisitor::GetCallingDexPc(sp);
-
-  const DexFile::CodeItem* code_item = caller_method->GetCodeItem();
-  CHECK_LT(dex_pc, code_item->insns_size_in_code_units_);
-  const Instruction* instr = Instruction::At(&code_item->insns_[dex_pc]);
-  Instruction::Code instr_code = instr->Opcode();
-  CHECK(instr_code == Instruction::INVOKE_INTERFACE ||
-        instr_code == Instruction::INVOKE_INTERFACE_RANGE)
-      << "Unexpected call into interface trampoline: " << instr->DumpString(nullptr);
-  uint32_t dex_method_idx;
-  if (instr_code == Instruction::INVOKE_INTERFACE) {
-    dex_method_idx = instr->VRegB_35c();
-  } else {
-    CHECK_EQ(instr_code, Instruction::INVOKE_INTERFACE_RANGE);
-    dex_method_idx = instr->VRegB_3rc();
-  }
-
-  ArtMethod* interface_method = caller_method->GetDexCacheResolvedMethod(
-      dex_method_idx, kRuntimePointerSize);
-  DCHECK(interface_method != nullptr) << dex_method_idx << " " << caller_method->PrettyMethod();
   ArtMethod* method = nullptr;
   ImTable* imt = cls->GetImt(kRuntimePointerSize);
 
   if (LIKELY(interface_method->GetDexMethodIndex() != DexFile::kDexNoIndex)) {
-    // If the dex cache already resolved the interface method, look whether we have
-    // a match in the ImtConflictTable.
+    // If the interface method is already resolved, check whether we have a match in the
+    // ImtConflictTable.
     ArtMethod* conflict_method = imt->Get(ImTable::GetImtIndex(interface_method),
                                           kRuntimePointerSize);
     if (LIKELY(conflict_method->IsRuntimeMethod())) {
@@ -2389,9 +2367,26 @@
       return GetTwoWordFailureValue();  // Failure.
     }
   } else {
-    // The dex cache did not resolve the method, look it up in the dex file
-    // of the caller,
+    // The interface method is unresolved, so look it up in the dex file of the caller.
     DCHECK_EQ(interface_method, Runtime::Current()->GetResolutionMethod());
+
+    // Fetch the dex_method_idx of the target interface method from the caller.
+    uint32_t dex_method_idx;
+    uint32_t dex_pc = QuickArgumentVisitor::GetCallingDexPc(sp);
+    const DexFile::CodeItem* code_item = caller_method->GetCodeItem();
+    DCHECK_LT(dex_pc, code_item->insns_size_in_code_units_);
+    const Instruction* instr = Instruction::At(&code_item->insns_[dex_pc]);
+    Instruction::Code instr_code = instr->Opcode();
+    DCHECK(instr_code == Instruction::INVOKE_INTERFACE ||
+           instr_code == Instruction::INVOKE_INTERFACE_RANGE)
+        << "Unexpected call into interface trampoline: " << instr->DumpString(nullptr);
+    if (instr_code == Instruction::INVOKE_INTERFACE) {
+      dex_method_idx = instr->VRegB_35c();
+    } else {
+      DCHECK_EQ(instr_code, Instruction::INVOKE_INTERFACE_RANGE);
+      dex_method_idx = instr->VRegB_3rc();
+    }
+
     const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()
         ->GetDexFile();
     uint32_t shorty_len;
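The fast path above relies on the IMT conflict table populated for ambiguous interface slots. The
following is a conceptual sketch of that lookup under simplified, assumed types; MethodSketch,
ConflictEntrySketch and ImtSlotSketch are illustrative stand-ins, not the ART ImTable or
ImtConflictTable API.

#include <cstdint>
#include <vector>

struct MethodSketch { uint32_t dex_method_index; };

struct ConflictEntrySketch {
  const MethodSketch* interface_method;
  const MethodSketch* implementation;
};

struct ImtSlotSketch {
  const MethodSketch* direct_target = nullptr;   // Unambiguous slot: call directly.
  std::vector<ConflictEntrySketch> conflicts;    // Ambiguous slot: per-interface-method cache.
};

// Return the cached implementation for `interface_method`, or null to signal
// that a full interface method lookup (the slow path above) is still needed.
const MethodSketch* ResolveInterfaceCallSketch(const ImtSlotSketch& slot,
                                               const MethodSketch* interface_method) {
  if (slot.direct_target != nullptr) {
    return slot.direct_target;
  }
  for (const ConflictEntrySketch& entry : slot.conflicts) {
    if (entry.interface_method == interface_method) {
      return entry.implementation;
    }
  }
  return nullptr;
}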
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index d0687ce..55a4625 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -133,9 +133,8 @@
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_alloc_stack_top, thread_local_alloc_stack_end,
                         sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_alloc_stack_end, held_mutexes, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, held_mutexes, nested_signal_state,
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, held_mutexes, flip_function,
                         sizeof(void*) * kLockLevelCount);
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, nested_signal_state, flip_function, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, flip_function, method_verifier, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, method_verifier, thread_local_mark_stack, sizeof(void*));
     EXPECT_OFFSET_DIFF(Thread, tlsPtr_.thread_local_mark_stack, Thread, wait_mutex_, sizeof(void*),
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index f9345b6..64128cc 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -28,47 +28,6 @@
 #include "thread-inl.h"
 #include "verify_object-inl.h"
 
-// Note on nested signal support
-// -----------------------------
-//
-// Typically a signal handler should not need to deal with signals that occur within it.
-// However, when a SIGSEGV occurs that is in generated code and is not one of the
-// handled signals (implicit checks), we call a function to try to dump the stack
-// to the log.  This enhances the debugging experience but may have the side effect
-// that it may not work.  If the cause of the original SIGSEGV is a corrupted stack or other
-// memory region, the stack backtrace code may run into trouble and may either crash
-// or fail with an abort (SIGABRT).  In either case we don't want that (new) signal to
-// mask the original signal and thus prevent useful debug output from being presented.
-//
-// In order to handle this situation, before we call the stack tracer we do the following:
-//
-// 1. shutdown the fault manager so that we are talking to the real signal management
-//    functions rather than those in sigchain.
-// 2. use pthread_sigmask to allow SIGSEGV and SIGABRT signals to be delivered to the
-//    thread running the signal handler.
-// 3. set the handler for SIGSEGV and SIGABRT to a secondary signal handler.
-// 4. save the thread's state to the TLS of the current thread using 'setjmp'
-//
-// We then call the stack tracer and one of two things may happen:
-// a. it completes successfully
-// b. it crashes and a signal is raised.
-//
-// In the former case, we fall through and everything is fine.  In the latter case
-// our secondary signal handler gets called in a signal context.  This results in
-// a call to FaultManager::HandledNestedSignal(), an archirecture specific function
-// whose purpose is to call 'longjmp' on the jmp_buf saved in the TLS of the current
-// thread.  This results in a return with a non-zero value from 'setjmp'.  We detect this
-// and write something to the log to tell the user that it happened.
-//
-// Regardless of how we got there, we reach the code after the stack tracer and we
-// restore the signal states to their original values, reinstate the fault manager (thus
-// reestablishing the signal chain) and continue.
-
-// This is difficult to test with a runtime test.  To invoke the nested signal code
-// on any signal, uncomment the following line and run something that throws a
-// NullPointerException.
-// #define TEST_NESTED_SIGNAL
-
 namespace art {
 // Static fault manager object accessed by signal handler.
 FaultManager fault_manager;
@@ -83,11 +42,6 @@
   fault_manager.HandleFault(sig, info, context);
 }
 
-// Signal handler for dealing with a nested signal.
-static void art_nested_signal_handler(int sig, siginfo_t* info, void* context) {
-  fault_manager.HandleNestedSignal(sig, info, context);
-}
-
 FaultManager::FaultManager() : initialized_(false) {
   sigaction(SIGSEGV, nullptr, &oldaction_);
 }
@@ -156,122 +110,93 @@
   DCHECK(self != nullptr);
   DCHECK(Runtime::Current() != nullptr);
   DCHECK(Runtime::Current()->IsStarted());
-
-  // Now set up the nested signal handler.
-
-  // TODO: add SIGSEGV back to the nested signals when we can handle running out stack gracefully.
-  static const int handled_nested_signals[] = {SIGABRT};
-  constexpr size_t num_handled_nested_signals = arraysize(handled_nested_signals);
-
-  // Release the fault manager so that it will remove the signal chain for
-  // SIGSEGV and we call the real sigaction.
-  fault_manager.Release();
-
-  // The action for SIGSEGV should be the default handler now.
-
-  // Unblock the signals we allow so that they can be delivered in the signal handler.
-  sigset_t sigset;
-  sigemptyset(&sigset);
-  for (int signal : handled_nested_signals) {
-    sigaddset(&sigset, signal);
-  }
-  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
-
-  // If we get a signal in this code we want to invoke our nested signal
-  // handler.
-  struct sigaction action;
-  struct sigaction oldactions[num_handled_nested_signals];
-  action.sa_sigaction = art_nested_signal_handler;
-
-  // Explicitly mask out SIGSEGV and SIGABRT from the nested signal handler.  This
-  // should be the default but we definitely don't want these happening in our
-  // nested signal handler.
-  sigemptyset(&action.sa_mask);
-  for (int signal : handled_nested_signals) {
-    sigaddset(&action.sa_mask, signal);
-  }
-
-  action.sa_flags = SA_SIGINFO | SA_ONSTACK;
-#if !defined(__APPLE__) && !defined(__mips__)
-  action.sa_restorer = nullptr;
-#endif
-
-  // Catch handled signals to invoke our nested handler.
-  bool success = true;
-  for (size_t i = 0; i < num_handled_nested_signals; ++i) {
-    success = sigaction(handled_nested_signals[i], &action, &oldactions[i]) == 0;
-    if (!success) {
-      PLOG(ERROR) << "Unable to set up nested signal handler";
-      break;
+  for (const auto& handler : other_handlers_) {
+    if (handler->Action(sig, info, context)) {
+      return true;
     }
   }
-
-  if (success) {
-    // Save the current state and call the handlers.  If anything causes a signal
-    // our nested signal handler will be invoked and this will longjmp to the saved
-    // state.
-    if (setjmp(*self->GetNestedSignalState()) == 0) {
-      for (const auto& handler : other_handlers_) {
-        if (handler->Action(sig, info, context)) {
-          // Restore the signal handlers, reinit the fault manager and return.  Signal was
-          // handled.
-          for (size_t i = 0; i < num_handled_nested_signals; ++i) {
-            success = sigaction(handled_nested_signals[i], &oldactions[i], nullptr) == 0;
-            if (!success) {
-              PLOG(ERROR) << "Unable to restore signal handler";
-            }
-          }
-          fault_manager.Init();
-          return true;
-        }
-      }
-    } else {
-      LOG(ERROR) << "Nested signal detected - original signal being reported";
-    }
-
-    // Restore the signal handlers.
-    for (size_t i = 0; i < num_handled_nested_signals; ++i) {
-      success = sigaction(handled_nested_signals[i], &oldactions[i], nullptr) == 0;
-      if (!success) {
-        PLOG(ERROR) << "Unable to restore signal handler";
-      }
-    }
-  }
-
-  // Now put the fault manager back in place.
-  fault_manager.Init();
   return false;
 }
 
+class ScopedSignalUnblocker {
+ public:
+  explicit ScopedSignalUnblocker(const std::initializer_list<int>& signals) {
+    sigset_t new_mask;
+    sigemptyset(&new_mask);
+    for (int signal : signals) {
+      sigaddset(&new_mask, signal);
+    }
+    if (sigprocmask(SIG_UNBLOCK, &new_mask, &previous_mask_) != 0) {
+      PLOG(FATAL) << "failed to unblock signals";
+    }
+  }
+
+  ~ScopedSignalUnblocker() {
+    if (sigprocmask(SIG_SETMASK, &previous_mask_, nullptr) != 0) {
+      PLOG(FATAL) << "failed to restore the signal mask";
+    }
+  }
+
+ private:
+  sigset_t previous_mask_;
+};
+
+class ScopedHandlingSignalSetter {
+ public:
+  explicit ScopedHandlingSignalSetter(Thread* thread) : thread_(thread) {
+    CHECK(!thread->HandlingSignal());
+    thread_->SetHandlingSignal(true);
+  }
+
+  ~ScopedHandlingSignalSetter() {
+    CHECK(thread_->HandlingSignal());
+    thread_->SetHandlingSignal(false);
+  }
+
+ private:
+  Thread* thread_;
+};
+
 void FaultManager::HandleFault(int sig, siginfo_t* info, void* context) {
   // BE CAREFUL ALLOCATING HERE INCLUDING USING LOG(...)
   //
   // If malloc calls abort, it will be holding its lock.
   // If the handler tries to call malloc, it will deadlock.
-  VLOG(signals) << "Handling fault";
-  if (IsInGeneratedCode(info, context, true)) {
-    VLOG(signals) << "in generated code, looking for handler";
-    for (const auto& handler : generated_code_handlers_) {
-      VLOG(signals) << "invoking Action on handler " << handler;
-      if (handler->Action(sig, info, context)) {
-#ifdef TEST_NESTED_SIGNAL
-        // In test mode we want to fall through to stack trace handler
-        // on every signal (in reality this will cause a crash on the first
-        // signal).
-        break;
-#else
-        // We have handled a signal so it's time to return from the
-        // signal handler to the appropriate place.
-        return;
-#endif
-      }
-    }
 
-    // We hit a signal we didn't handle.  This might be something for which
-    // we can give more information about so call all registered handlers to see
-    // if it is.
-    if (HandleFaultByOtherHandlers(sig, info, context)) {
+  // Use a thread local field to track whether we're recursing, and fall back to the
+  // chained handlers if we are (e.g., if one of our handlers crashed).
+  Thread* thread = Thread::Current();
+
+  if (thread != nullptr && !thread->HandlingSignal()) {
+    // Unblock some signals and set thread->handling_signal_ to true,
+    // so that we can catch crashes in our signal handler.
+    ScopedHandlingSignalSetter setter(thread);
+    ScopedSignalUnblocker unblocker { SIGABRT, SIGBUS, SIGSEGV }; // NOLINT
+
+    VLOG(signals) << "Handling fault";
+
+#ifdef TEST_NESTED_SIGNAL
+    // Simulate a crash in a handler.
+    raise(SIGSEGV);
+#endif
+
+    if (IsInGeneratedCode(info, context, true)) {
+      VLOG(signals) << "in generated code, looking for handler";
+      for (const auto& handler : generated_code_handlers_) {
+        VLOG(signals) << "invoking Action on handler " << handler;
+        if (handler->Action(sig, info, context)) {
+          // We have handled a signal so it's time to return from the
+          // signal handler to the appropriate place.
+          return;
+        }
+      }
+
+      // We hit a signal we didn't handle.  This might be something we can give more
+      // information about, so call all registered handlers to see if it is.
+      if (HandleFaultByOtherHandlers(sig, info, context)) {
         return;
+      }
     }
   }
 
@@ -417,11 +342,7 @@
 
 bool JavaStackTraceHandler::Action(int sig ATTRIBUTE_UNUSED, siginfo_t* siginfo, void* context) {
   // Make sure that we are in the generated code, but we may not have a dex pc.
-#ifdef TEST_NESTED_SIGNAL
-  bool in_generated_code = true;
-#else
   bool in_generated_code = manager_->IsInGeneratedCode(siginfo, context, false);
-#endif
   if (in_generated_code) {
     LOG(ERROR) << "Dumping java stack trace for crash in generated code";
     ArtMethod* method = nullptr;
@@ -432,12 +353,6 @@
     manager_->GetMethodAndReturnPcAndSp(siginfo, context, &method, &return_pc, &sp);
     // Inside of generated code, sp[0] is the method, so sp is the frame.
     self->SetTopOfStack(reinterpret_cast<ArtMethod**>(sp));
-#ifdef TEST_NESTED_SIGNAL
-    // To test the nested signal handler we raise a signal here.  This will cause the
-    // nested signal handler to be called and perform a longjmp back to the setjmp
-    // above.
-    abort();
-#endif
     self->DumpJavaStack(LOG_STREAM(ERROR));
   }
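The rewritten HandleFault() replaces the setjmp/longjmp machinery with a per-thread "handling a
signal" flag plus temporarily unblocked SIGABRT/SIGBUS/SIGSEGV, so a crash inside a handler simply
falls through to the chained handler. A standalone sketch of that pattern, not the ART
implementation:

#include <csignal>

static thread_local bool g_handling_signal = false;

static void HandleFaultSketch(int sig, siginfo_t* info, void* context) {
  if (g_handling_signal) {
    return;  // Recursing (a handler crashed): let the next handler in the chain run.
  }
  g_handling_signal = true;

  // Unblock the signals we might trigger ourselves so a nested crash is delivered.
  sigset_t mask;
  sigset_t old_mask;
  sigemptyset(&mask);
  sigaddset(&mask, SIGABRT);
  sigaddset(&mask, SIGBUS);
  sigaddset(&mask, SIGSEGV);
  sigprocmask(SIG_UNBLOCK, &mask, &old_mask);

  // ... dispatch to the real fault handlers using sig/info/context here ...
  (void)sig; (void)info; (void)context;

  sigprocmask(SIG_SETMASK, &old_mask, nullptr);  // Restore the previous mask.
  g_handling_signal = false;
}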
 
diff --git a/runtime/fault_handler.h b/runtime/fault_handler.h
index 56e0fb7..ce59ba7 100644
--- a/runtime/fault_handler.h
+++ b/runtime/fault_handler.h
@@ -45,7 +45,6 @@
   void EnsureArtActionInFrontOfSignalChain();
 
   void HandleFault(int sig, siginfo_t* info, void* context);
-  void HandleNestedSignal(int sig, siginfo_t* info, void* context);
 
   // Added handlers are owned by the fault handler and will be freed on Shutdown().
   void AddHandler(FaultHandler* handler, bool generated_code);
diff --git a/runtime/gc/scoped_gc_critical_section.cc b/runtime/gc/scoped_gc_critical_section.cc
index b5eb979..f937d2c 100644
--- a/runtime/gc/scoped_gc_critical_section.cc
+++ b/runtime/gc/scoped_gc_critical_section.cc
@@ -29,10 +29,14 @@
                                                  CollectorType collector_type)
     : self_(self) {
   Runtime::Current()->GetHeap()->StartGC(self, cause, collector_type);
-  old_cause_ = self->StartAssertNoThreadSuspension("ScopedGCCriticalSection");
+  if (self != nullptr) {
+    old_cause_ = self->StartAssertNoThreadSuspension("ScopedGCCriticalSection");
+  }
 }
 ScopedGCCriticalSection::~ScopedGCCriticalSection() {
-  self_->EndAssertNoThreadSuspension(old_cause_);
+  if (self_ != nullptr) {
+    self_->EndAssertNoThreadSuspension(old_cause_);
+  }
   Runtime::Current()->GetHeap()->FinishGC(self_, collector::kGcTypeNone);
 }
 
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 010ef11..568f8d6 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -1259,17 +1259,18 @@
             }
           }
         }
-        ArtField** fields = dex_cache->GetResolvedFields();
+        mirror::FieldDexCacheType* fields = dex_cache->GetResolvedFields();
         if (fields != nullptr) {
-          ArtField** new_fields = fixup_adapter.ForwardObject(fields);
+          mirror::FieldDexCacheType* new_fields = fixup_adapter.ForwardObject(fields);
           if (fields != new_fields) {
             dex_cache->SetResolvedFields(new_fields);
           }
           for (size_t j = 0, num = dex_cache->NumResolvedFields(); j != num; ++j) {
-            ArtField* orig = mirror::DexCache::GetElementPtrSize(new_fields, j, pointer_size);
-            ArtField* copy = fixup_adapter.ForwardObject(orig);
-            if (orig != copy) {
-              mirror::DexCache::SetElementPtrSize(new_fields, j, copy, pointer_size);
+            mirror::FieldDexCachePair orig =
+                mirror::DexCache::GetNativePairPtrSize(new_fields, j, pointer_size);
+            mirror::FieldDexCachePair copy(fixup_adapter.ForwardObject(orig.object), orig.index);
+            if (orig.object != copy.object) {
+              mirror::DexCache::SetNativePairPtrSize(new_fields, j, copy, pointer_size);
             }
           }
         }
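The FieldDexCachePair rewrite above reflects the hash-based DexCache field storage that the image
version bump below refers to. As an assumption-labelled sketch of how such a pair array is
typically read (slot chosen from the index, hit only when the stored index matches), with
placeholder types rather than the real ArtField or DexCache:

#include <cstddef>
#include <cstdint>

struct ArtFieldStub {};  // Placeholder, not the real ArtField.

struct FieldPairSketch {
  ArtFieldStub* object = nullptr;
  uint32_t index = 0u;
};

// Assumed lookup scheme: map the field index to a slot and only report a hit
// when the stored index matches, since several indices can share one slot.
ArtFieldStub* GetResolvedFieldSketch(const FieldPairSketch* fields,
                                     size_t num_slots,
                                     uint32_t field_idx) {
  const FieldPairSketch& pair = fields[field_idx % num_slots];
  return (pair.index == field_idx) ? pair.object : nullptr;
}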
diff --git a/runtime/image.cc b/runtime/image.cc
index 88f28f3..5fbb7a6 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -25,7 +25,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '4', '0', '\0' };  // Integer.valueOf intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '4', '1', '\0' };  // hash-based DexCache fields
 
 ImageHeader::ImageHeader(uint32_t image_begin,
                          uint32_t image_size,
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 6b22af9..2589ad0 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_INTERPRETER_INTERPRETER_COMMON_H_
 
 #include "interpreter.h"
+#include "interpreter_intrinsics.h"
 
 #include <math.h>
 
@@ -104,13 +105,58 @@
 void RecordArrayElementsInTransaction(ObjPtr<mirror::Array> array, int32_t count)
     REQUIRES_SHARED(Locks::mutator_lock_);
 
-// Invokes the given method. This is part of the invocation support and is used by DoInvoke and
-// DoInvokeVirtualQuick functions.
+// Invokes the given method. This is part of the invocation support and is used by DoInvoke,
+// DoFastInvoke and DoInvokeVirtualQuick functions.
 // Returns true on success, otherwise throws an exception and returns false.
 template<bool is_range, bool do_assignability_check>
 bool DoCall(ArtMethod* called_method, Thread* self, ShadowFrame& shadow_frame,
             const Instruction* inst, uint16_t inst_data, JValue* result);
 
+// Handles streamlined non-range invoke static, direct and virtual instructions originating in
+// mterp. Access checks and instrumentation other than jit profiling are not supported, but
+// interpreter intrinsics are supported where applicable.
+// Returns true on success, otherwise throws an exception and returns false.
+template<InvokeType type>
+static inline bool DoFastInvoke(Thread* self,
+                                ShadowFrame& shadow_frame,
+                                const Instruction* inst,
+                                uint16_t inst_data,
+                                JValue* result) {
+  const uint32_t method_idx = inst->VRegB_35c();
+  const uint32_t vregC = inst->VRegC_35c();
+  ObjPtr<mirror::Object> receiver = (type == kStatic)
+      ? nullptr
+      : shadow_frame.GetVRegReference(vregC);
+  ArtMethod* sf_method = shadow_frame.GetMethod();
+  ArtMethod* const called_method = FindMethodFromCode<type, false>(
+      method_idx, &receiver, sf_method, self);
+  // The shadow frame should already be pushed, so we don't need to update it.
+  if (UNLIKELY(called_method == nullptr)) {
+    CHECK(self->IsExceptionPending());
+    result->SetJ(0);
+    return false;
+  } else if (UNLIKELY(!called_method->IsInvokable())) {
+    called_method->ThrowInvocationTimeError();
+    result->SetJ(0);
+    return false;
+  } else {
+    if (called_method->IsIntrinsic()) {
+      if (MterpHandleIntrinsic(&shadow_frame, called_method, inst, inst_data,
+                               shadow_frame.GetResultRegister())) {
+        return !self->IsExceptionPending();
+      }
+    }
+    jit::Jit* jit = Runtime::Current()->GetJit();
+    if (jit != nullptr) {
+      if (type == kVirtual) {
+        jit->InvokeVirtualOrInterface(receiver, sf_method, shadow_frame.GetDexPC(), called_method);
+      }
+      jit->AddSamples(self, sf_method, 1, /*with_backedges*/false);
+    }
+    return DoCall<false, false>(called_method, self, shadow_frame, inst, inst_data, result);
+  }
+}
+
 // Handles all invoke-XXX/range instructions except for invoke-polymorphic[/range].
 // Returns true on success, otherwise throws an exception and returns false.
 template<InvokeType type, bool is_range, bool do_access_check>
@@ -495,8 +541,9 @@
 
 // Explicitly instantiate all DoInvoke functions.
 #define EXPLICIT_DO_INVOKE_TEMPLATE_DECL(_type, _is_range, _do_check)                      \
-  template REQUIRES_SHARED(Locks::mutator_lock_)                                     \
-  bool DoInvoke<_type, _is_range, _do_check>(Thread* self, ShadowFrame& shadow_frame,      \
+  template REQUIRES_SHARED(Locks::mutator_lock_)                                           \
+  bool DoInvoke<_type, _is_range, _do_check>(Thread* self,                                 \
+                                             ShadowFrame& shadow_frame,                    \
                                              const Instruction* inst, uint16_t inst_data,  \
                                              JValue* result)
 
@@ -514,6 +561,19 @@
 #undef EXPLICIT_DO_INVOKE_ALL_TEMPLATE_DECL
 #undef EXPLICIT_DO_INVOKE_TEMPLATE_DECL
 
+// Explicitly instantiate all DoFastInvoke functions.
+#define EXPLICIT_DO_FAST_INVOKE_TEMPLATE_DECL(_type)                     \
+  template REQUIRES_SHARED(Locks::mutator_lock_)                         \
+  bool DoFastInvoke<_type>(Thread* self,                                 \
+                           ShadowFrame& shadow_frame,                    \
+                           const Instruction* inst, uint16_t inst_data,  \
+                           JValue* result)
+
+EXPLICIT_DO_FAST_INVOKE_TEMPLATE_DECL(kStatic);     // invoke-static
+EXPLICIT_DO_FAST_INVOKE_TEMPLATE_DECL(kDirect);     // invoke-direct
+EXPLICIT_DO_FAST_INVOKE_TEMPLATE_DECL(kVirtual);    // invoke-virtual
+#undef EXPLICIT_DO_FAST_INVOKE_TEMPLATE_DECL
+
 // Explicitly instantiate all DoInvokeVirtualQuick functions.
 #define EXPLICIT_DO_INVOKE_VIRTUAL_QUICK_TEMPLATE_DECL(_is_range)                    \
   template REQUIRES_SHARED(Locks::mutator_lock_)                               \
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
new file mode 100644
index 0000000..5e901cd
--- /dev/null
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "interpreter/interpreter_common.h"
+#include "interpreter/interpreter_intrinsics.h"
+
+namespace art {
+namespace interpreter {
+
+#define BINARY_SIMPLE_INTRINSIC(name, op, get, set, offset)  \
+static ALWAYS_INLINE bool name(ShadowFrame* shadow_frame,    \
+                               const Instruction* inst,      \
+                               uint16_t inst_data,           \
+                               JValue* result_register)      \
+    REQUIRES_SHARED(Locks::mutator_lock_) {                  \
+  uint32_t arg[Instruction::kMaxVarArgRegs] = {};            \
+  inst->GetVarArgs(arg, inst_data);                          \
+  result_register->set(op(shadow_frame->get(arg[0]), shadow_frame->get(arg[offset]))); \
+  return true;                                               \
+}
+
+#define UNARY_SIMPLE_INTRINSIC(name, op, get, set)           \
+static ALWAYS_INLINE bool name(ShadowFrame* shadow_frame,    \
+                               const Instruction* inst,      \
+                               uint16_t inst_data,           \
+                               JValue* result_register)      \
+    REQUIRES_SHARED(Locks::mutator_lock_) {                  \
+  uint32_t arg[Instruction::kMaxVarArgRegs] = {};            \
+  inst->GetVarArgs(arg, inst_data);                          \
+  result_register->set(op(shadow_frame->get(arg[0])));       \
+  return true;                                               \
+}
+
+// java.lang.Math.min(II)I
+BINARY_SIMPLE_INTRINSIC(MterpMathMinIntInt, std::min, GetVReg, SetI, 1);
+// java.lang.Math.min(JJ)J
+BINARY_SIMPLE_INTRINSIC(MterpMathMinLongLong, std::min, GetVRegLong, SetJ, 2);
+// java.lang.Math.max(II)I
+BINARY_SIMPLE_INTRINSIC(MterpMathMaxIntInt, std::max, GetVReg, SetI, 1);
+// java.lang.Math.max(JJ)J
+BINARY_SIMPLE_INTRINSIC(MterpMathMaxLongLong, std::max, GetVRegLong, SetJ, 2);
+// java.lang.Math.abs(I)I
+UNARY_SIMPLE_INTRINSIC(MterpMathAbsInt, std::abs, GetVReg, SetI);
+// java.lang.Math.abs(J)J
+UNARY_SIMPLE_INTRINSIC(MterpMathAbsLong, std::abs, GetVRegLong, SetJ);
+// java.lang.Math.abs(F)F
+UNARY_SIMPLE_INTRINSIC(MterpMathAbsFloat, 0x7fffffff&, GetVReg, SetI);
+// java.lang.Math.abs(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathAbsDouble, INT64_C(0x7fffffffffffffff)&, GetVRegLong, SetJ);
+// java.lang.Math.sqrt(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathSqrt, std::sqrt, GetVRegDouble, SetD);
+// java.lang.Math.ceil(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathCeil, std::ceil, GetVRegDouble, SetD);
+// java.lang.Math.floor(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathFloor, std::floor, GetVRegDouble, SetD);
+// java.lang.Math.sin(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathSin, std::sin, GetVRegDouble, SetD);
+// java.lang.Math.cos(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathCos, std::cos, GetVRegDouble, SetD);
+// java.lang.Math.tan(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathTan, std::tan, GetVRegDouble, SetD);
+// java.lang.Math.asin(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathAsin, std::asin, GetVRegDouble, SetD);
+// java.lang.Math.acos(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathAcos, std::acos, GetVRegDouble, SetD);
+// java.lang.Math.atan(D)D
+UNARY_SIMPLE_INTRINSIC(MterpMathAtan, std::atan, GetVRegDouble, SetD);
+
+#define INTRINSIC_CASE(name)                                           \
+    case Intrinsics::k##name:                                          \
+      res = Mterp##name(shadow_frame, inst, inst_data, result_register); \
+      break;
+
+bool MterpHandleIntrinsic(ShadowFrame* shadow_frame,
+                          ArtMethod* const called_method,
+                          const Instruction* inst,
+                          uint16_t inst_data,
+                          JValue* result_register)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  Intrinsics intrinsic = static_cast<Intrinsics>(called_method->GetIntrinsic());
+  bool res = false;  // Assume failure
+  switch (intrinsic) {
+    INTRINSIC_CASE(MathMinIntInt)
+    INTRINSIC_CASE(MathMinLongLong)
+    INTRINSIC_CASE(MathMaxIntInt)
+    INTRINSIC_CASE(MathMaxLongLong)
+    INTRINSIC_CASE(MathAbsInt)
+    INTRINSIC_CASE(MathAbsLong)
+    INTRINSIC_CASE(MathAbsFloat)
+    INTRINSIC_CASE(MathAbsDouble)
+    INTRINSIC_CASE(MathSqrt)
+    INTRINSIC_CASE(MathCeil)
+    INTRINSIC_CASE(MathFloor)
+    INTRINSIC_CASE(MathSin)
+    INTRINSIC_CASE(MathCos)
+    INTRINSIC_CASE(MathTan)
+    INTRINSIC_CASE(MathAsin)
+    INTRINSIC_CASE(MathAcos)
+    INTRINSIC_CASE(MathAtan)
+    default:
+      res = false;  // Punt
+      break;
+  }
+  return res;
+}
+
+}  // namespace interpreter
+}  // namespace art
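For reference, the first macro use above, BINARY_SIMPLE_INTRINSIC(MterpMathMinIntInt, std::min,
GetVReg, SetI, 1), expands to roughly the following (whitespace adjusted):

static ALWAYS_INLINE bool MterpMathMinIntInt(ShadowFrame* shadow_frame,
                                             const Instruction* inst,
                                             uint16_t inst_data,
                                             JValue* result_register)
    REQUIRES_SHARED(Locks::mutator_lock_) {
  uint32_t arg[Instruction::kMaxVarArgRegs] = {};
  inst->GetVarArgs(arg, inst_data);
  // std::min of the two int vregs named by the invoke, stored as the int result.
  result_register->SetI(std::min(shadow_frame->GetVReg(arg[0]), shadow_frame->GetVReg(arg[1])));
  return true;
}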
diff --git a/runtime/interpreter/interpreter_intrinsics.h b/runtime/interpreter/interpreter_intrinsics.h
new file mode 100644
index 0000000..ae45679
--- /dev/null
+++ b/runtime/interpreter/interpreter_intrinsics.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_INTRINSICS_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_INTRINSICS_H_
+
+#include "compiler/intrinsics_enum.h"
+#include "dex_instruction.h"
+
+namespace art {
+namespace interpreter {
+
+// Invocations of methods identified as intrinsics are routed here.  If there is
+// no interpreter implementation, return false and a normal invoke will proceed.
+bool MterpHandleIntrinsic(ShadowFrame* shadow_frame,
+                          ArtMethod* const called_method,
+                          const Instruction* inst,
+                          uint16_t inst_data,
+                          JValue* result_register);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_INTRINSICS_H_
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index 8bf094e..a53040c 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -18,6 +18,7 @@
  * Mterp entry point and support functions.
  */
 #include "interpreter/interpreter_common.h"
+#include "interpreter/interpreter_intrinsics.h"
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "mterp.h"
 #include "debugger.h"
@@ -157,7 +158,7 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JValue* result_register = shadow_frame->GetResultRegister();
   const Instruction* inst = Instruction::At(dex_pc_ptr);
-  return DoInvoke<kVirtual, false, false>(
+  return DoFastInvoke<kVirtual>(
       self, *shadow_frame, inst, inst_data, result_register);
 }
 
@@ -190,7 +191,7 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JValue* result_register = shadow_frame->GetResultRegister();
   const Instruction* inst = Instruction::At(dex_pc_ptr);
-  return DoInvoke<kDirect, false, false>(
+  return DoFastInvoke<kDirect>(
       self, *shadow_frame, inst, inst_data, result_register);
 }
 
@@ -201,7 +202,7 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JValue* result_register = shadow_frame->GetResultRegister();
   const Instruction* inst = Instruction::At(dex_pc_ptr);
-  return DoInvoke<kStatic, false, false>(
+  return DoFastInvoke<kStatic>(
       self, *shadow_frame, inst, inst_data, result_register);
 }
 
@@ -267,6 +268,18 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JValue* result_register = shadow_frame->GetResultRegister();
   const Instruction* inst = Instruction::At(dex_pc_ptr);
+  const uint32_t vregC = inst->VRegC_35c();
+  const uint32_t vtable_idx = inst->VRegB_35c();
+  ObjPtr<mirror::Object> const receiver = shadow_frame->GetVRegReference(vregC);
+  if (receiver != nullptr) {
+    ArtMethod* const called_method = receiver->GetClass()->GetEmbeddedVTableEntry(
+        vtable_idx, kRuntimePointerSize);
+    if ((called_method != nullptr) && called_method->IsIntrinsic()) {
+      if (MterpHandleIntrinsic(shadow_frame, called_method, inst, inst_data, result_register)) {
+        return !self->IsExceptionPending();
+      }
+    }
+  }
   return DoInvokeVirtualQuick<false>(
       self, *shadow_frame, inst, inst_data, result_register);
 }
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index 80554c2..eb0a9d1 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -1672,6 +1672,12 @@
   }
 }
 
+void UnstartedRuntime::UnstartedSystemIdentityHashCode(
+    Thread* self ATTRIBUTE_UNUSED, ShadowFrame* shadow_frame, JValue* result, size_t arg_offset)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  mirror::Object* obj = shadow_frame->GetVRegReference(arg_offset);
+  result->SetI((obj != nullptr) ? obj->IdentityHashCode() : 0);
+}
 
 void UnstartedRuntime::UnstartedJNIVMRuntimeNewUnpaddedArray(
     Thread* self, ArtMethod* method ATTRIBUTE_UNUSED, mirror::Object* receiver ATTRIBUTE_UNUSED,
@@ -1836,13 +1842,6 @@
   }
 }
 
-void UnstartedRuntime::UnstartedJNISystemIdentityHashCode(
-    Thread* self ATTRIBUTE_UNUSED, ArtMethod* method ATTRIBUTE_UNUSED,
-    mirror::Object* receiver ATTRIBUTE_UNUSED, uint32_t* args, JValue* result) {
-  mirror::Object* obj = reinterpret_cast<mirror::Object*>(args[0]);
-  result->SetI((obj != nullptr) ? obj->IdentityHashCode() : 0);
-}
-
 void UnstartedRuntime::UnstartedJNIByteOrderIsLittleEndian(
     Thread* self ATTRIBUTE_UNUSED, ArtMethod* method ATTRIBUTE_UNUSED,
     mirror::Object* receiver ATTRIBUTE_UNUSED, uint32_t* args ATTRIBUTE_UNUSED, JValue* result) {
diff --git a/runtime/interpreter/unstarted_runtime_list.h b/runtime/interpreter/unstarted_runtime_list.h
index e9435e4..2560a92 100644
--- a/runtime/interpreter/unstarted_runtime_list.h
+++ b/runtime/interpreter/unstarted_runtime_list.h
@@ -76,7 +76,8 @@
   V(UnsafePutObjectVolatile, "void sun.misc.Unsafe.putObjectVolatile(java.lang.Object, long, java.lang.Object)") \
   V(UnsafePutOrderedObject, "void sun.misc.Unsafe.putOrderedObject(java.lang.Object, long, java.lang.Object)") \
   V(IntegerParseInt, "int java.lang.Integer.parseInt(java.lang.String)") \
-  V(LongParseLong, "long java.lang.Long.parseLong(java.lang.String)")
+  V(LongParseLong, "long java.lang.Long.parseLong(java.lang.String)") \
+  V(SystemIdentityHashCode, "int java.lang.System.identityHashCode(java.lang.Object)")
 
 // Methods that are native.
 #define UNSTARTED_RUNTIME_JNI_LIST(V)           \
@@ -98,7 +99,6 @@
   V(ArrayCreateMultiArray, "java.lang.Object java.lang.reflect.Array.createMultiArray(java.lang.Class, int[])") \
   V(ArrayCreateObjectArray, "java.lang.Object java.lang.reflect.Array.createObjectArray(java.lang.Class, int)") \
   V(ThrowableNativeFillInStackTrace, "java.lang.Object java.lang.Throwable.nativeFillInStackTrace()") \
-  V(SystemIdentityHashCode, "int java.lang.System.identityHashCode(java.lang.Object)") \
   V(ByteOrderIsLittleEndian, "boolean java.nio.ByteOrder.isLittleEndian()") \
   V(UnsafeCompareAndSwapInt, "boolean sun.misc.Unsafe.compareAndSwapInt(java.lang.Object, long, int, int)") \
   V(UnsafeGetIntVolatile, "int sun.misc.Unsafe.getIntVolatile(java.lang.Object, long)") \
diff --git a/runtime/interpreter/unstarted_runtime_test.cc b/runtime/interpreter/unstarted_runtime_test.cc
index db222fa..56e261c 100644
--- a/runtime/interpreter/unstarted_runtime_test.cc
+++ b/runtime/interpreter/unstarted_runtime_test.cc
@@ -1367,5 +1367,26 @@
   ShadowFrame::DeleteDeoptimizedFrame(shadow_frame);
 }
 
+TEST_F(UnstartedRuntimeTest, IdentityHashCode) {
+  Thread* self = Thread::Current();
+  ScopedObjectAccess soa(self);
+  ShadowFrame* tmp = ShadowFrame::CreateDeoptimizedFrame(10, nullptr, nullptr, 0);
+
+  JValue result;
+  UnstartedSystemIdentityHashCode(self, tmp, &result, 0);
+
+  EXPECT_EQ(0, result.GetI());
+  ASSERT_FALSE(self->IsExceptionPending());
+
+  ObjPtr<mirror::String> str = mirror::String::AllocFromModifiedUtf8(self, "abd");
+  tmp->SetVRegReference(0, str.Ptr());
+  UnstartedSystemIdentityHashCode(self, tmp, &result, 0);
+  EXPECT_NE(0, result.GetI());
+  EXPECT_EQ(str->IdentityHashCode(), result.GetI());
+  ASSERT_FALSE(self->IsExceptionPending());
+
+  ShadowFrame::DeleteDeoptimizedFrame(tmp);
+}
+
 }  // namespace interpreter
 }  // namespace art
diff --git a/runtime/jdwp/jdwp.h b/runtime/jdwp/jdwp.h
index af29468..86af6d4 100644
--- a/runtime/jdwp/jdwp.h
+++ b/runtime/jdwp/jdwp.h
@@ -203,8 +203,7 @@
    */
   void PostLocationEvent(const EventLocation* pLoc, mirror::Object* thisPtr, int eventFlags,
                          const JValue* returnValue)
-      REQUIRES(!Locks::jdwp_event_list_lock_, !jdwp_token_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_, !jdwp_token_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * A field of interest has been accessed or modified. This is used for field access and field
@@ -215,8 +214,7 @@
    */
   void PostFieldEvent(const EventLocation* pLoc, ArtField* field, mirror::Object* thisPtr,
                       const JValue* fieldValue, bool is_modification)
-      REQUIRES(!Locks::jdwp_event_list_lock_, !jdwp_token_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_, !jdwp_token_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * An exception has been thrown.
@@ -225,22 +223,19 @@
    */
   void PostException(const EventLocation* pThrowLoc, mirror::Throwable* exception_object,
                      const EventLocation* pCatchLoc, mirror::Object* thisPtr)
-      REQUIRES(!Locks::jdwp_event_list_lock_, !jdwp_token_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_, !jdwp_token_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * A thread has started or stopped.
    */
   void PostThreadChange(Thread* thread, bool start)
-      REQUIRES(!Locks::jdwp_event_list_lock_, !jdwp_token_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_, !jdwp_token_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * Class has been prepared.
    */
   void PostClassPrepare(mirror::Class* klass)
-      REQUIRES(!Locks::jdwp_event_list_lock_, !jdwp_token_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_, !jdwp_token_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * The VM is about to stop.
@@ -264,7 +259,7 @@
   void SendRequest(ExpandBuf* pReq);
 
   void ResetState()
-      REQUIRES(!Locks::jdwp_event_list_lock_)
+      REQUIRES(!event_list_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   /* atomic ops to get next serial number */
@@ -273,7 +268,7 @@
 
   void Run()
       REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_, !thread_start_lock_,
-               !attach_lock_, !Locks::jdwp_event_list_lock_);
+               !attach_lock_, !event_list_lock_);
 
   /*
    * Register an event by adding it to the event list.
@@ -282,25 +277,25 @@
    * may discard its pointer after calling this.
    */
   JdwpError RegisterEvent(JdwpEvent* pEvent)
-      REQUIRES(!Locks::jdwp_event_list_lock_)
+      REQUIRES(!event_list_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * Unregister an event, given the requestId.
    */
   void UnregisterEventById(uint32_t requestId)
-      REQUIRES(!Locks::jdwp_event_list_lock_)
+      REQUIRES(!event_list_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   void UnregisterLocationEventsOnClass(ObjPtr<mirror::Class> klass)
-      REQUIRES(!Locks::jdwp_event_list_lock_)
+      REQUIRES(!event_list_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   /*
    * Unregister all events.
    */
   void UnregisterAll()
-      REQUIRES(!Locks::jdwp_event_list_lock_)
+      REQUIRES(!event_list_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
  private:
@@ -315,16 +310,16 @@
                                      ObjectId threadId)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!jdwp_token_lock_);
   void CleanupMatchList(const std::vector<JdwpEvent*>& match_list)
-      REQUIRES(Locks::jdwp_event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
   void EventFinish(ExpandBuf* pReq);
   bool FindMatchingEvents(JdwpEventKind eventKind, const ModBasket& basket,
                           std::vector<JdwpEvent*>* match_list)
-      REQUIRES(!Locks::jdwp_event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(!event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
   void FindMatchingEventsLocked(JdwpEventKind eventKind, const ModBasket& basket,
                                 std::vector<JdwpEvent*>* match_list)
-      REQUIRES(Locks::jdwp_event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
   void UnregisterEvent(JdwpEvent* pEvent)
-      REQUIRES(Locks::jdwp_event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
+      REQUIRES(event_list_lock_) REQUIRES_SHARED(Locks::mutator_lock_);
   void SendBufferedRequest(uint32_t type, const std::vector<iovec>& iov);
 
   /*
@@ -392,8 +387,9 @@
   AtomicInteger event_serial_;
 
   // Linked list of events requested by the debugger (breakpoints, class prep, etc).
-  JdwpEvent* event_list_ GUARDED_BY(Locks::jdwp_event_list_lock_);
-  size_t event_list_size_ GUARDED_BY(Locks::jdwp_event_list_lock_);  // Number of elements in event_list_.
+  Mutex event_list_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER ACQUIRED_BEFORE(Locks::breakpoint_lock_);
+  JdwpEvent* event_list_ GUARDED_BY(event_list_lock_);
+  size_t event_list_size_ GUARDED_BY(event_list_lock_);  // Number of elements in event_list_.
 
   // Used to synchronize JDWP command handler thread and event threads so only one
   // thread does JDWP stuff at a time. This prevents interleaving of command handling
@@ -414,7 +410,7 @@
   // When the runtime shuts down, it needs to stop JDWP command handler thread by closing the
   // JDWP connection. However, if the JDWP thread is processing a command, it needs to wait
   // for the command to finish so we can send its reply before closing the connection.
-  Mutex shutdown_lock_ ACQUIRED_AFTER(Locks::jdwp_event_list_lock_);
+  Mutex shutdown_lock_ ACQUIRED_AFTER(event_list_lock_);
   ConditionVariable shutdown_cond_ GUARDED_BY(shutdown_lock_);
   bool processing_request_ GUARDED_BY(shutdown_lock_);
 };
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 36d733e..96249f9 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -237,7 +237,7 @@
     /*
      * Add to list.
      */
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     if (event_list_ != nullptr) {
       pEvent->next = event_list_;
       event_list_->prev = pEvent;
@@ -256,7 +256,7 @@
   StackHandleScope<1> hs(Thread::Current());
   Handle<mirror::Class> h_klass(hs.NewHandle(klass));
   std::vector<JdwpEvent*> to_remove;
-  MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+  MutexLock mu(Thread::Current(), event_list_lock_);
   for (JdwpEvent* cur_event = event_list_; cur_event != nullptr; cur_event = cur_event->next) {
     // Fill in the to_remove list
     bool found_event = false;
@@ -356,7 +356,7 @@
 void JdwpState::UnregisterEventById(uint32_t requestId) {
   bool found = false;
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
 
     for (JdwpEvent* pEvent = event_list_; pEvent != nullptr; pEvent = pEvent->next) {
       if (pEvent->requestId == requestId) {
@@ -383,7 +383,7 @@
  * Remove all entries from the event list.
  */
 void JdwpState::UnregisterAll() {
-  MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+  MutexLock mu(Thread::Current(), event_list_lock_);
 
   JdwpEvent* pEvent = event_list_;
   while (pEvent != nullptr) {
@@ -593,7 +593,7 @@
  */
 bool JdwpState::FindMatchingEvents(JdwpEventKind event_kind, const ModBasket& basket,
                                    std::vector<JdwpEvent*>* match_list) {
-  MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+  MutexLock mu(Thread::Current(), event_list_lock_);
   match_list->reserve(event_list_size_);
   FindMatchingEventsLocked(event_kind, basket, match_list);
   return !match_list->empty();
@@ -908,7 +908,7 @@
   std::vector<JdwpEvent*> match_list;
   {
     // We use the locked version because we have multiple possible match events.
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     match_list.reserve(event_list_size_);
     if ((eventFlags & Dbg::kBreakpoint) != 0) {
       FindMatchingEventsLocked(EK_BREAKPOINT, basket, &match_list);
@@ -955,7 +955,7 @@
   }
 
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CleanupMatchList(match_list);
   }
 
@@ -1041,7 +1041,7 @@
   }
 
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CleanupMatchList(match_list);
   }
 
@@ -1103,7 +1103,7 @@
   }
 
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CleanupMatchList(match_list);
   }
 
@@ -1213,7 +1213,7 @@
   }
 
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CleanupMatchList(match_list);
   }
 
@@ -1295,7 +1295,7 @@
   }
 
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CleanupMatchList(match_list);
   }
 
diff --git a/runtime/jdwp/jdwp_main.cc b/runtime/jdwp/jdwp_main.cc
index 64ed724..e6c6068 100644
--- a/runtime/jdwp/jdwp_main.cc
+++ b/runtime/jdwp/jdwp_main.cc
@@ -227,6 +227,7 @@
       last_activity_time_ms_(0),
       request_serial_(0x10000000),
       event_serial_(0x20000000),
+      event_list_lock_("JDWP event list lock", kJdwpEventListLock),
       event_list_(nullptr),
       event_list_size_(0),
       jdwp_token_lock_("JDWP token lock"),
@@ -238,6 +239,7 @@
       shutdown_lock_("JDWP shutdown lock", kJdwpShutdownLock),
       shutdown_cond_("JDWP shutdown condition variable", shutdown_lock_),
       processing_request_(false) {
+  Locks::AddToExpectedMutexesOnWeakRefAccess(&event_list_lock_);
 }
 
 /*
@@ -330,7 +332,7 @@
 
   UnregisterAll();
   {
-    MutexLock mu(Thread::Current(), *Locks::jdwp_event_list_lock_);
+    MutexLock mu(Thread::Current(), event_list_lock_);
     CHECK(event_list_ == nullptr);
   }
 
@@ -380,6 +382,8 @@
   CHECK(netState == nullptr);
 
   ResetState();
+
+  Locks::RemoveFromExpectedMutexesOnWeakRefAccess(&event_list_lock_);
 }
 
 /*
diff --git a/runtime/jdwp/object_registry.cc b/runtime/jdwp/object_registry.cc
index bd7251b..510f5f0 100644
--- a/runtime/jdwp/object_registry.cc
+++ b/runtime/jdwp/object_registry.cc
@@ -35,6 +35,11 @@
 
 ObjectRegistry::ObjectRegistry()
     : lock_("ObjectRegistry lock", kJdwpObjectRegistryLock), next_id_(1) {
+  Locks::AddToExpectedMutexesOnWeakRefAccess(&lock_);
+}
+
+ObjectRegistry::~ObjectRegistry() {
+  Locks::RemoveFromExpectedMutexesOnWeakRefAccess(&lock_);
 }
 
 JDWP::RefTypeId ObjectRegistry::AddRefType(ObjPtr<mirror::Class> c) {
diff --git a/runtime/jdwp/object_registry.h b/runtime/jdwp/object_registry.h
index 9cacc66..8754631 100644
--- a/runtime/jdwp/object_registry.h
+++ b/runtime/jdwp/object_registry.h
@@ -62,6 +62,7 @@
 class ObjectRegistry {
  public:
   ObjectRegistry();
+  ~ObjectRegistry();
 
   JDWP::ObjectId Add(ObjPtr<mirror::Object> o)
       REQUIRES_SHARED(Locks::mutator_lock_)
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index 1ec4749..3631a9d 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -325,16 +325,12 @@
 }
 
 void Jit::StartProfileSaver(const std::string& filename,
-                            const std::vector<std::string>& code_paths,
-                            const std::string& foreign_dex_profile_path,
-                            const std::string& app_dir) {
+                            const std::vector<std::string>& code_paths) {
   if (profile_saver_options_.IsEnabled()) {
     ProfileSaver::Start(profile_saver_options_,
                         filename,
                         code_cache_.get(),
-                        code_paths,
-                        foreign_dex_profile_path,
-                        app_dir);
+                        code_paths);
   }
 }
 
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index d566799..5da1ea1 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -136,14 +136,8 @@
   // Starts the profile saver if the config options allow profile recording.
   // The profile will be stored in the specified `filename` and will contain
   // information collected from the given `code_paths` (a set of dex locations).
-  // The `foreign_dex_profile_path` is the path where the saver will put the
-  // profile markers for loaded dex files which are not owned by the application.
-  // The `app_dir` is the application directory and is used to decide which
-  // dex files belong to the application.
   void StartProfileSaver(const std::string& filename,
-                         const std::vector<std::string>& code_paths,
-                         const std::string& foreign_dex_profile_path,
-                         const std::string& app_dir);
+                         const std::vector<std::string>& code_paths);
   void StopProfileSaver();
 
   void DumpForSigQuit(std::ostream& os) REQUIRES(!lock_);
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 62acedf..e7b23dc 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -1262,12 +1262,23 @@
     for (size_t i = 0; i < info->number_of_inline_caches_; ++i) {
       std::vector<ProfileMethodInfo::ProfileClassReference> profile_classes;
       const InlineCache& cache = info->cache_[i];
+      ArtMethod* caller = info->GetMethod();
+      bool is_missing_types = false;
       for (size_t k = 0; k < InlineCache::kIndividualCacheSize; k++) {
         mirror::Class* cls = cache.classes_[k].Read();
         if (cls == nullptr) {
           break;
         }
 
+        // Check if the receiver is in the boot class path or if it's in the
+        // same class loader as the caller. If not, skip it, as there is not
+        // much we can do during AOT.
+        if (!cls->IsBootStrapClassLoaded() &&
+            caller->GetClassLoader() != cls->GetClassLoader()) {
+          is_missing_types = true;
+          continue;
+        }
+
         const DexFile* class_dex_file = nullptr;
         dex::TypeIndex type_index;
 
@@ -1284,17 +1295,20 @@
         }
         if (!type_index.IsValid()) {
           // Could be a proxy class or an array for which we couldn't find the type index.
+          is_missing_types = true;
           continue;
         }
         if (ContainsElement(dex_base_locations, class_dex_file->GetBaseLocation())) {
           // Only consider classes from the same apk (including multidex).
           profile_classes.emplace_back(/*ProfileMethodInfo::ProfileClassReference*/
               class_dex_file, type_index);
+        } else {
+          is_missing_types = true;
         }
       }
       if (!profile_classes.empty()) {
         inline_caches.emplace_back(/*ProfileMethodInfo::ProfileInlineCache*/
-            cache.dex_pc_, profile_classes);
+            cache.dex_pc_, is_missing_types, profile_classes);
       }
     }
     methods.emplace_back(/*ProfileMethodInfo*/
diff --git a/runtime/jit/profile_compilation_info.cc b/runtime/jit/profile_compilation_info.cc
index 8c5bb0d..b23a863 100644
--- a/runtime/jit/profile_compilation_info.cc
+++ b/runtime/jit/profile_compilation_info.cc
@@ -47,16 +47,19 @@
 // using the same test profile.
 static constexpr bool kDebugIgnoreChecksum = false;
 
-static constexpr uint8_t kMegamorphicEncoding = 7;
+static constexpr uint8_t kIsMissingTypesEncoding = 6;
+static constexpr uint8_t kIsMegamorphicEncoding = 7;
 
 static_assert(sizeof(InlineCache::kIndividualCacheSize) == sizeof(uint8_t),
               "InlineCache::kIndividualCacheSize does not have the expect type size");
-static_assert(InlineCache::kIndividualCacheSize < kMegamorphicEncoding,
+static_assert(InlineCache::kIndividualCacheSize < kIsMegamorphicEncoding,
+              "InlineCache::kIndividualCacheSize is larger than expected");
+static_assert(InlineCache::kIndividualCacheSize < kIsMissingTypesEncoding,
               "InlineCache::kIndividualCacheSize is larger than expected");
 
 void ProfileCompilationInfo::DexPcData::AddClass(uint16_t dex_profile_idx,
                                                  const dex::TypeIndex& type_idx) {
-  if (is_megamorphic) {
+  if (is_megamorphic || is_missing_types) {
     return;
   }
   classes.emplace(dex_profile_idx, type_idx);
@@ -207,7 +210,8 @@
  *       Classes are grouped per their dex files and the line
  *       `dex_profile_index,class_id1,class_id2...,dex_profile_index2,...` encodes the
  *       mapping from `dex_profile_index` to the set of classes `class_id1,class_id2...`
- *    M stands for megamorphic and it's encoded as the byte kMegamorphicEncoding.
+ *    M stands for megamorphic or missing types and it's encoded as either
+ *    the byte kIsMegamorphicEncoding or kIsMissingTypesEncoding.
  *    When present, there will be no class ids following.
  **/
 bool ProfileCompilationInfo::Save(int fd) {
@@ -298,10 +302,19 @@
     // Add the dex pc.
     AddUintToBuffer(buffer, dex_pc);
 
-    if (dex_pc_data.is_megamorphic) {
-      // Add the megamorphic encoding if needed and continue.
-      // If megamorphic, we don't add the rest of the classes.
-      AddUintToBuffer(buffer, kMegamorphicEncoding);
+    // Add the megamorphic/missing_types encoding if needed and continue.
+    // In either case we don't add any classes to the profile, so there's
+    // no point in continuing.
+    // TODO(calin): when types are missing there is still value in adding the
+    // rest of the classes. They can be added without bumping the profile version.
+    if (dex_pc_data.is_missing_types) {
+      DCHECK(!dex_pc_data.is_megamorphic);  // at this point the megamorphic flag should not be set.
+      DCHECK_EQ(classes.size(), 0u);
+      AddUintToBuffer(buffer, kIsMissingTypesEncoding);
+      continue;
+    } else if (dex_pc_data.is_megamorphic) {
+      DCHECK_EQ(classes.size(), 0u);
+      AddUintToBuffer(buffer, kIsMegamorphicEncoding);
       continue;
     }
 
@@ -412,11 +425,21 @@
   for (const auto& pmi_inline_cache_it : pmi.inline_caches) {
     uint16_t pmi_ic_dex_pc = pmi_inline_cache_it.first;
     const DexPcData& pmi_ic_dex_pc_data = pmi_inline_cache_it.second;
-    auto dex_pc_data_it = inline_cache_it->second.FindOrAdd(pmi_ic_dex_pc);
-    if (pmi_ic_dex_pc_data.is_megamorphic) {
-      dex_pc_data_it->second.SetMegamorphic();
+    DexPcData& dex_pc_data = inline_cache_it->second.FindOrAdd(pmi_ic_dex_pc)->second;
+    if (dex_pc_data.is_missing_types || dex_pc_data.is_megamorphic) {
+      // We are already megamorphic or we are missing types; no point in going forward.
       continue;
     }
+
+    if (pmi_ic_dex_pc_data.is_missing_types) {
+      dex_pc_data.SetIsMissingTypes();
+      continue;
+    }
+    if (pmi_ic_dex_pc_data.is_megamorphic) {
+      dex_pc_data.SetIsMegamorphic();
+      continue;
+    }
+
     for (const ClassReference& class_ref : pmi_ic_dex_pc_data.classes) {
       const DexReference& dex_ref = pmi.dex_references[class_ref.dex_profile_index];
       DexFileData* class_dex_data = GetOrAddDexFileData(
@@ -425,7 +448,7 @@
       if (class_dex_data == nullptr) {  // checksum mismatch
         return false;
       }
-      dex_pc_data_it->second.AddClass(class_dex_data->profile_index, class_ref.type_index);
+      dex_pc_data.AddClass(class_dex_data->profile_index, class_ref.type_index);
     }
   }
   return true;
@@ -441,6 +464,11 @@
   auto inline_cache_it = data->method_map.FindOrAdd(pmi.dex_method_index);
 
   for (const ProfileMethodInfo::ProfileInlineCache& cache : pmi.inline_caches) {
+    if (cache.is_missing_types) {
+      auto dex_pc_data_it = inline_cache_it->second.FindOrAdd(cache.dex_pc);
+      dex_pc_data_it->second.SetIsMissingTypes();
+      continue;
+    }
     for (const ProfileMethodInfo::ProfileClassReference& class_ref : cache.classes) {
       DexFileData* class_dex_data = GetOrAddDexFileData(
           GetProfileDexFileKey(class_ref.dex_file->GetLocation()),
@@ -449,6 +477,10 @@
         return false;
       }
       auto dex_pc_data_it = inline_cache_it->second.FindOrAdd(cache.dex_pc);
+      if (dex_pc_data_it->second.is_missing_types) {
+        // Don't bother adding classes if we are missing types.
+        break;
+      }
       dex_pc_data_it->second.AddClass(class_dex_data->profile_index, class_ref.type_index);
     }
   }
@@ -487,8 +519,12 @@
     READ_UINT(uint16_t, buffer, dex_pc, error);
     READ_UINT(uint8_t, buffer, dex_to_classes_map_size, error);
     auto dex_pc_data_it = inline_cache->FindOrAdd(dex_pc);
-    if (dex_to_classes_map_size == kMegamorphicEncoding) {
-      dex_pc_data_it->second.SetMegamorphic();
+    if (dex_to_classes_map_size == kIsMissingTypesEncoding) {
+      dex_pc_data_it->second.SetIsMissingTypes();
+      continue;
+    }
+    if (dex_to_classes_map_size == kIsMegamorphicEncoding) {
+      dex_pc_data_it->second.SetIsMegamorphic();
       continue;
     }
     for (; dex_to_classes_map_size > 0; dex_to_classes_map_size--) {
@@ -835,8 +871,10 @@
         uint16_t other_dex_pc = other_ic_it.first;
         const ClassSet& other_class_set = other_ic_it.second.classes;
         auto class_set = method_it->second.FindOrAdd(other_dex_pc);
-        if (other_ic_it.second.is_megamorphic) {
-          class_set->second.SetMegamorphic();
+        if (other_ic_it.second.is_missing_types) {
+          class_set->second.SetIsMissingTypes();
+        } else if (other_ic_it.second.is_megamorphic) {
+          class_set->second.SetIsMegamorphic();
         } else {
           for (const auto& class_it : other_class_set) {
             class_set->second.AddClass(dex_profile_index_remap.Get(
@@ -999,8 +1037,10 @@
       os << "[";
       for (const auto& inline_cache_it : method_it.second) {
         os << "{" << std::hex << inline_cache_it.first << std::dec << ":";
-        if (inline_cache_it.second.is_megamorphic) {
-          os << "M";
+        if (inline_cache_it.second.is_missing_types) {
+          os << "MT";
+        } else if (inline_cache_it.second.is_megamorphic) {
+          os << "MM";
         } else {
           for (const ClassReference& class_ref : inline_cache_it.second.classes) {
             os << "(" << static_cast<uint32_t>(class_ref.dex_profile_index)
@@ -1136,7 +1176,7 @@
   }
 
   // We can't use a simple equality test because we need to match the dex files
-  // of the inline caches which might have different profile indices.
+  // of the inline caches which might have different profile indexes.
   for (const auto& inline_cache_it : inline_caches) {
     uint16_t dex_pc = inline_cache_it.first;
     const DexPcData dex_pc_data = inline_cache_it.second;
@@ -1145,7 +1185,8 @@
       return false;
     }
     const DexPcData& other_dex_pc_data = other_it->second;
-    if (dex_pc_data.is_megamorphic != other_dex_pc_data.is_megamorphic) {
+    if (dex_pc_data.is_megamorphic != other_dex_pc_data.is_megamorphic ||
+        dex_pc_data.is_missing_types != other_dex_pc_data.is_missing_types) {
       return false;
     }
     for (const ClassReference& class_ref : dex_pc_data.classes) {
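
For illustration, a minimal stand-alone model of the on-disk encoding changed above: each inline-cache line still starts with the dex pc followed by the dex_to_classes_map_size byte, but that byte now has two reserved values, kIsMissingTypesEncoding (6) and kIsMegamorphicEncoding (7), neither of which is followed by class ids; the static_asserts above guarantee a real map size can never collide with them. The helper name and buffer layout below are simplified assumptions, not the exact ART serializer.

#include <cstdint>
#include <vector>

constexpr uint8_t kIsMissingTypesEncoding = 6;  // mirrors the constant in the hunk
constexpr uint8_t kIsMegamorphicEncoding = 7;   // mirrors the constant in the hunk

enum class CacheState { kHasClasses, kMissingTypes, kMegamorphic };

// Hypothetical helper: append one dex-pc line of the inline-cache section.
void AppendDexPcLine(std::vector<uint8_t>* buffer,
                     uint16_t dex_pc,
                     CacheState state,
                     const std::vector<uint8_t>& dex_to_classes_map) {
  buffer->push_back(static_cast<uint8_t>(dex_pc));        // dex pc, little endian
  buffer->push_back(static_cast<uint8_t>(dex_pc >> 8));
  if (state == CacheState::kMissingTypes) {
    buffer->push_back(kIsMissingTypesEncoding);           // no class ids follow
    return;
  }
  if (state == CacheState::kMegamorphic) {
    buffer->push_back(kIsMegamorphicEncoding);            // no class ids follow
    return;
  }
  buffer->push_back(static_cast<uint8_t>(dex_to_classes_map.size()));
  buffer->insert(buffer->end(), dex_to_classes_map.begin(), dex_to_classes_map.end());
}

int main() {
  std::vector<uint8_t> buffer;
  AppendDexPcLine(&buffer, /*dex_pc=*/ 3u, CacheState::kMissingTypes, {});
  // buffer == {0x03, 0x00, 0x06}: dex pc 3, then the missing-types marker.
  return buffer.size() == 3u ? 0 : 1;
}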
diff --git a/runtime/jit/profile_compilation_info.h b/runtime/jit/profile_compilation_info.h
index f089dff..6ad528c 100644
--- a/runtime/jit/profile_compilation_info.h
+++ b/runtime/jit/profile_compilation_info.h
@@ -45,10 +45,13 @@
   };
 
   struct ProfileInlineCache {
-    ProfileInlineCache(uint32_t pc, const std::vector<ProfileClassReference>& profile_classes)
-        : dex_pc(pc), classes(profile_classes) {}
+    ProfileInlineCache(uint32_t pc,
+                       bool missing_types,
+                       const std::vector<ProfileClassReference>& profile_classes)
+        : dex_pc(pc), is_missing_types(missing_types), classes(profile_classes) {}
 
     const uint32_t dex_pc;
+    const bool is_missing_types;
     const std::vector<ProfileClassReference> classes;
   };
 
@@ -134,18 +137,30 @@
 
   // Encodes the actual inline cache for a given dex pc (whether or not the receiver is
   // megamorphic and its possible types).
-  // If the receiver is megamorphic the set of classes will be empty.
+  // If the receiver is megamorphic or is missing types, the set of classes will be empty.
   struct DexPcData {
-    DexPcData() : is_megamorphic(false) {}
+    DexPcData() : is_missing_types(false), is_megamorphic(false) {}
     void AddClass(uint16_t dex_profile_idx, const dex::TypeIndex& type_idx);
-    void SetMegamorphic() {
+    void SetIsMegamorphic() {
+      if (is_missing_types) return;
       is_megamorphic = true;
       classes.clear();
     }
+    void SetIsMissingTypes() {
+      is_megamorphic = false;
+      is_missing_types = true;
+      classes.clear();
+    }
     bool operator==(const DexPcData& other) const {
-      return is_megamorphic == other.is_megamorphic && classes == other.classes;
+      return is_megamorphic == other.is_megamorphic &&
+          is_missing_types == other.is_missing_types &&
+          classes == other.classes;
     }
 
+    // Not all runtime types can be encoded in the profile. For example, if the receiver
+    // type is in a dex file which is not tracked for profiling, its type cannot be
+    // encoded. When types are missing, this field will be set to true.
+    bool is_missing_types;
     bool is_megamorphic;
     ClassSet classes;
   };
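
For illustration, the precedence the new DexPcData flags follow, modeled stand-alone: SetIsMissingTypes() overrides megamorphic and is sticky, SetIsMegamorphic() is ignored once types are missing, and either state drops any recorded classes. This is a simplified model using std::set in place of ClassSet, not the ART struct itself.

#include <cassert>
#include <cstdint>
#include <set>

struct DexPcDataModel {
  bool is_missing_types = false;
  bool is_megamorphic = false;
  std::set<uint32_t> classes;

  void AddClass(uint32_t type_idx) {
    if (is_megamorphic || is_missing_types) {
      return;  // once degraded, individual classes are no longer recorded
    }
    classes.insert(type_idx);
  }

  void SetIsMegamorphic() {
    if (is_missing_types) {
      return;  // missing types wins over megamorphic
    }
    is_megamorphic = true;
    classes.clear();
  }

  void SetIsMissingTypes() {
    is_megamorphic = false;
    is_missing_types = true;
    classes.clear();
  }
};

int main() {
  DexPcDataModel data;
  data.AddClass(1u);
  data.SetIsMegamorphic();    // drops the recorded class
  data.SetIsMissingTypes();   // overrides megamorphic
  data.SetIsMegamorphic();    // ignored: missing types is sticky
  data.AddClass(2u);          // ignored as well
  assert(data.is_missing_types && !data.is_megamorphic && data.classes.empty());
  return 0;
}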
diff --git a/runtime/jit/profile_compilation_info_test.cc b/runtime/jit/profile_compilation_info_test.cc
index 332280a..5cd8e8f 100644
--- a/runtime/jit/profile_compilation_info_test.cc
+++ b/runtime/jit/profile_compilation_info_test.cc
@@ -108,26 +108,31 @@
     for (ArtMethod* method : methods) {
       std::vector<ProfileMethodInfo::ProfileInlineCache> caches;
       // Monomorphic
-      for (uint16_t dex_pc = 0; dex_pc < 1; dex_pc++) {
+      for (uint16_t dex_pc = 0; dex_pc < 11; dex_pc++) {
         std::vector<ProfileMethodInfo::ProfileClassReference> classes;
         classes.emplace_back(method->GetDexFile(), dex::TypeIndex(0));
-        caches.emplace_back(dex_pc, classes);
+        caches.emplace_back(dex_pc, /*is_missing_types*/false, classes);
       }
       // Polymorphic
-      for (uint16_t dex_pc = 1; dex_pc < 2; dex_pc++) {
+      for (uint16_t dex_pc = 11; dex_pc < 22; dex_pc++) {
         std::vector<ProfileMethodInfo::ProfileClassReference> classes;
         for (uint16_t k = 0; k < InlineCache::kIndividualCacheSize / 2; k++) {
           classes.emplace_back(method->GetDexFile(), dex::TypeIndex(k));
         }
-        caches.emplace_back(dex_pc, classes);
+        caches.emplace_back(dex_pc, /*is_missing_types*/false, classes);
       }
       // Megamorphic
-      for (uint16_t dex_pc = 2; dex_pc < 3; dex_pc++) {
+      for (uint16_t dex_pc = 22; dex_pc < 33; dex_pc++) {
         std::vector<ProfileMethodInfo::ProfileClassReference> classes;
         for (uint16_t k = 0; k < 2 * InlineCache::kIndividualCacheSize; k++) {
           classes.emplace_back(method->GetDexFile(), dex::TypeIndex(k));
         }
-        caches.emplace_back(dex_pc, classes);
+        caches.emplace_back(dex_pc, /*is_missing_types*/false, classes);
+      }
+      // Missing types
+      for (uint16_t dex_pc = 33; dex_pc < 44; dex_pc++) {
+        std::vector<ProfileMethodInfo::ProfileClassReference> classes;
+        caches.emplace_back(dex_pc, /*is_missing_types*/true, classes);
       }
       ProfileMethodInfo pmi(method->GetDexFile(), method->GetDexMethodIndex(), caches);
       profile_methods.push_back(pmi);
@@ -148,12 +153,15 @@
     ProfileCompilationInfo::OfflineProfileMethodInfo offline_pmi;
     SafeMap<DexFile*, uint8_t> dex_map;  // dex files to profile index
     for (const auto& inline_cache : pmi.inline_caches) {
+      ProfileCompilationInfo::DexPcData& dex_pc_data =
+          offline_pmi.inline_caches.FindOrAdd(inline_cache.dex_pc)->second;
+      if (inline_cache.is_missing_types) {
+        dex_pc_data.SetIsMissingTypes();
+      }
       for (const auto& class_ref : inline_cache.classes) {
         uint8_t dex_profile_index = dex_map.FindOrAdd(const_cast<DexFile*>(class_ref.dex_file),
                                                       static_cast<uint8_t>(dex_map.size()))->second;
-        offline_pmi.inline_caches
-            .FindOrAdd(inline_cache.dex_pc)->second
-            .AddClass(dex_profile_index, class_ref.type_index);
+        dex_pc_data.AddClass(dex_profile_index, class_ref.type_index);
         if (dex_profile_index >= offline_pmi.dex_references.size()) {
           // This is a new dex.
           const std::string& dex_key = ProfileCompilationInfo::GetProfileDexFileKey(
@@ -170,18 +178,18 @@
   ProfileCompilationInfo::OfflineProfileMethodInfo GetOfflineProfileMethodInfo() {
     ProfileCompilationInfo::OfflineProfileMethodInfo pmi;
 
-    pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
-    pmi.dex_references.emplace_back("dex_location2", /* checksum */ 2);
-    pmi.dex_references.emplace_back("dex_location3", /* checksum */ 3);
+    pmi.dex_references.emplace_back("dex_location1", /* checksum */1);
+    pmi.dex_references.emplace_back("dex_location2", /* checksum */2);
+    pmi.dex_references.emplace_back("dex_location3", /* checksum */3);
 
     // Monomorphic
-    for (uint16_t dex_pc = 0; dex_pc < 10; dex_pc++) {
+    for (uint16_t dex_pc = 0; dex_pc < 11; dex_pc++) {
       ProfileCompilationInfo::DexPcData dex_pc_data;
       dex_pc_data.AddClass(0, dex::TypeIndex(0));
       pmi.inline_caches.Put(dex_pc, dex_pc_data);
     }
     // Polymorphic
-    for (uint16_t dex_pc = 10; dex_pc < 20; dex_pc++) {
+    for (uint16_t dex_pc = 11; dex_pc < 22; dex_pc++) {
       ProfileCompilationInfo::DexPcData dex_pc_data;
       dex_pc_data.AddClass(0, dex::TypeIndex(0));
       dex_pc_data.AddClass(1, dex::TypeIndex(1));
@@ -190,9 +198,15 @@
        pmi.inline_caches.Put(dex_pc, dex_pc_data);
     }
     // Megamorphic
-    for (uint16_t dex_pc = 20; dex_pc < 30; dex_pc++) {
+    for (uint16_t dex_pc = 22; dex_pc < 33; dex_pc++) {
       ProfileCompilationInfo::DexPcData dex_pc_data;
-      dex_pc_data.is_megamorphic = true;
+      dex_pc_data.SetIsMegamorphic();
+      pmi.inline_caches.Put(dex_pc, dex_pc_data);
+    }
+    // Missing types
+    for (uint16_t dex_pc = 33; dex_pc < 44; dex_pc++) {
+      ProfileCompilationInfo::DexPcData dex_pc_data;
+      dex_pc_data.SetIsMissingTypes();
       pmi.inline_caches.Put(dex_pc, dex_pc_data);
     }
 
@@ -207,7 +221,13 @@
     }
   }
 
-  // Cannot sizeof the actual arrays so hardcode the values here.
+  void SetIsMissingTypes(/*out*/ProfileCompilationInfo::OfflineProfileMethodInfo* pmi) {
+    for (auto& it : pmi->inline_caches) {
+      it.second.SetIsMissingTypes();
+    }
+  }
+
+  // Cannot sizeof the actual arrays, so hard-code the values here.
   // They should not change anyway.
   static constexpr int kProfileMagicSize = 4;
   static constexpr int kProfileVersionSize = 4;
@@ -530,6 +550,58 @@
   ASSERT_TRUE(loaded_pmi1 == pmi_extra);
 }
 
+TEST_F(ProfileCompilationInfoTest, MissingTypesInlineCaches) {
+  ScratchFile profile;
+
+  ProfileCompilationInfo saved_info;
+  ProfileCompilationInfo::OfflineProfileMethodInfo pmi = GetOfflineProfileMethodInfo();
+
+  // Add methods with inline caches.
+  for (uint16_t method_idx = 0; method_idx < 10; method_idx++) {
+    ASSERT_TRUE(AddMethod("dex_location1", /* checksum */ 1, method_idx, pmi, &saved_info));
+  }
+
+  ASSERT_TRUE(saved_info.Save(GetFd(profile)));
+  ASSERT_EQ(0, profile.GetFile()->Flush());
+
+  // Make some inline caches megamorphic and add them to the profile again.
+  ProfileCompilationInfo saved_info_extra;
+  ProfileCompilationInfo::OfflineProfileMethodInfo pmi_extra = GetOfflineProfileMethodInfo();
+  MakeMegamorphic(&pmi_extra);
+  for (uint16_t method_idx = 5; method_idx < 10; method_idx++) {
+    ASSERT_TRUE(AddMethod("dex_location1", /* checksum */ 1, method_idx, pmi, &saved_info_extra));
+  }
+
+  // Mark all inline caches with missing types and add them to the profile again.
+  // This verifies that all inline caches (megamorphic or not) are marked as missing types.
+  ProfileCompilationInfo::OfflineProfileMethodInfo missing_types = GetOfflineProfileMethodInfo();
+  SetIsMissingTypes(&missing_types);
+  for (uint16_t method_idx = 0; method_idx < 10; method_idx++) {
+    ASSERT_TRUE(AddMethod("dex_location1", /* checksum */ 1, method_idx, pmi, &saved_info_extra));
+  }
+
+  ASSERT_TRUE(profile.GetFile()->ResetOffset());
+  ASSERT_TRUE(saved_info_extra.Save(GetFd(profile)));
+  ASSERT_EQ(0, profile.GetFile()->Flush());
+
+  // Merge the profiles so that we have the same view as the file.
+  ASSERT_TRUE(saved_info.MergeWith(saved_info_extra));
+
+  // Check that we get back what we saved.
+  ProfileCompilationInfo loaded_info;
+  ASSERT_TRUE(profile.GetFile()->ResetOffset());
+  ASSERT_TRUE(loaded_info.Load(GetFd(profile)));
+
+  ASSERT_TRUE(loaded_info.Equals(saved_info));
+
+  ProfileCompilationInfo::OfflineProfileMethodInfo loaded_pmi1;
+  ASSERT_TRUE(loaded_info.GetMethod("dex_location1",
+                                    /* checksum */ 1,
+                                    /* method_idx */ 3,
+                                    &loaded_pmi1));
+  ASSERT_TRUE(loaded_pmi1 == pmi_extra);
+}
+
 TEST_F(ProfileCompilationInfoTest, SaveArtMethodsWithInlineCaches) {
   ScratchFile profile;
 
@@ -570,7 +642,7 @@
   }
 }
 
-TEST_F(ProfileCompilationInfoTest, InvalidChecksumInInlineCahce) {
+TEST_F(ProfileCompilationInfoTest, InvalidChecksumInInlineCache) {
   ScratchFile profile;
 
   ProfileCompilationInfo info;
@@ -662,7 +734,7 @@
   ProfileCompilationInfo::OfflineProfileMethodInfo pmi;
   pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
   ProfileCompilationInfo::DexPcData dex_pc_data;
-  dex_pc_data.is_megamorphic = true;
+  dex_pc_data.SetIsMegamorphic();
   pmi.inline_caches.Put(/*dex_pc*/ 0, dex_pc_data);
 
   ProfileCompilationInfo info_megamorphic;
@@ -686,4 +758,33 @@
   ASSERT_TRUE(info_no_inline_cache.Save(GetFd(profile)));
 }
 
+TEST_F(ProfileCompilationInfoTest, MissingTypesInlineCachesMerge) {
+  // Create an inline cache with missing types
+  ProfileCompilationInfo::OfflineProfileMethodInfo pmi;
+  pmi.dex_references.emplace_back("dex_location1", /* checksum */ 1);
+  ProfileCompilationInfo::DexPcData dex_pc_data;
+  dex_pc_data.SetIsMissingTypes();
+  pmi.inline_caches.Put(/*dex_pc*/ 0, dex_pc_data);
+
+  ProfileCompilationInfo info_missing_types;
+  ASSERT_TRUE(AddMethod("dex_location1",
+                        /*checksum*/ 1,
+                        /*method_idx*/ 0,
+                        pmi,
+                        &info_missing_types));
+
+  // Create a profile with no inline caches (for the same method).
+  ProfileCompilationInfo info_no_inline_cache;
+  ASSERT_TRUE(AddMethod("dex_location1",
+                        /*checksum*/ 1,
+                        /*method_idx*/ 0,
+                        &info_no_inline_cache));
+
+  // Merge the missing type cache into the empty one.
+  // Everything should be saved without errors.
+  ASSERT_TRUE(info_no_inline_cache.MergeWith(info_missing_types));
+  ScratchFile profile;
+  ASSERT_TRUE(info_no_inline_cache.Save(GetFd(profile)));
+}
+
 }  // namespace art
diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc
index 61e6c41..00487c6 100644
--- a/runtime/jit/profile_saver.cc
+++ b/runtime/jit/profile_saver.cc
@@ -39,11 +39,8 @@
 ProfileSaver::ProfileSaver(const ProfileSaverOptions& options,
                            const std::string& output_filename,
                            jit::JitCodeCache* jit_code_cache,
-                           const std::vector<std::string>& code_paths,
-                           const std::string& foreign_dex_profile_path,
-                           const std::string& app_data_dir)
+                           const std::vector<std::string>& code_paths)
     : jit_code_cache_(jit_code_cache),
-      foreign_dex_profile_path_(foreign_dex_profile_path),
       shutting_down_(false),
       last_save_number_of_methods_(0),
       last_save_number_of_classes_(0),
@@ -58,13 +55,12 @@
       total_number_of_failed_writes_(0),
       total_ms_of_sleep_(0),
       total_ns_of_work_(0),
-      total_number_of_foreign_dex_marks_(0),
       max_number_of_profile_entries_cached_(0),
       total_number_of_hot_spikes_(0),
       total_number_of_wake_ups_(0),
       options_(options) {
   DCHECK(options_.IsEnabled());
-  AddTrackedLocations(output_filename, app_data_dir, code_paths);
+  AddTrackedLocations(output_filename, code_paths);
 }
 
 void ProfileSaver::Run() {
@@ -382,9 +378,7 @@
 void ProfileSaver::Start(const ProfileSaverOptions& options,
                          const std::string& output_filename,
                          jit::JitCodeCache* jit_code_cache,
-                         const std::vector<std::string>& code_paths,
-                         const std::string& foreign_dex_profile_path,
-                         const std::string& app_data_dir) {
+                         const std::vector<std::string>& code_paths) {
   DCHECK(options.IsEnabled());
   DCHECK(Runtime::Current()->GetJit() != nullptr);
   DCHECK(!output_filename.empty());
@@ -409,7 +403,7 @@
     // apps which share the same runtime).
     DCHECK_EQ(instance_->jit_code_cache_, jit_code_cache);
     // Add the code_paths to the tracked locations.
-    instance_->AddTrackedLocations(output_filename, app_data_dir, code_paths_to_profile);
+    instance_->AddTrackedLocations(output_filename, code_paths_to_profile);
     return;
   }
 
@@ -419,9 +413,7 @@
   instance_ = new ProfileSaver(options,
                                output_filename,
                                jit_code_cache,
-                               code_paths_to_profile,
-                               foreign_dex_profile_path,
-                               app_data_dir);
+                               code_paths_to_profile);
 
   // Create a new thread which does the saving.
   CHECK_PTHREAD_CALL(
@@ -481,154 +473,16 @@
 }
 
 void ProfileSaver::AddTrackedLocations(const std::string& output_filename,
-                                       const std::string& app_data_dir,
                                        const std::vector<std::string>& code_paths) {
   auto it = tracked_dex_base_locations_.find(output_filename);
   if (it == tracked_dex_base_locations_.end()) {
     tracked_dex_base_locations_.Put(output_filename,
                                     std::set<std::string>(code_paths.begin(), code_paths.end()));
-    if (!app_data_dir.empty()) {
-      app_data_dirs_.insert(app_data_dir);
-    }
   } else {
     it->second.insert(code_paths.begin(), code_paths.end());
   }
 }
 
-// TODO(calin): This may lead to several calls to realpath.
-// Consider moving the logic to the saver thread (i.e. when notified,
-// only cache the location, and then wake up the saver thread to do the
-// comparisons with the real file paths and to create the markers).
-void ProfileSaver::NotifyDexUse(const std::string& dex_location) {
-  if (!ShouldProfileLocation(dex_location)) {
-    return;
-  }
-  std::set<std::string> app_code_paths;
-  std::string foreign_dex_profile_path;
-  std::set<std::string> app_data_dirs;
-  {
-    MutexLock mu(Thread::Current(), *Locks::profiler_lock_);
-    if (instance_ == nullptr) {
-      return;
-    }
-    // Make a copy so that we don't hold the lock while doing I/O.
-    for (const auto& it : instance_->tracked_dex_base_locations_) {
-      app_code_paths.insert(it.second.begin(), it.second.end());
-    }
-    foreign_dex_profile_path = instance_->foreign_dex_profile_path_;
-    app_data_dirs.insert(instance_->app_data_dirs_.begin(), instance_->app_data_dirs_.end());
-  }
-
-  bool mark_created = MaybeRecordDexUseInternal(dex_location,
-                                                app_code_paths,
-                                                foreign_dex_profile_path,
-                                                app_data_dirs);
-  if (mark_created) {
-    MutexLock mu(Thread::Current(), *Locks::profiler_lock_);
-    if (instance_ != nullptr) {
-      instance_->total_number_of_foreign_dex_marks_++;
-    }
-  }
-}
-
-static bool CheckContainsWithRealPath(const std::set<std::string>& paths_set,
-                                      const std::string& path_to_check) {
-  for (const auto& path : paths_set) {
-    UniqueCPtr<const char[]> real_path(realpath(path.c_str(), nullptr));
-    if (real_path == nullptr) {
-      PLOG(WARNING) << "Could not get realpath for " << path;
-      continue;
-    }
-    std::string real_path_str(real_path.get());
-    if (real_path_str == path_to_check) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// After the call, dex_location_real_path will contain the marker's name.
-static bool CreateForeignDexMarker(const std::string& foreign_dex_profile_path,
-                                   /*in-out*/ std::string* dex_location_real_path) {
-  // For foreign dex files we record a flag on disk. PackageManager will (potentially) take this
-  // into account when deciding how to optimize the loaded dex file.
-  // The expected flag name is the canonical path of the apk where '/' is substituted to '@'.
-  // (it needs to be kept in sync with
-  // frameworks/base/services/core/java/com/android/server/pm/PackageDexOptimizer.java)
-  std::replace(dex_location_real_path->begin(), dex_location_real_path->end(), '/', '@');
-  std::string flag_path = foreign_dex_profile_path + "/" + *dex_location_real_path;
-  // We use O_RDONLY as the access mode because we must supply some access
-  // mode, and there is no access mode that means 'create but do not read' the
-  // file. We will not not actually read from the file.
-  int fd = TEMP_FAILURE_RETRY(open(flag_path.c_str(),
-        O_CREAT | O_RDONLY | O_EXCL | O_CLOEXEC | O_NOFOLLOW, 0));
-  if (fd != -1) {
-    if (close(fd) != 0) {
-      PLOG(WARNING) << "Could not close file after flagging foreign dex use " << flag_path;
-    }
-    return true;
-  } else {
-    if (errno != EEXIST && errno != EACCES) {
-      // Another app could have already created the file, and selinux may not
-      // allow the read access to the file implied by the call to open.
-      PLOG(WARNING) << "Could not create foreign dex use mark " << flag_path;
-      return false;
-    }
-    return true;
-  }
-}
-
-bool ProfileSaver::MaybeRecordDexUseInternal(
-      const std::string& dex_location,
-      const std::set<std::string>& app_code_paths,
-      const std::string& foreign_dex_profile_path,
-      const std::set<std::string>& app_data_dirs) {
-  if (dex_location.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use with an empty dex location.";
-    return false;
-  }
-  if (foreign_dex_profile_path.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use without a valid profile path ";
-    return false;
-  }
-
-  if (app_code_paths.find(dex_location) != app_code_paths.end()) {
-    // The dex location belongs to the application code paths. Nothing to record.
-    return false;
-  }
-
-  if (app_data_dirs.find(dex_location) != app_data_dirs.end()) {
-    // The dex location is under the application folder. Nothing to record.
-    return false;
-  }
-
-  // Do another round of checks with the real paths.
-  // Application directory could be a symlink (e.g. /data/data instead of /data/user/0), and we
-  // don't have control over how the dex files are actually loaded (symlink or canonical path),
-
-  // Note that we could cache all the real locations in the saver (since it's an expensive
-  // operation). However we expect that app_code_paths is small (usually 1 element), and
-  // NotifyDexUse is called just a few times in the app lifetime. So we make the compromise
-  // to save some bytes of memory usage.
-
-  UniqueCPtr<const char[]> dex_location_real_path(realpath(dex_location.c_str(), nullptr));
-  if (dex_location_real_path == nullptr) {
-    PLOG(WARNING) << "Could not get realpath for " << dex_location;
-    return false;
-  }
-  std::string dex_location_real_path_str(dex_location_real_path.get());
-
-  if (CheckContainsWithRealPath(app_code_paths, dex_location_real_path_str)) {
-    return false;
-  }
-
-  if (CheckContainsWithRealPath(app_data_dirs, dex_location_real_path_str)) {
-    return false;
-  }
-
-  return CreateForeignDexMarker(foreign_dex_profile_path, &dex_location_real_path_str);
-}
-
 void ProfileSaver::DumpInstanceInfo(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::profiler_lock_);
   if (instance_ != nullptr) {
@@ -645,8 +499,6 @@
      << "ProfileSaver total_number_of_failed_writes=" << total_number_of_failed_writes_ << '\n'
      << "ProfileSaver total_ms_of_sleep=" << total_ms_of_sleep_ << '\n'
      << "ProfileSaver total_ms_of_work=" << NsToMs(total_ns_of_work_) << '\n'
-     << "ProfileSaver total_number_of_foreign_dex_marks="
-     << total_number_of_foreign_dex_marks_ << '\n'
      << "ProfileSaver max_number_profile_entries_cached="
      << max_number_of_profile_entries_cached_ << '\n'
      << "ProfileSaver total_number_of_hot_spikes=" << total_number_of_hot_spikes_ << '\n'
diff --git a/runtime/jit/profile_saver.h b/runtime/jit/profile_saver.h
index 9c5e41f..ec8342a 100644
--- a/runtime/jit/profile_saver.h
+++ b/runtime/jit/profile_saver.h
@@ -32,9 +32,7 @@
   static void Start(const ProfileSaverOptions& options,
                     const std::string& output_filename,
                     jit::JitCodeCache* jit_code_cache,
-                    const std::vector<std::string>& code_paths,
-                    const std::string& foreign_dex_profile_path,
-                    const std::string& app_data_dir)
+                    const std::vector<std::string>& code_paths)
       REQUIRES(!Locks::profiler_lock_, !wait_lock_);
 
   // Stops the profile saver thread.
@@ -46,8 +44,6 @@
   // Returns true if the profile saver is started.
   static bool IsStarted() REQUIRES(!Locks::profiler_lock_);
 
-  static void NotifyDexUse(const std::string& dex_location);
-
   // If the profile saver is running, dumps statistics to the `os`. Otherwise it does nothing.
   static void DumpInstanceInfo(std::ostream& os);
 
@@ -66,9 +62,7 @@
   ProfileSaver(const ProfileSaverOptions& options,
                const std::string& output_filename,
                jit::JitCodeCache* jit_code_cache,
-               const std::vector<std::string>& code_paths,
-               const std::string& foreign_dex_profile_path,
-               const std::string& app_data_dir);
+               const std::vector<std::string>& code_paths);
 
   // NO_THREAD_SAFETY_ANALYSIS for static function calling into member function with excludes lock.
   static void* RunProfileSaverThread(void* arg)
@@ -90,7 +84,6 @@
   bool ShuttingDown(Thread* self) REQUIRES(!Locks::profiler_lock_);
 
   void AddTrackedLocations(const std::string& output_filename,
-                           const std::string& app_data_dir,
                            const std::vector<std::string>& code_paths)
       REQUIRES(Locks::profiler_lock_);
 
@@ -102,12 +95,6 @@
   // profile_cache_ for later save.
   void FetchAndCacheResolvedClassesAndMethods();
 
-  static bool MaybeRecordDexUseInternal(
-      const std::string& dex_location,
-      const std::set<std::string>& tracked_locations,
-      const std::string& foreign_dex_profile_path,
-      const std::set<std::string>& app_data_dirs);
-
   void DumpInfo(std::ostream& os);
 
   // The only instance of the saver.
@@ -121,13 +108,6 @@
   // It maps profile locations to code paths (dex base locations).
   SafeMap<std::string, std::set<std::string>> tracked_dex_base_locations_
       GUARDED_BY(Locks::profiler_lock_);
-  // The directory were the we should store the code paths.
-  std::string foreign_dex_profile_path_;
-
-  // A list of application directories, used to infer if a loaded dex belongs
-  // to the application or not. Multiple application data directories are possible when
-  // different apps share the same runtime.
-  std::set<std::string> app_data_dirs_ GUARDED_BY(Locks::profiler_lock_);
 
   bool shutting_down_ GUARDED_BY(Locks::profiler_lock_);
   uint32_t last_save_number_of_methods_;
@@ -152,7 +132,6 @@
   uint64_t total_number_of_failed_writes_;
   uint64_t total_ms_of_sleep_;
   uint64_t total_ns_of_work_;
-  uint64_t total_number_of_foreign_dex_marks_;
   // TODO(calin): replace with an actual size.
   uint64_t max_number_of_profile_entries_cached_;
   uint64_t total_number_of_hot_spikes_;
diff --git a/runtime/linear_alloc.cc b/runtime/linear_alloc.cc
index f91b0ed..e9db9b8 100644
--- a/runtime/linear_alloc.cc
+++ b/runtime/linear_alloc.cc
@@ -33,6 +33,11 @@
   return allocator_.Alloc(size);
 }
 
+void* LinearAlloc::AllocAlign16(Thread* self, size_t size) {
+  MutexLock mu(self, lock_);
+  return allocator_.AllocAlign16(size);
+}
+
 size_t LinearAlloc::GetUsedMemory() const {
   MutexLock mu(Thread::Current(), lock_);
   return allocator_.BytesUsed();
diff --git a/runtime/linear_alloc.h b/runtime/linear_alloc.h
index df7f17d..384b2e3 100644
--- a/runtime/linear_alloc.h
+++ b/runtime/linear_alloc.h
@@ -29,6 +29,7 @@
   explicit LinearAlloc(ArenaPool* pool);
 
   void* Alloc(Thread* self, size_t size) REQUIRES(!lock_);
+  void* AllocAlign16(Thread* self, size_t size) REQUIRES(!lock_);
 
   // Realloc never frees the input pointer, it is the caller's job to do this if necessary.
   void* Realloc(Thread* self, void* ptr, size_t old_size, size_t new_size) REQUIRES(!lock_);
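
For illustration, why the new AllocAlign16() entry point is needed: the field dex-cache arrays added later in this patch store 16-byte (pointer, index) pairs that are read and written with 16-byte atomics on 64-bit targets, and those accesses require 16-byte-aligned storage while the arena's default alignment is 8 (std::atomic is generally not lock-free for such a type, which is why the patch hand-rolls the accesses). The struct below is an illustrative sketch, not a type from the patch.

#include <cstdint>

struct alignas(16) PointerIndexPair {
  uint64_t pointer_bits;  // the native pointer, reinterpreted as an integer
  uint64_t index;         // the dex index stored next to it
};

static_assert(sizeof(PointerIndexPair) == 16, "the pair occupies 16 bytes on 64-bit targets");
static_assert(alignof(PointerIndexPair) == 16,
              "16-byte atomic accesses (e.g. cmpxchg16b) need 16-byte-aligned storage");

int main() { return 0; }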
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 2f2565b..edc64f3 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -57,7 +57,8 @@
  *  |10|9|87654321098765432109876543210|
  *  |11|0| ForwardingAddress           |
  *
- * The rb bits store the read barrier state.
+ * The `r` bit stores the read barrier state.
+ * The `m` bit stores the mark state.
  */
 class LockWord {
  public:
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index d34f09c..c52b66a 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -206,10 +206,10 @@
     return status >= kStatusResolved || status == kStatusErrorResolved;
   }
 
-  // Returns true if the class was compile-time verified.
+  // Returns true if the class should be verified at runtime.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  bool IsCompileTimeVerified() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetStatus<kVerifyFlags>() >= kStatusRetryVerificationAtRuntime;
+  bool ShouldVerifyAtRuntime() REQUIRES_SHARED(Locks::mutator_lock_) {
+    return GetStatus<kVerifyFlags>() == kStatusRetryVerificationAtRuntime;
   }
 
   // Returns true if the class has been verified.
@@ -595,7 +595,7 @@
   // The size of java.lang.Class.class.
   static uint32_t ClassClassSize(PointerSize pointer_size) {
     // The number of vtable entries in java.lang.Class.
-    uint32_t vtable_entries = Object::kVTableLength + 73;
+    uint32_t vtable_entries = Object::kVTableLength + 70;
     return ComputeClassSize(true, vtable_entries, 0, 0, 4, 1, 0, pointer_size);
   }
 
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 29bf6a0..582ecb2 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -24,6 +24,7 @@
 #include "base/casts.h"
 #include "base/enums.h"
 #include "base/logging.h"
+#include "dex_file.h"
 #include "gc_root.h"
 #include "mirror/class.h"
 #include "mirror/call_site.h"
@@ -36,6 +37,15 @@
 namespace art {
 namespace mirror {
 
+template <typename T>
+inline void NativeDexCachePair<T>::Initialize(std::atomic<NativeDexCachePair<T>>* dex_cache,
+                                              PointerSize pointer_size) {
+  NativeDexCachePair<T> first_elem;
+  first_elem.object = nullptr;
+  first_elem.index = InvalidIndexForSlot(0);
+  DexCache::SetNativePairPtrSize(dex_cache, 0, first_elem, pointer_size);
+}
+
 inline uint32_t DexCache::ClassSize(PointerSize pointer_size) {
   uint32_t vtable_entries = Object::kVTableLength + 5;
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0, pointer_size);
@@ -164,20 +174,36 @@
   }
 }
 
+inline uint32_t DexCache::FieldSlotIndex(uint32_t field_idx) {
+  DCHECK_LT(field_idx, GetDexFile()->NumFieldIds());
+  const uint32_t slot_idx = field_idx % kDexCacheFieldCacheSize;
+  DCHECK_LT(slot_idx, NumResolvedFields());
+  return slot_idx;
+}
+
 inline ArtField* DexCache::GetResolvedField(uint32_t field_idx, PointerSize ptr_size) {
   DCHECK_EQ(Runtime::Current()->GetClassLinker()->GetImagePointerSize(), ptr_size);
-  DCHECK_LT(field_idx, NumResolvedFields());  // NOTE: Unchecked, i.e. not throwing AIOOB.
-  ArtField* field = GetElementPtrSize(GetResolvedFields(), field_idx, ptr_size);
-  if (field == nullptr || field->GetDeclaringClass()->IsErroneous()) {
-    return nullptr;
-  }
-  return field;
+  auto pair = GetNativePairPtrSize(GetResolvedFields(), FieldSlotIndex(field_idx), ptr_size);
+  return pair.GetObjectForIndex(field_idx);
 }
 
 inline void DexCache::SetResolvedField(uint32_t field_idx, ArtField* field, PointerSize ptr_size) {
   DCHECK_EQ(Runtime::Current()->GetClassLinker()->GetImagePointerSize(), ptr_size);
-  DCHECK_LT(field_idx, NumResolvedFields());  // NOTE: Unchecked, i.e. not throwing AIOOB.
-  SetElementPtrSize(GetResolvedFields(), field_idx, field, ptr_size);
+  DCHECK(field != nullptr);
+  FieldDexCachePair pair(field, field_idx);
+  SetNativePairPtrSize(GetResolvedFields(), FieldSlotIndex(field_idx), pair, ptr_size);
+}
+
+inline void DexCache::ClearResolvedField(uint32_t field_idx, PointerSize ptr_size) {
+  DCHECK_EQ(Runtime::Current()->GetClassLinker()->GetImagePointerSize(), ptr_size);
+  uint32_t slot_idx = FieldSlotIndex(field_idx);
+  auto* resolved_fields = GetResolvedFields();
+  // This is racy but should only be called from the single-threaded ImageWriter.
+  DCHECK(Runtime::Current()->IsAotCompiler());
+  if (GetNativePairPtrSize(resolved_fields, slot_idx, ptr_size).index == field_idx) {
+    FieldDexCachePair cleared(nullptr, FieldDexCachePair::InvalidIndexForSlot(slot_idx));
+    SetNativePairPtrSize(resolved_fields, slot_idx, cleared, ptr_size);
+  }
 }
 
 inline ArtMethod* DexCache::GetResolvedMethod(uint32_t method_idx, PointerSize ptr_size) {
@@ -225,6 +251,40 @@
   }
 }
 
+template <typename T>
+NativeDexCachePair<T> DexCache::GetNativePairPtrSize(std::atomic<NativeDexCachePair<T>>* pair_array,
+                                                     size_t idx,
+                                                     PointerSize ptr_size) {
+  if (ptr_size == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 value = AtomicLoadRelaxed16B(&array[idx]);
+    return NativeDexCachePair<T>(reinterpret_cast64<T*>(value.first),
+                                 dchecked_integral_cast<size_t>(value.second));
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 value = array[idx].load(std::memory_order_relaxed);
+    return NativeDexCachePair<T>(reinterpret_cast<T*>(value.first), value.second);
+  }
+}
+
+template <typename T>
+void DexCache::SetNativePairPtrSize(std::atomic<NativeDexCachePair<T>>* pair_array,
+                                    size_t idx,
+                                    NativeDexCachePair<T> pair,
+                                    PointerSize ptr_size) {
+  if (ptr_size == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 v(reinterpret_cast64<uint64_t>(pair.object), pair.index);
+    AtomicStoreRelease16B(&array[idx], v);
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 v(
+        dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(pair.object)),
+        dchecked_integral_cast<uint32_t>(pair.index));
+    array[idx].store(v, std::memory_order_release);
+  }
+}
+
 template <typename T,
           ReadBarrierOption kReadBarrierOption,
           typename Visitor>
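
For illustration, a stand-alone model of the hash-based field cache wired up above: a field index maps to field_idx % kDexCacheFieldCacheSize, each slot holds a (pointer, index) pair, a lookup hits only when the stored index matches, and slot 0 is initialized with index 1 so an empty slot can never be mistaken for field 0. The class below is a simplified sketch, not the DexCache implementation.

#include <array>
#include <cassert>
#include <cstdint>

struct Field {};  // stand-in for ArtField

template <typename T, size_t kSize>
class NativePairCache {
 public:
  static_assert((kSize & (kSize - 1)) == 0, "cache size must be a power of two");

  NativePairCache() {
    for (uint32_t slot = 0; slot < kSize; ++slot) {
      slots_[slot] = Pair{nullptr, InvalidIndexForSlot(slot)};
    }
  }

  T* Get(uint32_t idx) const {
    const Pair& pair = slots_[idx % kSize];
    return (pair.index == idx) ? pair.object : nullptr;  // miss or eviction -> nullptr
  }

  void Set(uint32_t idx, T* object) {
    slots_[idx % kSize] = Pair{object, idx};  // silently evicts whatever shared the slot
  }

 private:
  struct Pair {
    T* object;
    uint32_t index;
  };

  // 0 always hashes to slot 0, so slot 0 needs a different "empty" marker.
  static uint32_t InvalidIndexForSlot(uint32_t slot) { return (slot == 0) ? 1u : 0u; }

  std::array<Pair, kSize> slots_;
};

int main() {
  NativePairCache<Field, 1024> fields;   // 1024 == kDexCacheFieldCacheSize in the patch
  Field f1, f2;
  fields.Set(5u, &f1);
  fields.Set(5u + 1024u, &f2);           // same slot: the entry for index 5 is evicted
  assert(fields.Get(5u) == nullptr);     // callers must treat nullptr as "re-resolve"
  assert(fields.Get(5u + 1024u) == &f2);
  assert(fields.Get(0u) == nullptr);     // empty slot 0 does not alias field 0
  return 0;
}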
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index 1b8b391..c95d92e 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -52,8 +52,12 @@
              dex_file->NumTypeIds() != 0u ||
              dex_file->NumMethodIds() != 0u ||
              dex_file->NumFieldIds() != 0u) {
+    static_assert(ArenaAllocator::kAlignment == 8, "Expecting arena alignment of 8.");
+    DCHECK(layout.Alignment() == 8u || layout.Alignment() == 16u);
     // Zero-initialized.
-    raw_arrays = reinterpret_cast<uint8_t*>(linear_alloc->Alloc(self, layout.Size()));
+    raw_arrays = (layout.Alignment() == 16u)
+        ? reinterpret_cast<uint8_t*>(linear_alloc->AllocAlign16(self, layout.Size()))
+        : reinterpret_cast<uint8_t*>(linear_alloc->Alloc(self, layout.Size()));
   }
 
   mirror::StringDexCacheType* strings = (dex_file->NumStringIds() == 0u) ? nullptr :
@@ -62,17 +66,21 @@
       reinterpret_cast<mirror::TypeDexCacheType*>(raw_arrays + layout.TypesOffset());
   ArtMethod** methods = (dex_file->NumMethodIds() == 0u) ? nullptr :
       reinterpret_cast<ArtMethod**>(raw_arrays + layout.MethodsOffset());
-  ArtField** fields = (dex_file->NumFieldIds() == 0u) ? nullptr :
-      reinterpret_cast<ArtField**>(raw_arrays + layout.FieldsOffset());
+  mirror::FieldDexCacheType* fields = (dex_file->NumFieldIds() == 0u) ? nullptr :
+      reinterpret_cast<mirror::FieldDexCacheType*>(raw_arrays + layout.FieldsOffset());
 
-  size_t num_strings = mirror::DexCache::kDexCacheStringCacheSize;
+  size_t num_strings = kDexCacheStringCacheSize;
   if (dex_file->NumStringIds() < num_strings) {
     num_strings = dex_file->NumStringIds();
   }
-  size_t num_types = mirror::DexCache::kDexCacheTypeCacheSize;
+  size_t num_types = kDexCacheTypeCacheSize;
   if (dex_file->NumTypeIds() < num_types) {
     num_types = dex_file->NumTypeIds();
   }
+  size_t num_fields = kDexCacheFieldCacheSize;
+  if (dex_file->NumFieldIds() < num_fields) {
+    num_fields = dex_file->NumFieldIds();
+  }
 
   // Note that we allocate the method type dex caches regardless of this flag,
   // and we make sure here that they're not used by the runtime. This is in the
@@ -80,17 +88,17 @@
   //
   // If this needs to be mitigated in a production system running this code,
   // DexCache::kDexCacheMethodTypeCacheSize can be set to zero.
-  mirror::MethodTypeDexCacheType* method_types = nullptr;
+  MethodTypeDexCacheType* method_types = nullptr;
   size_t num_method_types = 0;
 
-  if (dex_file->NumProtoIds() < mirror::DexCache::kDexCacheMethodTypeCacheSize) {
+  if (dex_file->NumProtoIds() < kDexCacheMethodTypeCacheSize) {
     num_method_types = dex_file->NumProtoIds();
   } else {
-    num_method_types = mirror::DexCache::kDexCacheMethodTypeCacheSize;
+    num_method_types = kDexCacheMethodTypeCacheSize;
   }
 
   if (num_method_types > 0) {
-    method_types = reinterpret_cast<mirror::MethodTypeDexCacheType*>(
+    method_types = reinterpret_cast<MethodTypeDexCacheType*>(
         raw_arrays + layout.MethodTypesOffset());
   }
 
@@ -98,13 +106,13 @@
       ? nullptr
       : reinterpret_cast<GcRoot<mirror::CallSite>*>(raw_arrays + layout.CallSitesOffset());
 
-  DCHECK_ALIGNED(raw_arrays, alignof(mirror::StringDexCacheType)) <<
+  DCHECK_ALIGNED(raw_arrays, alignof(StringDexCacheType)) <<
                  "Expected raw_arrays to align to StringDexCacheType.";
-  DCHECK_ALIGNED(layout.StringsOffset(), alignof(mirror::StringDexCacheType)) <<
+  DCHECK_ALIGNED(layout.StringsOffset(), alignof(StringDexCacheType)) <<
                  "Expected StringsOffset() to align to StringDexCacheType.";
-  DCHECK_ALIGNED(strings, alignof(mirror::StringDexCacheType)) <<
+  DCHECK_ALIGNED(strings, alignof(StringDexCacheType)) <<
                  "Expected strings to align to StringDexCacheType.";
-  static_assert(alignof(mirror::StringDexCacheType) == 8u,
+  static_assert(alignof(StringDexCacheType) == 8u,
                 "Expected StringDexCacheType to have align of 8.");
   if (kIsDebugBuild) {
     // Sanity check to make sure all the dex cache arrays are empty. b/28992179
@@ -117,10 +125,11 @@
       CHECK(types[i].load(std::memory_order_relaxed).object.IsNull());
     }
     for (size_t i = 0; i < dex_file->NumMethodIds(); ++i) {
-      CHECK(mirror::DexCache::GetElementPtrSize(methods, i, image_pointer_size) == nullptr);
+      CHECK(GetElementPtrSize(methods, i, image_pointer_size) == nullptr);
     }
-    for (size_t i = 0; i < dex_file->NumFieldIds(); ++i) {
-      CHECK(mirror::DexCache::GetElementPtrSize(fields, i, image_pointer_size) == nullptr);
+    for (size_t i = 0; i < num_fields; ++i) {
+      CHECK_EQ(GetNativePairPtrSize(fields, i, image_pointer_size).index, 0u);
+      CHECK(GetNativePairPtrSize(fields, i, image_pointer_size).object == nullptr);
     }
     for (size_t i = 0; i < num_method_types; ++i) {
       CHECK_EQ(method_types[i].load(std::memory_order_relaxed).index, 0u);
@@ -136,6 +145,9 @@
   if (types != nullptr) {
     mirror::TypeDexCachePair::Initialize(types);
   }
+  if (fields != nullptr) {
+    mirror::FieldDexCachePair::Initialize(fields, image_pointer_size);
+  }
   if (method_types != nullptr) {
     mirror::MethodTypeDexCachePair::Initialize(method_types);
   }
@@ -148,7 +160,7 @@
                   methods,
                   dex_file->NumMethodIds(),
                   fields,
-                  dex_file->NumFieldIds(),
+                  num_fields,
                   method_types,
                   num_method_types,
                   call_sites,
@@ -164,7 +176,7 @@
                     uint32_t num_resolved_types,
                     ArtMethod** resolved_methods,
                     uint32_t num_resolved_methods,
-                    ArtField** resolved_fields,
+                    FieldDexCacheType* resolved_fields,
                     uint32_t num_resolved_fields,
                     MethodTypeDexCacheType* resolved_method_types,
                     uint32_t num_resolved_method_types,
@@ -218,5 +230,23 @@
   SetFieldObject<false>(OFFSET_OF_OBJECT_MEMBER(DexCache, location_), location);
 }
 
+#if !defined(__aarch64__) && !defined(__x86_64__)
+static pthread_mutex_t dex_cache_slow_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+DexCache::ConversionPair64 DexCache::AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  DexCache::ConversionPair64 value = *reinterpret_cast<ConversionPair64*>(target);
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+  return value;
+}
+
+void DexCache::AtomicStoreRelease16B(std::atomic<ConversionPair64>* target,
+                                     ConversionPair64 value) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  *reinterpret_cast<ConversionPair64*>(target) = value;
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+}
+#endif
+
 }  // namespace mirror
 }  // namespace art
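
For illustration, the fallback strategy used above for targets without 16-byte atomics, sketched with std::mutex instead of the pthread mutex: a single process-wide lock makes whole-pair loads and stores appear atomic. This is a simplified model of the slow path, not the ART code.

#include <cstdint>
#include <mutex>

struct ConversionPair64 {
  uint64_t first;   // the native pointer, reinterpreted as an integer
  uint64_t second;  // the dex index stored next to it
};

namespace {
std::mutex g_pair_mutex;  // one process-wide lock, like dex_cache_slow_atomic_mutex above
}  // namespace

ConversionPair64 LoadPair16B(const ConversionPair64* target) {
  std::lock_guard<std::mutex> lock(g_pair_mutex);
  return *target;  // both halves are observed together
}

void StorePair16B(ConversionPair64* target, ConversionPair64 value) {
  std::lock_guard<std::mutex> lock(g_pair_mutex);
  *target = value;  // both halves are updated together
}

int main() {
  ConversionPair64 slot{0u, 0u};
  StorePair16B(&slot, ConversionPair64{0xdeadbeefu, 42u});
  return LoadPair16B(&slot).second == 42u ? 0 : 1;
}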
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 0579198..35707ef 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -91,12 +91,44 @@
   }
 };
 
+template <typename T> struct PACKED(2 * __SIZEOF_POINTER__) NativeDexCachePair {
+  T* object;
+  size_t index;
+  // This is similar to DexCachePair except that we're storing a native pointer
+  // instead of a GC root. See DexCachePair for the details.
+  NativeDexCachePair(T* object, uint32_t index)
+      : object(object),
+        index(index) {}
+  NativeDexCachePair() : object(nullptr), index(0u) { }
+  NativeDexCachePair(const NativeDexCachePair<T>&) = default;
+  NativeDexCachePair& operator=(const NativeDexCachePair<T>&) = default;
+
+  static void Initialize(std::atomic<NativeDexCachePair<T>>* dex_cache, PointerSize pointer_size);
+
+  static uint32_t InvalidIndexForSlot(uint32_t slot) {
+    // Since the cache size is a power of two, 0 will always map to slot 0.
+    // Use 1 for slot 0 and 0 for all other slots.
+    return (slot == 0) ? 1u : 0u;
+  }
+
+  T* GetObjectForIndex(uint32_t idx) REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (idx != index) {
+      return nullptr;
+    }
+    DCHECK(object != nullptr);
+    return object;
+  }
+};
+
 using TypeDexCachePair = DexCachePair<Class>;
 using TypeDexCacheType = std::atomic<TypeDexCachePair>;
 
 using StringDexCachePair = DexCachePair<String>;
 using StringDexCacheType = std::atomic<StringDexCachePair>;
 
+using FieldDexCachePair = NativeDexCachePair<ArtField>;
+using FieldDexCacheType = std::atomic<FieldDexCachePair>;
+
 using MethodTypeDexCachePair = DexCachePair<MethodType>;
 using MethodTypeDexCacheType = std::atomic<MethodTypeDexCachePair>;
 
@@ -116,6 +148,11 @@
   static_assert(IsPowerOfTwo(kDexCacheStringCacheSize),
                 "String dex cache size is not a power of 2.");
 
+  // Size of field dex cache. Needs to be a power of 2 for entrypoint assumptions to hold.
+  static constexpr size_t kDexCacheFieldCacheSize = 1024;
+  static_assert(IsPowerOfTwo(kDexCacheFieldCacheSize),
+                "Field dex cache size is not a power of 2.");
+
   // Size of method type dex cache. Needs to be a power of 2 for entrypoint assumptions
   // to hold.
   static constexpr size_t kDexCacheMethodTypeCacheSize = 1024;
@@ -130,6 +167,10 @@
     return kDexCacheStringCacheSize;
   }
 
+  static constexpr size_t StaticArtFieldSize() {
+    return kDexCacheFieldCacheSize;
+  }
+
   static constexpr size_t StaticMethodTypeSize() {
     return kDexCacheMethodTypeCacheSize;
   }
@@ -255,6 +296,8 @@
   // Pointer sized variant, used for patching.
   ALWAYS_INLINE void SetResolvedField(uint32_t idx, ArtField* field, PointerSize ptr_size)
       REQUIRES_SHARED(Locks::mutator_lock_);
+  ALWAYS_INLINE void ClearResolvedField(uint32_t idx, PointerSize ptr_size)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   MethodType* GetResolvedMethodType(uint32_t proto_idx) REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -299,11 +342,11 @@
     SetFieldPtr<false>(ResolvedMethodsOffset(), resolved_methods);
   }
 
-  ArtField** GetResolvedFields() ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetFieldPtr<ArtField**>(ResolvedFieldsOffset());
+  FieldDexCacheType* GetResolvedFields() ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) {
+    return GetFieldPtr<FieldDexCacheType*>(ResolvedFieldsOffset());
   }
 
-  void SetResolvedFields(ArtField** resolved_fields)
+  void SetResolvedFields(FieldDexCacheType* resolved_fields)
       ALWAYS_INLINE
       REQUIRES_SHARED(Locks::mutator_lock_) {
     SetFieldPtr<false>(ResolvedFieldsOffset(), resolved_fields);
@@ -376,6 +419,17 @@
   template <typename PtrType>
   static void SetElementPtrSize(PtrType* ptr_array, size_t idx, PtrType ptr, PointerSize ptr_size);
 
+  template <typename T>
+  static NativeDexCachePair<T> GetNativePairPtrSize(std::atomic<NativeDexCachePair<T>>* pair_array,
+                                                    size_t idx,
+                                                    PointerSize ptr_size);
+
+  template <typename T>
+  static void SetNativePairPtrSize(std::atomic<NativeDexCachePair<T>>* pair_array,
+                                   size_t idx,
+                                   NativeDexCachePair<T> pair,
+                                   PointerSize ptr_size);
+
  private:
   void Init(const DexFile* dex_file,
             ObjPtr<String> location,
@@ -385,7 +439,7 @@
             uint32_t num_resolved_types,
             ArtMethod** resolved_methods,
             uint32_t num_resolved_methods,
-            ArtField** resolved_fields,
+            FieldDexCacheType* resolved_fields,
             uint32_t num_resolved_fields,
             MethodTypeDexCacheType* resolved_method_types,
             uint32_t num_resolved_method_types,
@@ -394,8 +448,22 @@
             PointerSize pointer_size)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations,
+  // so we use a custom pair class for loading and storing the NativeDexCachePair<>.
+  template <typename IntType>
+  struct PACKED(2 * sizeof(IntType)) ConversionPair {
+    ConversionPair(IntType f, IntType s) : first(f), second(s) { }
+    ConversionPair(const ConversionPair&) = default;
+    ConversionPair& operator=(const ConversionPair&) = default;
+    IntType first;
+    IntType second;
+  };
+  using ConversionPair32 = ConversionPair<uint32_t>;
+  using ConversionPair64 = ConversionPair<uint64_t>;
+
   uint32_t StringSlotIndex(dex::StringIndex string_idx) REQUIRES_SHARED(Locks::mutator_lock_);
   uint32_t TypeSlotIndex(dex::TypeIndex type_idx) REQUIRES_SHARED(Locks::mutator_lock_);
+  uint32_t FieldSlotIndex(uint32_t field_idx) REQUIRES_SHARED(Locks::mutator_lock_);
   uint32_t MethodTypeSlotIndex(uint32_t proto_idx) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Visit instance fields of the dex cache as well as its associated arrays.
@@ -406,12 +474,55 @@
   void VisitReferences(ObjPtr<Class> klass, const Visitor& visitor)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
 
+  // Due to lack of 16-byte atomics support, we use hand-crafted routines.
+#if  defined(__aarch64__)
+  // 16-byte atomics are supported on aarch64.
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    return target->load(std::memory_order_relaxed);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    target->store(value, std::memory_order_release);
+  }
+#elif defined(__x86_64__)
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    uint64_t first, second;
+    __asm__ __volatile__(
+        "lock cmpxchg16b (%2)"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
+        : "cc");
+    return ConversionPair64(first, second);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    uint64_t first, second;
+    __asm__ __volatile__ (
+        "movq (%2), %%rax\n\t"
+        "movq 8(%2), %%rdx\n\t"
+        "1:\n\t"
+        "lock cmpxchg16b (%2)\n\t"
+        "jnz 1b"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "b"(value.first), "c"(value.second)
+        : "cc");
+  }
+#else
+  static ConversionPair64 AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target);
+  static void AtomicStoreRelease16B(std::atomic<ConversionPair64>* target, ConversionPair64 value);
+#endif
+
   HeapReference<Object> dex_;
   HeapReference<String> location_;
   uint64_t dex_file_;               // const DexFile*
   uint64_t resolved_call_sites_;    // GcRoot<CallSite>* array with num_resolved_call_sites_
                                     // elements.
-  uint64_t resolved_fields_;        // ArtField*, array with num_resolved_fields_ elements.
+  uint64_t resolved_fields_;        // std::atomic<FieldDexCachePair>*, array with
+                                    // num_resolved_fields_ elements.
   uint64_t resolved_method_types_;  // std::atomic<MethodTypeDexCachePair>* array with
                                     // num_resolved_method_types_ elements.
   uint64_t resolved_methods_;       // ArtMethod*, array with num_resolved_methods_ elements.
diff --git a/runtime/mirror/dex_cache_test.cc b/runtime/mirror/dex_cache_test.cc
index ef0aaaa..71a47f6 100644
--- a/runtime/mirror/dex_cache_test.cc
+++ b/runtime/mirror/dex_cache_test.cc
@@ -54,7 +54,8 @@
   EXPECT_TRUE(dex_cache->StaticTypeSize() == dex_cache->NumResolvedTypes()
       || java_lang_dex_file_->NumTypeIds() == dex_cache->NumResolvedTypes());
   EXPECT_EQ(java_lang_dex_file_->NumMethodIds(), dex_cache->NumResolvedMethods());
-  EXPECT_EQ(java_lang_dex_file_->NumFieldIds(),  dex_cache->NumResolvedFields());
+  EXPECT_TRUE(dex_cache->StaticArtFieldSize() == dex_cache->NumResolvedFields()
+      || java_lang_dex_file_->NumFieldIds() == dex_cache->NumResolvedFields());
   EXPECT_TRUE(dex_cache->StaticMethodTypeSize() == dex_cache->NumResolvedMethodTypes()
       || java_lang_dex_file_->NumProtoIds() == dex_cache->NumResolvedMethodTypes());
 }
diff --git a/runtime/mirror/field.cc b/runtime/mirror/field.cc
index f6b6489..54034c2 100644
--- a/runtime/mirror/field.cc
+++ b/runtime/mirror/field.cc
@@ -68,8 +68,16 @@
     }
   }
   mirror::DexCache* const dex_cache = declaring_class->GetDexCache();
-  ArtField* const art_field = dex_cache->GetResolvedField(GetDexFieldIndex(), kRuntimePointerSize);
-  CHECK(art_field != nullptr);
+  ArtField* art_field = dex_cache->GetResolvedField(GetDexFieldIndex(), kRuntimePointerSize);
+  if (UNLIKELY(art_field == nullptr)) {
+    if (IsStatic()) {
+      art_field = declaring_class->FindDeclaredStaticField(dex_cache, GetDexFieldIndex());
+    } else {
+      art_field = declaring_class->FindInstanceField(dex_cache, GetDexFieldIndex());
+    }
+    CHECK(art_field != nullptr);
+    dex_cache->SetResolvedField(GetDexFieldIndex(), art_field, kRuntimePointerSize);
+  }
   CHECK_EQ(declaring_class, art_field->GetDeclaringClass());
   return art_field;
 }
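
Because the resolved-fields array is now a fixed-size, hash-based cache, an entry can be displaced by a conflicting field index, so GetArtField() above has to tolerate a miss: fall back to a slow search on the declaring class and write the result back. A generic sketch of that get-or-resolve-and-cache pattern, with an illustrative map and callback rather than ART's types:

#include <functional>
#include <unordered_map>

template <typename Key, typename Value>
Value GetOrResolve(std::unordered_map<Key, Value>& cache,
                   const Key& key,
                   const std::function<Value(const Key&)>& slow_resolve) {
  auto it = cache.find(key);
  if (it != cache.end()) {
    return it->second;              // fast path: already cached
  }
  Value value = slow_resolve(key);  // slow path, e.g. a search over declared fields
  cache.emplace(key, value);        // repopulate the cache for later callers
  return value;
}
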
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index efc42fd..d81c13d 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -444,6 +444,7 @@
   if (!kPreloadDexCachesCollectStats) {
     return;
   }
+  // TODO: Update for hash-based DexCache arrays.
   ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
   Thread* const self = Thread::Current();
   for (const DexFile* dex_file : class_linker->GetBootClassPath()) {
@@ -463,7 +464,7 @@
       }
     }
     for (size_t j = 0; j < dex_cache->NumResolvedFields(); j++) {
-      ArtField* field = class_linker->GetResolvedField(j, dex_cache);
+      ArtField* field = dex_cache->GetResolvedField(j, class_linker->GetImagePointerSize());
       if (field != nullptr) {
         filled->num_fields++;
       }
@@ -580,9 +581,7 @@
 static void VMRuntime_registerAppInfo(JNIEnv* env,
                                       jclass clazz ATTRIBUTE_UNUSED,
                                       jstring profile_file,
-                                      jstring app_dir,
-                                      jobjectArray code_paths,
-                                      jstring foreign_dex_profile_path) {
+                                      jobjectArray code_paths) {
   std::vector<std::string> code_paths_vec;
   int code_paths_length = env->GetArrayLength(code_paths);
   for (int i = 0; i < code_paths_length; i++) {
@@ -596,22 +595,7 @@
   std::string profile_file_str(raw_profile_file);
   env->ReleaseStringUTFChars(profile_file, raw_profile_file);
 
-  std::string foreign_dex_profile_path_str = "";
-  if (foreign_dex_profile_path != nullptr) {
-    const char* raw_foreign_dex_profile_path =
-        env->GetStringUTFChars(foreign_dex_profile_path, nullptr);
-    foreign_dex_profile_path_str.assign(raw_foreign_dex_profile_path);
-    env->ReleaseStringUTFChars(foreign_dex_profile_path, raw_foreign_dex_profile_path);
-  }
-
-  const char* raw_app_dir = env->GetStringUTFChars(app_dir, nullptr);
-  std::string app_dir_str(raw_app_dir);
-  env->ReleaseStringUTFChars(app_dir, raw_app_dir);
-
-  Runtime::Current()->RegisterAppInfo(code_paths_vec,
-                                      profile_file_str,
-                                      foreign_dex_profile_path_str,
-                                      app_dir_str);
+  Runtime::Current()->RegisterAppInfo(code_paths_vec, profile_file_str);
 }
 
 static jboolean VMRuntime_isBootClassPathOnDisk(JNIEnv* env, jclass, jstring java_instruction_set) {
@@ -674,8 +658,7 @@
   FAST_NATIVE_METHOD(VMRuntime, is64Bit, "()Z"),
   FAST_NATIVE_METHOD(VMRuntime, isCheckJniEnabled, "()Z"),
   NATIVE_METHOD(VMRuntime, preloadDexCaches, "()V"),
-  NATIVE_METHOD(VMRuntime, registerAppInfo,
-                "(Ljava/lang/String;Ljava/lang/String;[Ljava/lang/String;Ljava/lang/String;)V"),
+  NATIVE_METHOD(VMRuntime, registerAppInfo, "(Ljava/lang/String;[Ljava/lang/String;)V"),
   NATIVE_METHOD(VMRuntime, isBootClassPathOnDisk, "(Ljava/lang/String;)Z"),
   NATIVE_METHOD(VMRuntime, getCurrentInstructionSet, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, didPruneDalvikCache, "()Z"),
diff --git a/runtime/oat.h b/runtime/oat.h
index 1544121..df43107 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '1', '1', '4', '\0' };  // hash-based DexCache types.
+  static constexpr uint8_t kOatVersion[] = { '1', '1', '5', '\0' };  // hash-based DexCache fields.
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/oat_file_manager.cc b/runtime/oat_file_manager.cc
index 7079614..d04dbbe 100644
--- a/runtime/oat_file_manager.cc
+++ b/runtime/oat_file_manager.cc
@@ -23,6 +23,7 @@
 #include "android-base/stringprintf.h"
 
 #include "art_field-inl.h"
+#include "base/bit_vector-inl.h"
 #include "base/logging.h"
 #include "base/stl_util.h"
 #include "base/systrace.h"
@@ -145,13 +146,52 @@
   return oat_files;
 }
 
+class TypeIndexInfo {
+ public:
+  explicit TypeIndexInfo(const DexFile* dex_file)
+      : type_indexes_(GenerateTypeIndexes(dex_file)),
+        iter_(type_indexes_.Indexes().begin()),
+        end_(type_indexes_.Indexes().end()) { }
+
+  BitVector& GetTypeIndexes() {
+    return type_indexes_;
+  }
+  BitVector::IndexIterator& GetIterator() {
+    return iter_;
+  }
+  BitVector::IndexIterator& GetIteratorEnd() {
+    return end_;
+  }
+  void AdvanceIterator() {
+    iter_++;
+  }
+
+ private:
+  static BitVector GenerateTypeIndexes(const DexFile* dex_file) {
+    BitVector type_indexes(/*start_bits*/0, /*expandable*/true, Allocator::GetMallocAllocator());
+    for (uint16_t i = 0; i < dex_file->NumClassDefs(); ++i) {
+      const DexFile::ClassDef& class_def = dex_file->GetClassDef(i);
+      uint16_t type_idx = class_def.class_idx_.index_;
+      type_indexes.SetBit(type_idx);
+    }
+    return type_indexes;
+  }
+
+  // BitVector with bits set for the type indexes of all classes in the input dex file.
+  BitVector type_indexes_;
+  BitVector::IndexIterator iter_;
+  BitVector::IndexIterator end_;
+};
+
 class DexFileAndClassPair : ValueObject {
  public:
-  DexFileAndClassPair(const DexFile* dex_file, size_t current_class_index, bool from_loaded_oat)
-     : cached_descriptor_(GetClassDescriptor(dex_file, current_class_index)),
+  DexFileAndClassPair(const DexFile* dex_file, TypeIndexInfo* type_info, bool from_loaded_oat)
+     : type_info_(type_info),
        dex_file_(dex_file),
-       current_class_index_(current_class_index),
-       from_loaded_oat_(from_loaded_oat) {}
+       cached_descriptor_(dex_file_->StringByTypeIdx(dex::TypeIndex(*type_info->GetIterator()))),
+       from_loaded_oat_(from_loaded_oat) {
+    type_info_->AdvanceIterator();
+  }
 
   DexFileAndClassPair(const DexFileAndClassPair& rhs) = default;
 
@@ -172,16 +212,12 @@
   }
 
   bool DexFileHasMoreClasses() const {
-    return current_class_index_ + 1 < dex_file_->NumClassDefs();
+    return type_info_->GetIterator() != type_info_->GetIteratorEnd();
   }
 
   void Next() {
-    ++current_class_index_;
-    cached_descriptor_ = GetClassDescriptor(dex_file_, current_class_index_);
-  }
-
-  size_t GetCurrentClassIndex() const {
-    return current_class_index_;
+    cached_descriptor_ = dex_file_->StringByTypeIdx(dex::TypeIndex(*type_info_->GetIterator()));
+    type_info_->AdvanceIterator();
   }
 
   bool FromLoadedOat() const {
@@ -193,42 +229,36 @@
   }
 
  private:
-  static const char* GetClassDescriptor(const DexFile* dex_file, size_t index) {
-    DCHECK(IsUint<16>(index));
-    const DexFile::ClassDef& class_def = dex_file->GetClassDef(static_cast<uint16_t>(index));
-    return dex_file->StringByTypeIdx(class_def.class_idx_);
-  }
-
-  const char* cached_descriptor_;
+  TypeIndexInfo* type_info_;
   const DexFile* dex_file_;
-  size_t current_class_index_;
+  const char* cached_descriptor_;
   bool from_loaded_oat_;  // We only need to compare mismatches between what we load now
                           // and what was loaded before. Any old duplicates must have been
                           // OK, and any new "internal" duplicates are as well (they must
                           // be from multidex, which resolves correctly).
 };
 
-static void AddDexFilesFromOat(const OatFile* oat_file,
-                               bool already_loaded,
-                               /*out*/std::priority_queue<DexFileAndClassPair>* heap,
-                               std::vector<std::unique_ptr<const DexFile>>* opened_dex_files) {
+static void AddDexFilesFromOat(
+    const OatFile* oat_file,
+    /*out*/std::vector<const DexFile*>* dex_files,
+    std::vector<std::unique_ptr<const DexFile>>* opened_dex_files) {
   for (const OatDexFile* oat_dex_file : oat_file->GetOatDexFiles()) {
     std::string error;
     std::unique_ptr<const DexFile> dex_file = oat_dex_file->OpenDexFile(&error);
     if (dex_file == nullptr) {
       LOG(WARNING) << "Could not create dex file from oat file: " << error;
     } else if (dex_file->NumClassDefs() > 0U) {
-      heap->emplace(dex_file.get(), /*current_class_index*/0U, already_loaded);
+      dex_files->push_back(dex_file.get());
       opened_dex_files->push_back(std::move(dex_file));
     }
   }
 }
 
-static void AddNext(/*inout*/DexFileAndClassPair* original,
-                    /*inout*/std::priority_queue<DexFileAndClassPair>* heap) {
-  if (original->DexFileHasMoreClasses()) {
-    original->Next();
-    heap->push(std::move(*original));
+static void AddNext(/*inout*/DexFileAndClassPair& original,
+                    /*inout*/std::priority_queue<DexFileAndClassPair>& heap) {
+  if (original.DexFileHasMoreClasses()) {
+    original.Next();
+    heap.push(std::move(original));
   }
 }
 
@@ -297,7 +327,8 @@
 static bool GetDexFilesFromClassLoader(
     ScopedObjectAccessAlreadyRunnable& soa,
     mirror::ClassLoader* class_loader,
-    std::priority_queue<DexFileAndClassPair>* queue) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::vector<const DexFile*>* dex_files)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
   if (ClassLinker::IsBootClassLoader(soa, class_loader)) {
     // The boot class loader. We don't load any of these files, as we know we compiled against
     // them correctly.
@@ -312,7 +343,7 @@
     return false;
   }
 
-  bool recursive_result = GetDexFilesFromClassLoader(soa, class_loader->GetParent(), queue);
+  bool recursive_result = GetDexFilesFromClassLoader(soa, class_loader->GetParent(), dex_files);
   if (!recursive_result) {
     // Something wrong up the chain.
     return false;
@@ -322,7 +353,7 @@
   auto GetDexFilesFn = [&] (const DexFile* cp_dex_file)
             REQUIRES_SHARED(Locks::mutator_lock_) {
     if (cp_dex_file->NumClassDefs() > 0) {
-      queue->emplace(cp_dex_file, 0U, true);
+      dex_files->push_back(cp_dex_file);
     }
     return true;  // Continue looking.
   };
@@ -341,7 +372,8 @@
 static void GetDexFilesFromDexElementsArray(
     ScopedObjectAccessAlreadyRunnable& soa,
     Handle<mirror::ObjectArray<mirror::Object>> dex_elements,
-    std::priority_queue<DexFileAndClassPair>* queue) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::vector<const DexFile*>* dex_files)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
   if (dex_elements == nullptr) {
     // Nothing to do.
     return;
@@ -360,7 +392,7 @@
   auto GetDexFilesFn = [&] (const DexFile* cp_dex_file)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     if (cp_dex_file != nullptr && cp_dex_file->NumClassDefs() > 0) {
-      queue->emplace(cp_dex_file, 0U, true);
+      dex_files->push_back(cp_dex_file);
     }
     return true;  // Continue looking.
   };
@@ -389,43 +421,95 @@
 }
 
 static bool AreSharedLibrariesOk(const std::string& shared_libraries,
-                                 std::priority_queue<DexFileAndClassPair>& queue) {
+                                 std::vector<const DexFile*>& dex_files) {
+  // If no shared libraries, we expect no dex files.
   if (shared_libraries.empty()) {
-    if (queue.empty()) {
-      // No shared libraries or oat files, as expected.
-      return true;
-    }
-  } else {
-    if (shared_libraries.compare(OatFile::kSpecialSharedLibrary) == 0) {
-      // If we find the special shared library, skip the shared libraries check.
-      return true;
-    }
-    // Shared libraries is a series of dex file paths and their checksums, each separated by '*'.
-    std::vector<std::string> shared_libraries_split;
-    Split(shared_libraries, '*', &shared_libraries_split);
-
-    size_t index = 0;
-    std::priority_queue<DexFileAndClassPair> temp = queue;
-    while (!temp.empty() && index < shared_libraries_split.size() - 1) {
-      DexFileAndClassPair pair(temp.top());
-      const DexFile* dex_file = pair.GetDexFile();
-      const std::string& dex_filename = dex_file->GetLocation();
-      if (dex_filename != shared_libraries_split[index]) {
-        break;
-      }
-      char* end;
-      size_t shared_lib_checksum = strtoul(shared_libraries_split[index + 1].c_str(), &end, 10);
-      uint32_t dex_checksum = dex_file->GetLocationChecksum();
-      if (*end != '\0' || dex_checksum != shared_lib_checksum) {
-        break;
-      }
-      temp.pop();
-      index += 2;
-    }
-
-    // Check is successful if it made it through the queue and all the shared libraries.
-    return temp.empty() && index == shared_libraries_split.size();
+    return dex_files.empty();
   }
+  // If we find the special shared library, skip the shared libraries check.
+  if (shared_libraries.compare(OatFile::kSpecialSharedLibrary) == 0) {
+    return true;
+  }
+  // The shared libraries value is a series of dex file paths and checksums, each separated by '*'.
+  std::vector<std::string> shared_libraries_split;
+  Split(shared_libraries, '*', &shared_libraries_split);
+
+  // Sanity check the sizes: the split shared-libraries list should have twice as many entries
+  // as there are dex files, since it stores filename/checksum pairs.
+  if (dex_files.size() * 2 != shared_libraries_split.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < dex_files.size(); ++i) {
+    if (dex_files[i]->GetLocation() != shared_libraries_split[i * 2]) {
+      return false;
+    }
+    char* end;
+    size_t shared_lib_checksum = strtoul(shared_libraries_split[i * 2 + 1].c_str(), &end, 10);
+    uint32_t dex_checksum = dex_files[i]->GetLocationChecksum();
+    if (*end != '\0' || dex_checksum != shared_lib_checksum) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static bool CollisionCheck(std::vector<const DexFile*>& dex_files_loaded,
+                           std::vector<const DexFile*>& dex_files_unloaded,
+                           std::string* error_msg /*out*/) {
+  // Generate type index information for each dex file.
+  std::vector<TypeIndexInfo> loaded_types;
+  for (const DexFile* dex_file : dex_files_loaded) {
+    loaded_types.push_back(TypeIndexInfo(dex_file));
+  }
+  std::vector<TypeIndexInfo> unloaded_types;
+  for (const DexFile* dex_file : dex_files_unloaded) {
+    unloaded_types.push_back(TypeIndexInfo(dex_file));
+  }
+
+  // Populate the queue of dex file and class pairs with the loaded and unloaded dex files.
+  std::priority_queue<DexFileAndClassPair> queue;
+  for (size_t i = 0; i < dex_files_loaded.size(); ++i) {
+    if (loaded_types[i].GetIterator() != loaded_types[i].GetIteratorEnd()) {
+      queue.emplace(dex_files_loaded[i], &loaded_types[i], /*from_loaded_oat*/true);
+    }
+  }
+  for (size_t i = 0; i < dex_files_unloaded.size(); ++i) {
+    if (unloaded_types[i].GetIterator() != unloaded_types[i].GetIteratorEnd()) {
+      queue.emplace(dex_files_unloaded[i], &unloaded_types[i], /*from_loaded_oat*/false);
+    }
+  }
+
+  // Now drain the queue.
+  while (!queue.empty()) {
+    // Modifying the top element is only safe if we pop right after.
+    DexFileAndClassPair compare_pop(queue.top());
+    queue.pop();
+
+    // Compare against the following elements.
+    while (!queue.empty()) {
+      DexFileAndClassPair top(queue.top());
+      if (strcmp(compare_pop.GetCachedDescriptor(), top.GetCachedDescriptor()) == 0) {
+        // Same descriptor. Check whether it's crossing old-oat-files to new-oat-files.
+        if (compare_pop.FromLoadedOat() != top.FromLoadedOat()) {
+          *error_msg =
+              StringPrintf("Found duplicated class when checking oat files: '%s' in %s and %s",
+                           compare_pop.GetCachedDescriptor(),
+                           compare_pop.GetDexFile()->GetLocation().c_str(),
+                           top.GetDexFile()->GetLocation().c_str());
+          return true;
+        }
+        queue.pop();
+        AddNext(top, queue);
+      } else {
+        // Something else. Done here.
+        break;
+      }
+    }
+    AddNext(compare_pop, queue);
+  }
+
   return false;
 }
 
@@ -450,7 +534,7 @@
   DCHECK(oat_file != nullptr);
   DCHECK(error_msg != nullptr);
 
-  std::priority_queue<DexFileAndClassPair> queue;
+  std::vector<const DexFile*> dex_files_loaded;
 
   // Try to get dex files from the given class loader. If the class loader is null, or we do
   // not support one of the class loaders in the chain, conservatively compare against all
@@ -464,12 +548,12 @@
     Handle<mirror::ObjectArray<mirror::Object>> h_dex_elements =
         hs.NewHandle(soa.Decode<mirror::ObjectArray<mirror::Object>>(dex_elements));
     if (h_class_loader != nullptr &&
-        GetDexFilesFromClassLoader(soa, h_class_loader.Get(), &queue)) {
+        GetDexFilesFromClassLoader(soa, h_class_loader.Get(), &dex_files_loaded)) {
       class_loader_ok = true;
 
       // In this case, also take into account the dex_elements array, if given. We don't need to
       // read it otherwise, as we'll compare against all open oat files anyways.
-      GetDexFilesFromDexElementsArray(soa, h_dex_elements, &queue);
+      GetDexFilesFromDexElementsArray(soa, h_dex_elements, &dex_files_loaded);
     } else if (h_class_loader != nullptr) {
       VLOG(class_linker) << "Something unsupported with "
                          << mirror::Class::PrettyClass(h_class_loader->GetClass());
@@ -486,10 +570,8 @@
   if (!class_loader_ok) {
     // Add dex files from already loaded oat files, but skip boot.
 
-    // Clean up the queue.
-    while (!queue.empty()) {
-      queue.pop();
-    }
+    // Clean up the dex files.
+    dex_files_loaded.clear();
 
     std::vector<const OatFile*> boot_oat_files = GetBootOatFiles();
     // The same OatFile can be loaded multiple times at different addresses. In this case, we don't
@@ -503,10 +585,7 @@
           boot_oat_files.end() && location != oat_file->GetLocation() &&
           unique_locations.find(location) == unique_locations.end()) {
         unique_locations.insert(location);
-        AddDexFilesFromOat(loaded_oat_file.get(),
-                           /*already_loaded*/true,
-                           &queue,
-                           /*out*/&opened_dex_files);
+        AddDexFilesFromOat(loaded_oat_file.get(), &dex_files_loaded, &opened_dex_files);
       }
     }
   }
@@ -514,46 +593,15 @@
   // Exit if shared libraries are ok. Do a full duplicate classes check otherwise.
   const std::string
       shared_libraries(oat_file->GetOatHeader().GetStoreValueByKey(OatHeader::kClassPathKey));
-  if (AreSharedLibrariesOk(shared_libraries, queue)) {
+  if (AreSharedLibrariesOk(shared_libraries, dex_files_loaded)) {
     return false;
   }
 
   ScopedTrace st("Collision check");
-
   // Add dex files from the oat file to check.
-  AddDexFilesFromOat(oat_file, /*already_loaded*/false, &queue, &opened_dex_files);
-
-  // Now drain the queue.
-  while (!queue.empty()) {
-    // Modifying the top element is only safe if we pop right after.
-    DexFileAndClassPair compare_pop(queue.top());
-    queue.pop();
-
-    // Compare against the following elements.
-    while (!queue.empty()) {
-      DexFileAndClassPair top(queue.top());
-
-      if (strcmp(compare_pop.GetCachedDescriptor(), top.GetCachedDescriptor()) == 0) {
-        // Same descriptor. Check whether it's crossing old-oat-files to new-oat-files.
-        if (compare_pop.FromLoadedOat() != top.FromLoadedOat()) {
-          *error_msg =
-              StringPrintf("Found duplicated class when checking oat files: '%s' in %s and %s",
-                           compare_pop.GetCachedDescriptor(),
-                           compare_pop.GetDexFile()->GetLocation().c_str(),
-                           top.GetDexFile()->GetLocation().c_str());
-          return true;
-        }
-        queue.pop();
-        AddNext(&top, &queue);
-      } else {
-        // Something else. Done here.
-        break;
-      }
-    }
-    AddNext(&compare_pop, &queue);
-  }
-
-  return false;
+  std::vector<const DexFile*> dex_files_unloaded;
+  AddDexFilesFromOat(oat_file, &dex_files_unloaded, &opened_dex_files);
+  return CollisionCheck(dex_files_loaded, dex_files_unloaded, error_msg);
 }
 
 std::vector<std::unique_ptr<const DexFile>> OatFileManager::OpenDexFilesFromOat(
@@ -729,9 +777,6 @@
     }
   }
 
-  // TODO(calin): Consider optimizing this knowing that is useless to record the
-  // use of fully compiled apks.
-  Runtime::Current()->NotifyDexLoaded(dex_location);
   return dex_files;
 }
 
diff --git a/runtime/openjdkjvmti/OpenjdkJvmTi.cc b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
index c016728..5e0d4bd 100644
--- a/runtime/openjdkjvmti/OpenjdkJvmTi.cc
+++ b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
@@ -1426,6 +1426,7 @@
   ClassUtil::Register(&gEventHandler);
   DumpUtil::Register(&gEventHandler);
   SearchUtil::Register();
+  HeapUtil::Register();
 
   runtime->GetJavaVM()->AddEnvironmentHook(GetEnvHandler);
 
@@ -1438,6 +1439,7 @@
   ClassUtil::Unregister();
   DumpUtil::Unregister();
   SearchUtil::Unregister();
+  HeapUtil::Unregister();
 
   return true;
 }
diff --git a/runtime/openjdkjvmti/jvmti_weak_table-inl.h b/runtime/openjdkjvmti/jvmti_weak_table-inl.h
new file mode 100644
index 0000000..f67fffc
--- /dev/null
+++ b/runtime/openjdkjvmti/jvmti_weak_table-inl.h
@@ -0,0 +1,389 @@
+/* Copyright (C) 2017 The Android Open Source Project
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This file implements interfaces from the file jvmti.h. This implementation
+ * is licensed under the same terms as the file jvmti.h.  The
+ * copyright and license information for the file jvmti.h follows.
+ *
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#ifndef ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_INL_H_
+#define ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_INL_H_
+
+#include "jvmti_weak_table.h"
+
+#include <limits>
+
+#include "art_jvmti.h"
+#include "base/logging.h"
+#include "gc/allocation_listener.h"
+#include "instrumentation.h"
+#include "jni_env_ext-inl.h"
+#include "jvmti_allocator.h"
+#include "mirror/class.h"
+#include "mirror/object.h"
+#include "runtime.h"
+#include "ScopedLocalRef.h"
+
+namespace openjdkjvmti {
+
+template <typename T>
+void JvmtiWeakTable<T>::Lock() {
+  allow_disallow_lock_.ExclusiveLock(art::Thread::Current());
+}
+template <typename T>
+void JvmtiWeakTable<T>::Unlock() {
+  allow_disallow_lock_.ExclusiveUnlock(art::Thread::Current());
+}
+template <typename T>
+void JvmtiWeakTable<T>::AssertLocked() {
+  allow_disallow_lock_.AssertHeld(art::Thread::Current());
+}
+
+template <typename T>
+void JvmtiWeakTable<T>::UpdateTableWithReadBarrier() {
+  update_since_last_sweep_ = true;
+
+  auto WithReadBarrierUpdater = [&](const art::GcRoot<art::mirror::Object>& original_root,
+                                    art::mirror::Object* original_obj ATTRIBUTE_UNUSED)
+     REQUIRES_SHARED(art::Locks::mutator_lock_) {
+    return original_root.Read<art::kWithReadBarrier>();
+  };
+
+  UpdateTableWith<decltype(WithReadBarrierUpdater), kIgnoreNull>(WithReadBarrierUpdater);
+}
+
+template <typename T>
+bool JvmtiWeakTable<T>::GetTagSlowPath(art::Thread* self, art::mirror::Object* obj, T* result) {
+  // Under concurrent GC, there is a window between moving objects and sweeping of system
+  // weaks in which mutators are active. We may receive a to-space object pointer in obj,
+  // but still have from-space pointers in the table. Explicitly update the table once.
+  // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
+  UpdateTableWithReadBarrier();
+  return GetTagLocked(self, obj, result);
+}
+
+template <typename T>
+bool JvmtiWeakTable<T>::Remove(art::mirror::Object* obj, /* out */ T* tag) {
+  art::Thread* self = art::Thread::Current();
+  art::MutexLock mu(self, allow_disallow_lock_);
+  Wait(self);
+
+  return RemoveLocked(self, obj, tag);
+}
+template <typename T>
+bool JvmtiWeakTable<T>::RemoveLocked(art::mirror::Object* obj, T* tag) {
+  art::Thread* self = art::Thread::Current();
+  allow_disallow_lock_.AssertHeld(self);
+  Wait(self);
+
+  return RemoveLocked(self, obj, tag);
+}
+
+template <typename T>
+bool JvmtiWeakTable<T>::RemoveLocked(art::Thread* self, art::mirror::Object* obj, T* tag) {
+  auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
+  if (it != tagged_objects_.end()) {
+    if (tag != nullptr) {
+      *tag = it->second;
+    }
+    tagged_objects_.erase(it);
+    return true;
+  }
+
+  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
+    // Under concurrent GC, there is a window between moving objects and sweeping of system
+    // weaks in which mutators are active. We may receive a to-space object pointer in obj,
+    // but still have from-space pointers in the table. Explicitly update the table once.
+    // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
+
+    // Update the table.
+    UpdateTableWithReadBarrier();
+
+    // And try again.
+    return RemoveLocked(self, obj, tag);
+  }
+
+  // Not in here.
+  return false;
+}
+
+template <typename T>
+bool JvmtiWeakTable<T>::Set(art::mirror::Object* obj, T new_tag) {
+  art::Thread* self = art::Thread::Current();
+  art::MutexLock mu(self, allow_disallow_lock_);
+  Wait(self);
+
+  return SetLocked(self, obj, new_tag);
+}
+template <typename T>
+bool JvmtiWeakTable<T>::SetLocked(art::mirror::Object* obj, T new_tag) {
+  art::Thread* self = art::Thread::Current();
+  allow_disallow_lock_.AssertHeld(self);
+  Wait(self);
+
+  return SetLocked(self, obj, new_tag);
+}
+
+template <typename T>
+bool JvmtiWeakTable<T>::SetLocked(art::Thread* self, art::mirror::Object* obj, T new_tag) {
+  auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
+  if (it != tagged_objects_.end()) {
+    it->second = new_tag;
+    return true;
+  }
+
+  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
+    // Under concurrent GC, there is a window between moving objects and sweeping of system
+    // weaks in which mutators are active. We may receive a to-space object pointer in obj,
+    // but still have from-space pointers in the table. Explicitly update the table once.
+    // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
+
+    // Update the table.
+    UpdateTableWithReadBarrier();
+
+    // And try again.
+    return SetLocked(self, obj, new_tag);
+  }
+
+  // New element.
+  auto insert_it = tagged_objects_.emplace(art::GcRoot<art::mirror::Object>(obj), new_tag);
+  DCHECK(insert_it.second);
+  return false;
+}
+
+template <typename T>
+void JvmtiWeakTable<T>::Sweep(art::IsMarkedVisitor* visitor) {
+  if (DoesHandleNullOnSweep()) {
+    SweepImpl<true>(visitor);
+  } else {
+    SweepImpl<false>(visitor);
+  }
+
+  // Under concurrent GC, there is a window between moving objects and sweeping of system
+  // weaks in which mutators are active. We may receive a to-space object pointer in obj,
+  // but still have from-space pointers in the table. We explicitly update the table then
+  // to ensure we compare against to-space pointers. But we want to do this only once. Once
+  // sweeping is done, we know all objects are to-space pointers until the next GC cycle,
+  // so we re-enable the explicit update for the next marking.
+  update_since_last_sweep_ = false;
+}
+
+template <typename T>
+template <bool kHandleNull>
+void JvmtiWeakTable<T>::SweepImpl(art::IsMarkedVisitor* visitor) {
+  art::Thread* self = art::Thread::Current();
+  art::MutexLock mu(self, allow_disallow_lock_);
+
+  auto IsMarkedUpdater = [&](const art::GcRoot<art::mirror::Object>& original_root ATTRIBUTE_UNUSED,
+                             art::mirror::Object* original_obj) {
+    return visitor->IsMarked(original_obj);
+  };
+
+  UpdateTableWith<decltype(IsMarkedUpdater),
+                  kHandleNull ? kCallHandleNull : kRemoveNull>(IsMarkedUpdater);
+}
+
+template <typename T>
+template <typename Updater, typename JvmtiWeakTable<T>::TableUpdateNullTarget kTargetNull>
+ALWAYS_INLINE inline void JvmtiWeakTable<T>::UpdateTableWith(Updater& updater) {
+  // We optimistically hope that elements will still be well-distributed when re-inserting them.
+  // So play with the map mechanics, and postpone rehashing. This avoids the need for a side
+  // vector and two passes.
+  float original_max_load_factor = tagged_objects_.max_load_factor();
+  tagged_objects_.max_load_factor(std::numeric_limits<float>::max());
+  // For checking that a max load-factor actually does what we expect.
+  size_t original_bucket_count = tagged_objects_.bucket_count();
+
+  for (auto it = tagged_objects_.begin(); it != tagged_objects_.end();) {
+    DCHECK(!it->first.IsNull());
+    art::mirror::Object* original_obj = it->first.template Read<art::kWithoutReadBarrier>();
+    art::mirror::Object* target_obj = updater(it->first, original_obj);
+    if (original_obj != target_obj) {
+      if (kTargetNull == kIgnoreNull && target_obj == nullptr) {
+        // Ignore null target, don't do anything.
+      } else {
+        T tag = it->second;
+        it = tagged_objects_.erase(it);
+        if (target_obj != nullptr) {
+          tagged_objects_.emplace(art::GcRoot<art::mirror::Object>(target_obj), tag);
+          DCHECK_EQ(original_bucket_count, tagged_objects_.bucket_count());
+        } else if (kTargetNull == kCallHandleNull) {
+          HandleNullSweep(tag);
+        }
+        continue;  // Iterator was implicitly updated by erase.
+      }
+    }
+    it++;
+  }
+
+  tagged_objects_.max_load_factor(original_max_load_factor);
+  // TODO: consider rehash here.
+}
+
+template <typename T>
+template <typename Storage, class Allocator>
+struct JvmtiWeakTable<T>::ReleasableContainer {
+  using allocator_type = Allocator;
+
+  explicit ReleasableContainer(const allocator_type& alloc, size_t reserve = 10)
+      : allocator(alloc),
+        data(reserve > 0 ? allocator.allocate(reserve) : nullptr),
+        size(0),
+        capacity(reserve) {
+  }
+
+  ~ReleasableContainer() {
+    if (data != nullptr) {
+      allocator.deallocate(data, capacity);
+      capacity = 0;
+      size = 0;
+    }
+  }
+
+  Storage* Release() {
+    Storage* tmp = data;
+
+    data = nullptr;
+    size = 0;
+    capacity = 0;
+
+    return tmp;
+  }
+
+  void Resize(size_t new_capacity) {
+    CHECK_GT(new_capacity, capacity);
+
+    Storage* tmp = allocator.allocate(new_capacity);
+    DCHECK(tmp != nullptr);
+    if (data != nullptr) {
+      memcpy(tmp, data, sizeof(Storage) * size);
+    }
+    Storage* old = data;
+    data = tmp;
+    allocator.deallocate(old, capacity);
+    capacity = new_capacity;
+  }
+
+  void Pushback(const Storage& elem) {
+    if (size == capacity) {
+      size_t new_capacity = 2 * capacity + 1;
+      Resize(new_capacity);
+    }
+    data[size++] = elem;
+  }
+
+  Allocator allocator;
+  Storage* data;
+  size_t size;
+  size_t capacity;
+};
+
+template <typename T>
+jvmtiError JvmtiWeakTable<T>::GetTaggedObjects(jvmtiEnv* jvmti_env,
+                                               jint tag_count,
+                                               const T* tags,
+                                               jint* count_ptr,
+                                               jobject** object_result_ptr,
+                                               T** tag_result_ptr) {
+  if (tag_count < 0) {
+    return ERR(ILLEGAL_ARGUMENT);
+  }
+  if (tag_count > 0) {
+    for (size_t i = 0; i != static_cast<size_t>(tag_count); ++i) {
+      if (tags[i] == 0) {
+        return ERR(ILLEGAL_ARGUMENT);
+      }
+    }
+  }
+  if (tags == nullptr) {
+    return ERR(NULL_POINTER);
+  }
+  if (count_ptr == nullptr) {
+    return ERR(NULL_POINTER);
+  }
+
+  art::Thread* self = art::Thread::Current();
+  art::MutexLock mu(self, allow_disallow_lock_);
+  Wait(self);
+
+  art::JNIEnvExt* jni_env = self->GetJniEnv();
+
+  constexpr size_t kDefaultSize = 10;
+  size_t initial_object_size;
+  size_t initial_tag_size;
+  if (tag_count == 0) {
+    initial_object_size = (object_result_ptr != nullptr) ? tagged_objects_.size() : 0;
+    initial_tag_size = (tag_result_ptr != nullptr) ? tagged_objects_.size() : 0;
+  } else {
+    initial_object_size = initial_tag_size = kDefaultSize;
+  }
+  JvmtiAllocator<void> allocator(jvmti_env);
+  ReleasableContainer<jobject, JvmtiAllocator<jobject>> selected_objects(allocator,
+                                                                         initial_object_size);
+  ReleasableContainer<T, JvmtiAllocator<T>> selected_tags(allocator, initial_tag_size);
+
+  size_t count = 0;
+  for (auto& pair : tagged_objects_) {
+    bool select;
+    if (tag_count > 0) {
+      select = false;
+      for (size_t i = 0; i != static_cast<size_t>(tag_count); ++i) {
+        if (tags[i] == pair.second) {
+          select = true;
+          break;
+        }
+      }
+    } else {
+      select = true;
+    }
+
+    if (select) {
+      art::mirror::Object* obj = pair.first.template Read<art::kWithReadBarrier>();
+      if (obj != nullptr) {
+        count++;
+        if (object_result_ptr != nullptr) {
+          selected_objects.Pushback(jni_env->AddLocalReference<jobject>(obj));
+        }
+        if (tag_result_ptr != nullptr) {
+          selected_tags.Pushback(pair.second);
+        }
+      }
+    }
+  }
+
+  if (object_result_ptr != nullptr) {
+    *object_result_ptr = selected_objects.Release();
+  }
+  if (tag_result_ptr != nullptr) {
+    *tag_result_ptr = selected_tags.Release();
+  }
+  *count_ptr = static_cast<jint>(count);
+  return ERR(NONE);
+}
+
+}  // namespace openjdkjvmti
+
+#endif  // ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_INL_H_
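
UpdateTableWith() relies on two properties of std::unordered_map: erase() returns the next valid iterator, and insertion only invalidates iterators when it triggers a rehash, which a temporarily huge max_load_factor rules out. That lets keys be rewritten in a single pass without a side vector. A stripped-down sketch of the same trick over a generic map and updater, assumed idempotent just like the read-barrier updater above:

#include <limits>
#include <unordered_map>
#include <utility>

// Rewrite the keys of `map` in place with `updater` (assumed idempotent, so
// re-visiting an element that was just re-inserted is harmless).
template <typename K, typename V, typename Updater>
void UpdateKeysInPlace(std::unordered_map<K, V>& map, Updater updater) {
  const float original_max_load_factor = map.max_load_factor();
  map.max_load_factor(std::numeric_limits<float>::max());  // postpone any rehash

  for (auto it = map.begin(); it != map.end();) {
    K new_key = updater(it->first);
    if (!(new_key == it->first)) {
      V value = std::move(it->second);
      it = map.erase(it);  // erase() hands back the next valid iterator
      // No rehash can happen under the huge load factor, so `it` stays valid.
      map.emplace(std::move(new_key), std::move(value));
      continue;
    }
    ++it;
  }

  map.max_load_factor(original_max_load_factor);  // restore; caller may rehash later
}

The real table additionally distinguishes a null target, either dropping the entry or calling HandleNullSweep(), and asserts that the bucket count never changed while the load factor was raised.
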
diff --git a/runtime/openjdkjvmti/jvmti_weak_table.h b/runtime/openjdkjvmti/jvmti_weak_table.h
new file mode 100644
index 0000000..ae36122
--- /dev/null
+++ b/runtime/openjdkjvmti/jvmti_weak_table.h
@@ -0,0 +1,219 @@
+/* Copyright (C) 2017 The Android Open Source Project
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This file implements interfaces from the file jvmti.h. This implementation
+ * is licensed under the same terms as the file jvmti.h.  The
+ * copyright and license information for the file jvmti.h follows.
+ *
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#ifndef ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_H_
+#define ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_H_
+
+#include <unordered_map>
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "gc/system_weak.h"
+#include "gc_root-inl.h"
+#include "globals.h"
+#include "jvmti.h"
+#include "mirror/object.h"
+#include "thread-inl.h"
+
+namespace openjdkjvmti {
+
+class EventHandler;
+
+// A system-weak container mapping objects to elements of the template type. This corresponds
+// to a weak hash map. For historical reasons the stored value is called "tag."
+template <typename T>
+class JvmtiWeakTable : public art::gc::SystemWeakHolder {
+ public:
+  JvmtiWeakTable()
+      : art::gc::SystemWeakHolder(kTaggingLockLevel),
+        update_since_last_sweep_(false) {
+  }
+
+  // Remove the mapping for the given object, returning whether such a mapping existed (and the old
+  // value).
+  bool Remove(art::mirror::Object* obj, /* out */ T* tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_);
+  bool RemoveLocked(art::mirror::Object* obj, /* out */ T* tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  // Set the mapping for the given object. Returns true if this overwrites an already existing
+  // mapping.
+  virtual bool Set(art::mirror::Object* obj, T tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_);
+  virtual bool SetLocked(art::mirror::Object* obj, T tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  // Return the value associated with the given object. Returns true if the mapping exists, false
+  // otherwise.
+  bool GetTag(art::mirror::Object* obj, /* out */ T* result)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_) {
+    art::Thread* self = art::Thread::Current();
+    art::MutexLock mu(self, allow_disallow_lock_);
+    Wait(self);
+
+    return GetTagLocked(self, obj, result);
+  }
+  bool GetTagLocked(art::mirror::Object* obj, /* out */ T* result)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_) {
+    art::Thread* self = art::Thread::Current();
+    allow_disallow_lock_.AssertHeld(self);
+    Wait(self);
+
+    return GetTagLocked(self, obj, result);
+  }
+
+  // Sweep the container. DO NOT CALL MANUALLY.
+  void Sweep(art::IsMarkedVisitor* visitor)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_);
+
+  // Return all objects that have a value mapping in tags.
+  jvmtiError GetTaggedObjects(jvmtiEnv* jvmti_env,
+                              jint tag_count,
+                              const T* tags,
+                              /* out */ jint* count_ptr,
+                              /* out */ jobject** object_result_ptr,
+                              /* out */ T** tag_result_ptr)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_);
+
+  // Locking functions, to allow coarse-grained locking and amortization.
+  void Lock() ACQUIRE(allow_disallow_lock_);
+  void Unlock() RELEASE(allow_disallow_lock_);
+  void AssertLocked() ASSERT_CAPABILITY(allow_disallow_lock_);
+
+ protected:
+  // Should HandleNullSweep be called when Sweep detects the release of an object?
+  virtual bool DoesHandleNullOnSweep() {
+    return false;
+  }
+  // If DoesHandleNullOnSweep returns true, this function will be called.
+  virtual void HandleNullSweep(T tag ATTRIBUTE_UNUSED) {}
+
+ private:
+  bool SetLocked(art::Thread* self, art::mirror::Object* obj, T tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  bool RemoveLocked(art::Thread* self, art::mirror::Object* obj, /* out */ T* tag)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  bool GetTagLocked(art::Thread* self, art::mirror::Object* obj, /* out */ T* result)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_) {
+    auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
+    if (it != tagged_objects_.end()) {
+      *result = it->second;
+      return true;
+    }
+
+    // Performance optimization: To avoid multiple table updates, ensure that during GC we
+    // only update once. See the comment on the implementation of GetTagSlowPath.
+    if (art::kUseReadBarrier &&
+        self != nullptr &&
+        self->GetIsGcMarking() &&
+        !update_since_last_sweep_) {
+      return GetTagSlowPath(self, obj, result);
+    }
+
+    return false;
+  }
+
+  // Slow-path for GetTag. We didn't find the object, but we might be storing from-pointers and
+  // are asked to retrieve with a to-pointer.
+  bool GetTagSlowPath(art::Thread* self, art::mirror::Object* obj, /* out */ T* result)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  // Update the table by doing read barriers on each element, ensuring that to-space pointers
+  // are stored.
+  void UpdateTableWithReadBarrier()
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  template <bool kHandleNull>
+  void SweepImpl(art::IsMarkedVisitor* visitor)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!allow_disallow_lock_);
+
+  enum TableUpdateNullTarget {
+    kIgnoreNull,
+    kRemoveNull,
+    kCallHandleNull
+  };
+
+  template <typename Updater, TableUpdateNullTarget kTargetNull>
+  void UpdateTableWith(Updater& updater)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(allow_disallow_lock_);
+
+  template <typename Storage, class Allocator = std::allocator<T>>
+  struct ReleasableContainer;
+
+  struct HashGcRoot {
+    size_t operator()(const art::GcRoot<art::mirror::Object>& r) const
+        REQUIRES_SHARED(art::Locks::mutator_lock_) {
+      return reinterpret_cast<uintptr_t>(r.Read<art::kWithoutReadBarrier>());
+    }
+  };
+
+  struct EqGcRoot {
+    bool operator()(const art::GcRoot<art::mirror::Object>& r1,
+                    const art::GcRoot<art::mirror::Object>& r2) const
+        REQUIRES_SHARED(art::Locks::mutator_lock_) {
+      return r1.Read<art::kWithoutReadBarrier>() == r2.Read<art::kWithoutReadBarrier>();
+    }
+  };
+
+  // The tag table is used when visiting roots. So it needs to have a low lock level.
+  static constexpr art::LockLevel kTaggingLockLevel =
+      static_cast<art::LockLevel>(art::LockLevel::kAbortLock + 1);
+
+  std::unordered_map<art::GcRoot<art::mirror::Object>,
+                     T,
+                     HashGcRoot,
+                     EqGcRoot> tagged_objects_
+      GUARDED_BY(allow_disallow_lock_)
+      GUARDED_BY(art::Locks::mutator_lock_);
+  // To avoid repeatedly scanning the whole table, remember if we did that since the last sweep.
+  bool update_since_last_sweep_;
+};
+
+}  // namespace openjdkjvmti
+
+#endif  // ART_RUNTIME_OPENJDKJVMTI_JVMTI_WEAK_TABLE_H_
diff --git a/runtime/openjdkjvmti/object_tagging.cc b/runtime/openjdkjvmti/object_tagging.cc
index b27c2a3..4215588 100644
--- a/runtime/openjdkjvmti/object_tagging.cc
+++ b/runtime/openjdkjvmti/object_tagging.cc
@@ -33,355 +33,34 @@
 
 #include <limits>
 
-#include "art_jvmti.h"
-#include "base/logging.h"
 #include "events-inl.h"
-#include "gc/allocation_listener.h"
-#include "instrumentation.h"
-#include "jni_env_ext-inl.h"
-#include "jvmti_allocator.h"
-#include "mirror/class.h"
-#include "mirror/object.h"
-#include "runtime.h"
-#include "ScopedLocalRef.h"
+#include "jvmti_weak_table-inl.h"
 
 namespace openjdkjvmti {
 
-void ObjectTagTable::Lock() {
-  allow_disallow_lock_.ExclusiveLock(art::Thread::Current());
-}
-void ObjectTagTable::Unlock() {
-  allow_disallow_lock_.ExclusiveUnlock(art::Thread::Current());
-}
-void ObjectTagTable::AssertLocked() {
-  allow_disallow_lock_.AssertHeld(art::Thread::Current());
-}
-
-void ObjectTagTable::UpdateTableWithReadBarrier() {
-  update_since_last_sweep_ = true;
-
-  auto WithReadBarrierUpdater = [&](const art::GcRoot<art::mirror::Object>& original_root,
-                                    art::mirror::Object* original_obj ATTRIBUTE_UNUSED)
-     REQUIRES_SHARED(art::Locks::mutator_lock_) {
-    return original_root.Read<art::kWithReadBarrier>();
-  };
-
-  UpdateTableWith<decltype(WithReadBarrierUpdater), kIgnoreNull>(WithReadBarrierUpdater);
-}
-
-bool ObjectTagTable::GetTagSlowPath(art::Thread* self, art::mirror::Object* obj, jlong* result) {
-  // Under concurrent GC, there is a window between moving objects and sweeping of system
-  // weaks in which mutators are active. We may receive a to-space object pointer in obj,
-  // but still have from-space pointers in the table. Explicitly update the table once.
-  // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
-  UpdateTableWithReadBarrier();
-  return GetTagLocked(self, obj, result);
-}
-
-void ObjectTagTable::Add(art::mirror::Object* obj, jlong tag) {
-  // Same as Set(), as we don't have duplicates in an unordered_map.
-  Set(obj, tag);
-}
-
-bool ObjectTagTable::Remove(art::mirror::Object* obj, jlong* tag) {
-  art::Thread* self = art::Thread::Current();
-  art::MutexLock mu(self, allow_disallow_lock_);
-  Wait(self);
-
-  return RemoveLocked(self, obj, tag);
-}
-bool ObjectTagTable::RemoveLocked(art::mirror::Object* obj, jlong* tag) {
-  art::Thread* self = art::Thread::Current();
-  allow_disallow_lock_.AssertHeld(self);
-  Wait(self);
-
-  return RemoveLocked(self, obj, tag);
-}
-
-bool ObjectTagTable::RemoveLocked(art::Thread* self, art::mirror::Object* obj, jlong* tag) {
-  auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
-  if (it != tagged_objects_.end()) {
-    if (tag != nullptr) {
-      *tag = it->second;
-    }
-    tagged_objects_.erase(it);
-    return true;
-  }
-
-  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
-    // Under concurrent GC, there is a window between moving objects and sweeping of system
-    // weaks in which mutators are active. We may receive a to-space object pointer in obj,
-    // but still have from-space pointers in the table. Explicitly update the table once.
-    // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
-
-    // Update the table.
-    UpdateTableWithReadBarrier();
-
-    // And try again.
-    return RemoveLocked(self, obj, tag);
-  }
-
-  // Not in here.
-  return false;
-}
+// Instantiate for jlong = JVMTI tags.
+template class JvmtiWeakTable<jlong>;
 
 bool ObjectTagTable::Set(art::mirror::Object* obj, jlong new_tag) {
   if (new_tag == 0) {
     jlong tmp;
     return Remove(obj, &tmp);
   }
-
-  art::Thread* self = art::Thread::Current();
-  art::MutexLock mu(self, allow_disallow_lock_);
-  Wait(self);
-
-  return SetLocked(self, obj, new_tag);
+  return JvmtiWeakTable<jlong>::Set(obj, new_tag);
 }
 bool ObjectTagTable::SetLocked(art::mirror::Object* obj, jlong new_tag) {
   if (new_tag == 0) {
     jlong tmp;
     return RemoveLocked(obj, &tmp);
   }
-
-  art::Thread* self = art::Thread::Current();
-  allow_disallow_lock_.AssertHeld(self);
-  Wait(self);
-
-  return SetLocked(self, obj, new_tag);
+  return JvmtiWeakTable<jlong>::SetLocked(obj, new_tag);
 }
 
-bool ObjectTagTable::SetLocked(art::Thread* self, art::mirror::Object* obj, jlong new_tag) {
-  auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
-  if (it != tagged_objects_.end()) {
-    it->second = new_tag;
-    return true;
-  }
-
-  if (art::kUseReadBarrier && self->GetIsGcMarking() && !update_since_last_sweep_) {
-    // Under concurrent GC, there is a window between moving objects and sweeping of system
-    // weaks in which mutators are active. We may receive a to-space object pointer in obj,
-    // but still have from-space pointers in the table. Explicitly update the table once.
-    // Note: this will keep *all* objects in the table live, but should be a rare occurrence.
-
-    // Update the table.
-    UpdateTableWithReadBarrier();
-
-    // And try again.
-    return SetLocked(self, obj, new_tag);
-  }
-
-  // New element.
-  auto insert_it = tagged_objects_.emplace(art::GcRoot<art::mirror::Object>(obj), new_tag);
-  DCHECK(insert_it.second);
-  return false;
+bool ObjectTagTable::DoesHandleNullOnSweep() {
+  return event_handler_->IsEventEnabledAnywhere(ArtJvmtiEvent::kObjectFree);
 }
-
-void ObjectTagTable::Sweep(art::IsMarkedVisitor* visitor) {
-  if (event_handler_->IsEventEnabledAnywhere(ArtJvmtiEvent::kObjectFree)) {
-    SweepImpl<true>(visitor);
-  } else {
-    SweepImpl<false>(visitor);
-  }
-
-  // Under concurrent GC, there is a window between moving objects and sweeping of system
-  // weaks in which mutators are active. We may receive a to-space object pointer in obj,
-  // but still have from-space pointers in the table. We explicitly update the table then
-  // to ensure we compare against to-space pointers. But we want to do this only once. Once
-  // sweeping is done, we know all objects are to-space pointers until the next GC cycle,
-  // so we re-enable the explicit update for the next marking.
-  update_since_last_sweep_ = false;
-}
-
-template <bool kHandleNull>
-void ObjectTagTable::SweepImpl(art::IsMarkedVisitor* visitor) {
-  art::Thread* self = art::Thread::Current();
-  art::MutexLock mu(self, allow_disallow_lock_);
-
-  auto IsMarkedUpdater = [&](const art::GcRoot<art::mirror::Object>& original_root ATTRIBUTE_UNUSED,
-                             art::mirror::Object* original_obj) {
-    return visitor->IsMarked(original_obj);
-  };
-
-  UpdateTableWith<decltype(IsMarkedUpdater),
-                  kHandleNull ? kCallHandleNull : kRemoveNull>(IsMarkedUpdater);
-}
-
 void ObjectTagTable::HandleNullSweep(jlong tag) {
   event_handler_->DispatchEvent<ArtJvmtiEvent::kObjectFree>(nullptr, tag);
 }
 
-template <typename T, ObjectTagTable::TableUpdateNullTarget kTargetNull>
-ALWAYS_INLINE inline void ObjectTagTable::UpdateTableWith(T& updater) {
-  // We optimistically hope that elements will still be well-distributed when re-inserting them.
-  // So play with the map mechanics, and postpone rehashing. This avoids the need of a side
-  // vector and two passes.
-  float original_max_load_factor = tagged_objects_.max_load_factor();
-  tagged_objects_.max_load_factor(std::numeric_limits<float>::max());
-  // For checking that a max load-factor actually does what we expect.
-  size_t original_bucket_count = tagged_objects_.bucket_count();
-
-  for (auto it = tagged_objects_.begin(); it != tagged_objects_.end();) {
-    DCHECK(!it->first.IsNull());
-    art::mirror::Object* original_obj = it->first.Read<art::kWithoutReadBarrier>();
-    art::mirror::Object* target_obj = updater(it->first, original_obj);
-    if (original_obj != target_obj) {
-      if (kTargetNull == kIgnoreNull && target_obj == nullptr) {
-        // Ignore null target, don't do anything.
-      } else {
-        jlong tag = it->second;
-        it = tagged_objects_.erase(it);
-        if (target_obj != nullptr) {
-          tagged_objects_.emplace(art::GcRoot<art::mirror::Object>(target_obj), tag);
-          DCHECK_EQ(original_bucket_count, tagged_objects_.bucket_count());
-        } else if (kTargetNull == kCallHandleNull) {
-          HandleNullSweep(tag);
-        }
-        continue;  // Iterator was implicitly updated by erase.
-      }
-    }
-    it++;
-  }
-
-  tagged_objects_.max_load_factor(original_max_load_factor);
-  // TODO: consider rehash here.
-}
-
-template <typename T, class Allocator = std::allocator<T>>
-struct ReleasableContainer {
-  using allocator_type = Allocator;
-
-  explicit ReleasableContainer(const allocator_type& alloc, size_t reserve = 10)
-      : allocator(alloc),
-        data(reserve > 0 ? allocator.allocate(reserve) : nullptr),
-        size(0),
-        capacity(reserve) {
-  }
-
-  ~ReleasableContainer() {
-    if (data != nullptr) {
-      allocator.deallocate(data, capacity);
-      capacity = 0;
-      size = 0;
-    }
-  }
-
-  T* Release() {
-    T* tmp = data;
-
-    data = nullptr;
-    size = 0;
-    capacity = 0;
-
-    return tmp;
-  }
-
-  void Resize(size_t new_capacity) {
-    CHECK_GT(new_capacity, capacity);
-
-    T* tmp = allocator.allocate(new_capacity);
-    DCHECK(tmp != nullptr);
-    if (data != nullptr) {
-      memcpy(tmp, data, sizeof(T) * size);
-    }
-    T* old = data;
-    data = tmp;
-    allocator.deallocate(old, capacity);
-    capacity = new_capacity;
-  }
-
-  void Pushback(const T& elem) {
-    if (size == capacity) {
-      size_t new_capacity = 2 * capacity + 1;
-      Resize(new_capacity);
-    }
-    data[size++] = elem;
-  }
-
-  Allocator allocator;
-  T* data;
-  size_t size;
-  size_t capacity;
-};
-
-jvmtiError ObjectTagTable::GetTaggedObjects(jvmtiEnv* jvmti_env,
-                                            jint tag_count,
-                                            const jlong* tags,
-                                            jint* count_ptr,
-                                            jobject** object_result_ptr,
-                                            jlong** tag_result_ptr) {
-  if (tag_count < 0) {
-    return ERR(ILLEGAL_ARGUMENT);
-  }
-  if (tag_count > 0) {
-    for (size_t i = 0; i != static_cast<size_t>(tag_count); ++i) {
-      if (tags[i] == 0) {
-        return ERR(ILLEGAL_ARGUMENT);
-      }
-    }
-  }
-  if (tags == nullptr) {
-    return ERR(NULL_POINTER);
-  }
-  if (count_ptr == nullptr) {
-    return ERR(NULL_POINTER);
-  }
-
-  art::Thread* self = art::Thread::Current();
-  art::MutexLock mu(self, allow_disallow_lock_);
-  Wait(self);
-
-  art::JNIEnvExt* jni_env = self->GetJniEnv();
-
-  constexpr size_t kDefaultSize = 10;
-  size_t initial_object_size;
-  size_t initial_tag_size;
-  if (tag_count == 0) {
-    initial_object_size = (object_result_ptr != nullptr) ? tagged_objects_.size() : 0;
-    initial_tag_size = (tag_result_ptr != nullptr) ? tagged_objects_.size() : 0;
-  } else {
-    initial_object_size = initial_tag_size = kDefaultSize;
-  }
-  JvmtiAllocator<void> allocator(jvmti_env);
-  ReleasableContainer<jobject, JvmtiAllocator<jobject>> selected_objects(allocator, initial_object_size);
-  ReleasableContainer<jlong, JvmtiAllocator<jlong>> selected_tags(allocator, initial_tag_size);
-
-  size_t count = 0;
-  for (auto& pair : tagged_objects_) {
-    bool select;
-    if (tag_count > 0) {
-      select = false;
-      for (size_t i = 0; i != static_cast<size_t>(tag_count); ++i) {
-        if (tags[i] == pair.second) {
-          select = true;
-          break;
-        }
-      }
-    } else {
-      select = true;
-    }
-
-    if (select) {
-      art::mirror::Object* obj = pair.first.Read<art::kWithReadBarrier>();
-      if (obj != nullptr) {
-        count++;
-        if (object_result_ptr != nullptr) {
-          selected_objects.Pushback(jni_env->AddLocalReference<jobject>(obj));
-        }
-        if (tag_result_ptr != nullptr) {
-          selected_tags.Pushback(pair.second);
-        }
-      }
-    }
-  }
-
-  if (object_result_ptr != nullptr) {
-    *object_result_ptr = selected_objects.Release();
-  }
-  if (tag_result_ptr != nullptr) {
-    *tag_result_ptr = selected_tags.Release();
-  }
-  *count_ptr = static_cast<jint>(count);
-  return ERR(NONE);
-}
-
 }  // namespace openjdkjvmti
diff --git a/runtime/openjdkjvmti/object_tagging.h b/runtime/openjdkjvmti/object_tagging.h
index 0296f1a..b5a601c 100644
--- a/runtime/openjdkjvmti/object_tagging.h
+++ b/runtime/openjdkjvmti/object_tagging.h
@@ -1,17 +1,32 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
+/* Copyright (C) 2016 The Android Open Source Project
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * This file implements interfaces from the file jvmti.h. This implementation
+ * is licensed under the same terms as the file jvmti.h.  The
+ * copyright and license information for the file jvmti.h follows.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
  */
 
 #ifndef ART_RUNTIME_OPENJDKJVMTI_OBJECT_TAGGING_H_
@@ -20,62 +35,27 @@
 #include <unordered_map>
 
 #include "base/mutex.h"
-#include "gc/system_weak.h"
-#include "gc_root-inl.h"
 #include "globals.h"
 #include "jvmti.h"
+#include "jvmti_weak_table.h"
 #include "mirror/object.h"
-#include "thread-inl.h"
 
 namespace openjdkjvmti {
 
 class EventHandler;
 
-class ObjectTagTable : public art::gc::SystemWeakHolder {
+class ObjectTagTable FINAL : public JvmtiWeakTable<jlong> {
  public:
-  explicit ObjectTagTable(EventHandler* event_handler)
-      : art::gc::SystemWeakHolder(kTaggingLockLevel),
-        update_since_last_sweep_(false),
-        event_handler_(event_handler) {
+  explicit ObjectTagTable(EventHandler* event_handler) : event_handler_(event_handler) {
   }
 
-  void Add(art::mirror::Object* obj, jlong tag)
+  bool Set(art::mirror::Object* obj, jlong tag) OVERRIDE
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_);
-
-  bool Remove(art::mirror::Object* obj, jlong* tag)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_);
-  bool RemoveLocked(art::mirror::Object* obj, jlong* tag)
+  bool SetLocked(art::mirror::Object* obj, jlong tag) OVERRIDE
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(allow_disallow_lock_);
 
-  bool Set(art::mirror::Object* obj, jlong tag)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_);
-  bool SetLocked(art::mirror::Object* obj, jlong tag)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  bool GetTag(art::mirror::Object* obj, jlong* result)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_) {
-    art::Thread* self = art::Thread::Current();
-    art::MutexLock mu(self, allow_disallow_lock_);
-    Wait(self);
-
-    return GetTagLocked(self, obj, result);
-  }
-  bool GetTagLocked(art::mirror::Object* obj, jlong* result)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_) {
-    art::Thread* self = art::Thread::Current();
-    allow_disallow_lock_.AssertHeld(self);
-    Wait(self);
-
-    return GetTagLocked(self, obj, result);
-  }
-
   jlong GetTagOrZero(art::mirror::Object* obj)
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!allow_disallow_lock_) {
@@ -91,108 +71,11 @@
     return tmp;
   }
 
-  void Sweep(art::IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_);
-
-  jvmtiError GetTaggedObjects(jvmtiEnv* jvmti_env,
-                              jint tag_count,
-                              const jlong* tags,
-                              jint* count_ptr,
-                              jobject** object_result_ptr,
-                              jlong** tag_result_ptr)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_);
-
-  void Lock() ACQUIRE(allow_disallow_lock_);
-  void Unlock() RELEASE(allow_disallow_lock_);
-  void AssertLocked() ASSERT_CAPABILITY(allow_disallow_lock_);
+ protected:
+  bool DoesHandleNullOnSweep() OVERRIDE;
+  void HandleNullSweep(jlong tag) OVERRIDE;
 
  private:
-  bool SetLocked(art::Thread* self, art::mirror::Object* obj, jlong tag)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  bool RemoveLocked(art::Thread* self, art::mirror::Object* obj, jlong* tag)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  bool GetTagLocked(art::Thread* self, art::mirror::Object* obj, jlong* result)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_) {
-    auto it = tagged_objects_.find(art::GcRoot<art::mirror::Object>(obj));
-    if (it != tagged_objects_.end()) {
-      *result = it->second;
-      return true;
-    }
-
-    if (art::kUseReadBarrier &&
-        self != nullptr &&
-        self->GetIsGcMarking() &&
-        !update_since_last_sweep_) {
-      return GetTagSlowPath(self, obj, result);
-    }
-
-    return false;
-  }
-
-  // Slow-path for GetTag. We didn't find the object, but we might be storing from-pointers and
-  // are asked to retrieve with a to-pointer.
-  bool GetTagSlowPath(art::Thread* self, art::mirror::Object* obj, jlong* result)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  // Update the table by doing read barriers on each element, ensuring that to-space pointers
-  // are stored.
-  void UpdateTableWithReadBarrier()
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  template <bool kHandleNull>
-  void SweepImpl(art::IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(!allow_disallow_lock_);
-  void HandleNullSweep(jlong tag);
-
-  enum TableUpdateNullTarget {
-    kIgnoreNull,
-    kRemoveNull,
-    kCallHandleNull
-  };
-
-  template <typename T, TableUpdateNullTarget kTargetNull>
-  void UpdateTableWith(T& updater)
-      REQUIRES_SHARED(art::Locks::mutator_lock_)
-      REQUIRES(allow_disallow_lock_);
-
-  struct HashGcRoot {
-    size_t operator()(const art::GcRoot<art::mirror::Object>& r) const
-        REQUIRES_SHARED(art::Locks::mutator_lock_) {
-      return reinterpret_cast<uintptr_t>(r.Read<art::kWithoutReadBarrier>());
-    }
-  };
-
-  struct EqGcRoot {
-    bool operator()(const art::GcRoot<art::mirror::Object>& r1,
-                    const art::GcRoot<art::mirror::Object>& r2) const
-        REQUIRES_SHARED(art::Locks::mutator_lock_) {
-      return r1.Read<art::kWithoutReadBarrier>() == r2.Read<art::kWithoutReadBarrier>();
-    }
-  };
-
-  // The tag table is used when visiting roots. So it needs to have a low lock level.
-  static constexpr art::LockLevel kTaggingLockLevel =
-      static_cast<art::LockLevel>(art::LockLevel::kAbortLock + 1);
-
-  std::unordered_map<art::GcRoot<art::mirror::Object>,
-                     jlong,
-                     HashGcRoot,
-                     EqGcRoot> tagged_objects_
-      GUARDED_BY(allow_disallow_lock_)
-      GUARDED_BY(art::Locks::mutator_lock_);
-  // To avoid repeatedly scanning the whole table, remember if we did that since the last sweep.
-  bool update_since_last_sweep_;
-
   EventHandler* event_handler_;
 };
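
The ObjectTagTable rewrite above keeps only the jlong-specific behaviour (the callback on a null sweep) and pushes storage, locking and sweeping into the generic JvmtiWeakTable<T>. A minimal Java sketch of that shape, assuming nothing about ART's internals (the real class manages GC roots, read barriers and the allow/disallow protocol itself, and keys by identity):

import java.util.Map;
import java.util.WeakHashMap;

// Illustrative only: a weakly-keyed table parameterized over the tag type, so the
// same storage can back an Object->long tag table or any other per-object cache.
final class WeakTagTable<T> {
  // WeakHashMap keys by equals(); a faithful tag table would key by identity.
  private final Map<Object, T> table = new WeakHashMap<>();

  synchronized boolean set(Object obj, T tag) {
    return table.put(obj, tag) != null;   // true if an existing tag was replaced
  }

  synchronized T get(Object obj) {
    return table.get(obj);
  }

  synchronized boolean remove(Object obj) {
    return table.remove(obj) != null;
  }
}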
 
diff --git a/runtime/openjdkjvmti/ti_heap.cc b/runtime/openjdkjvmti/ti_heap.cc
index d52f0ea..c2495e3 100644
--- a/runtime/openjdkjvmti/ti_heap.cc
+++ b/runtime/openjdkjvmti/ti_heap.cc
@@ -25,6 +25,7 @@
 #include "gc_root-inl.h"
 #include "jni_env_ext.h"
 #include "jni_internal.h"
+#include "jvmti_weak_table-inl.h"
 #include "mirror/class.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
@@ -41,6 +42,21 @@
 
 namespace {
 
+struct IndexCache {
+  // The number of interface fields implemented by the class. This is a prefix to all assigned
+  // field indices.
+  size_t interface_fields;
+
+  // It would be nice to also cache the following, but it is complicated to wire up into the
+  // generic visit:
+  // The number of fields in interfaces and superclasses. This is the first index assigned to
+  // fields of the class.
+  // size_t superclass_fields;
+};
+using IndexCachingTable = JvmtiWeakTable<IndexCache>;
+
+static IndexCachingTable gIndexCachingTable;
+
 // Report the contents of a string, if a callback is set.
 jint ReportString(art::ObjPtr<art::mirror::Object> obj,
                   jvmtiEnv* env,
@@ -402,6 +418,12 @@
   // "non-marker" interfaces (= interfaces with methods).
   static size_t CountInterfaceFields(art::ObjPtr<art::mirror::Class> klass)
       REQUIRES_SHARED(art::Locks::mutator_lock_) {
+    // Do we have a cached value?
+    IndexCache tmp;
+    if (gIndexCachingTable.GetTag(klass.Ptr(), &tmp)) {
+      return tmp.interface_fields;
+    }
+
     size_t count = 0;
     auto visitor = [&count](art::ObjPtr<art::mirror::Class> inf_klass)
         REQUIRES_SHARED(art::Locks::mutator_lock_) {
@@ -410,9 +432,12 @@
       count += inf_klass->NumStaticFields();
     };
     RecursiveInterfaceVisit<decltype(visitor)>::VisitStatic(art::Thread::Current(), klass, visitor);
-    return count;
 
-    // TODO: Implement caching.
+    // Store this into the cache.
+    tmp.interface_fields = count;
+    gIndexCachingTable.Set(klass.Ptr(), tmp);
+
+    return count;
   }
 
   UserData* user_data_;
@@ -618,6 +643,14 @@
 
 }  // namespace
 
+void HeapUtil::Register() {
+  art::Runtime::Current()->AddSystemWeakHolder(&gIndexCachingTable);
+}
+
+void HeapUtil::Unregister() {
+  art::Runtime::Current()->RemoveSystemWeakHolder(&gIndexCachingTable);
+}
+
 struct IterateThroughHeapData {
   IterateThroughHeapData(HeapUtil* _heap_util,
                          jvmtiEnv* _env,
@@ -1004,7 +1037,6 @@
         jvmtiHeapReferenceInfo reference_info;
         memset(&reference_info, 0, sizeof(reference_info));
 
-        // TODO: Implement spec-compliant numbering.
         reference_info.field.index = field_index;
 
         jvmtiHeapReferenceKind kind =
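
The CountInterfaceFields change above is plain memoization: compute the interface-field count once per class, cache it in a weak table keyed by the class, and reuse it on later heap walks (HeapUtil::Register/Unregister wire that table up as a system weak holder). A rough Java analogue of the idea, using reflection instead of ART's class visitor; the real code recurses into superinterfaces, skips marker interfaces, and keeps the cache weak so classes can still be unloaded:

import java.lang.reflect.Modifier;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

final class InterfaceFieldCountCache {
  private static final Map<Class<?>, Integer> CACHE = new ConcurrentHashMap<>();

  static int interfaceFieldCount(Class<?> klass) {
    return CACHE.computeIfAbsent(klass, k -> {
      int count = 0;
      for (Class<?> itf : k.getInterfaces()) {            // direct interfaces only here
        for (java.lang.reflect.Field f : itf.getDeclaredFields()) {
          if (Modifier.isStatic(f.getModifiers())) {
            count++;                                       // count static interface fields
          }
        }
      }
      return count;                                        // cached for subsequent walks
    });
  }
}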
diff --git a/runtime/openjdkjvmti/ti_heap.h b/runtime/openjdkjvmti/ti_heap.h
index 72ee097..dccecb4 100644
--- a/runtime/openjdkjvmti/ti_heap.h
+++ b/runtime/openjdkjvmti/ti_heap.h
@@ -49,6 +49,9 @@
     return tags_;
   }
 
+  static void Register();
+  static void Unregister();
+
  private:
   ObjectTagTable* tags_;
 };
diff --git a/runtime/openjdkjvmti/ti_redefine.cc b/runtime/openjdkjvmti/ti_redefine.cc
index 0bf8e05..9c1d6ef 100644
--- a/runtime/openjdkjvmti/ti_redefine.cc
+++ b/runtime/openjdkjvmti/ti_redefine.cc
@@ -325,12 +325,19 @@
   std::vector<ArtClassDefinition> def_vector;
   def_vector.reserve(class_count);
   for (jint i = 0; i < class_count; i++) {
+    jboolean is_modifiable = JNI_FALSE;
+    jvmtiError res = env->IsModifiableClass(definitions[i].klass, &is_modifiable);
+    if (res != OK) {
+      return res;
+    } else if (!is_modifiable) {
+      return ERR(UNMODIFIABLE_CLASS);
+    }
     // We make a copy of the class_bytes to pass into the retransformation.
     // This makes cleanup easier (since we unambiguously own the bytes) and also is useful since we
     // will need to keep the original bytes around unaltered for subsequent RetransformClasses calls
     // to get the passed in bytes.
     unsigned char* class_bytes_copy = nullptr;
-    jvmtiError res = env->Allocate(definitions[i].class_byte_count, &class_bytes_copy);
+    res = env->Allocate(definitions[i].class_byte_count, &class_bytes_copy);
     if (res != OK) {
       return res;
     }
diff --git a/runtime/openjdkjvmti/transform.cc b/runtime/openjdkjvmti/transform.cc
index 36421b9..bd52cbb 100644
--- a/runtime/openjdkjvmti/transform.cc
+++ b/runtime/openjdkjvmti/transform.cc
@@ -109,6 +109,13 @@
   std::vector<ArtClassDefinition> definitions;
   jvmtiError res = OK;
   for (jint i = 0; i < class_count; i++) {
+    jboolean is_modifiable = JNI_FALSE;
+    res = env->IsModifiableClass(classes[i], &is_modifiable);
+    if (res != OK) {
+      return res;
+    } else if (!is_modifiable) {
+      return ERR(UNMODIFIABLE_CLASS);
+    }
     ArtClassDefinition def;
     res = FillInTransformationData(env, classes[i], &def);
     if (res != OK) {
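
Both the RedefineClasses path (ti_redefine.cc) and the RetransformClasses path above now reject unmodifiable classes before any per-class allocation or byte copying happens. The same fail-fast pattern as seen from the Java side, using the standard java.lang.instrument API (an agent-supplied Instrumentation instance is assumed; this is an illustration, not ART code):

import java.lang.instrument.ClassDefinition;
import java.lang.instrument.Instrumentation;
import java.lang.instrument.UnmodifiableClassException;

final class RedefineHelper {
  static void redefineIfPossible(Instrumentation inst, Class<?> klass, byte[] newBytes)
      throws ClassNotFoundException, UnmodifiableClassException {
    if (!inst.isModifiableClass(klass)) {
      // Fail fast, before doing any work for this or the remaining classes.
      throw new UnmodifiableClassException();
    }
    inst.redefineClasses(new ClassDefinition(klass, newBytes));
  }
}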
diff --git a/runtime/quick/inline_method_analyser.cc b/runtime/quick/inline_method_analyser.cc
index b009b47..3347070 100644
--- a/runtime/quick/inline_method_analyser.cc
+++ b/runtime/quick/inline_method_analyser.cc
@@ -215,9 +215,8 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(IsInstructionIPut(new_iput->Opcode()));
   uint32_t field_index = new_iput->VRegC_22c();
-  PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
-  mirror::DexCache* dex_cache = method->GetDexCache();
-  ArtField* field = dex_cache->GetResolvedField(field_index, pointer_size);
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* field = class_linker->LookupResolvedField(field_index, method, /* is_static */ false);
   if (UNLIKELY(field == nullptr)) {
     return false;
   }
@@ -227,7 +226,9 @@
     if (iputs[old_pos].field_index == DexFile::kDexNoIndex16) {
       break;
     }
-    ArtField* f = dex_cache->GetResolvedField(iputs[old_pos].field_index, pointer_size);
+    ArtField* f = class_linker->LookupResolvedField(iputs[old_pos].field_index,
+                                                    method,
+                                                    /* is_static */ false);
     DCHECK(f != nullptr);
     if (f == field) {
       auto back_it = std::copy(iputs + old_pos + 1, iputs + arraysize(iputs), iputs + old_pos);
@@ -732,9 +733,9 @@
   if (method == nullptr) {
     return false;
   }
-  mirror::DexCache* dex_cache = method->GetDexCache();
-  PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
-  ArtField* field = dex_cache->GetResolvedField(field_idx, pointer_size);
+  ObjPtr<mirror::DexCache> dex_cache = method->GetDexCache();
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* field = class_linker->LookupResolvedField(field_idx, method, /* is_static */ false);
   if (field == nullptr || field->IsStatic()) {
     return false;
   }
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 42a0ca9..9fd2c88 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1963,9 +1963,7 @@
 }
 
 void Runtime::RegisterAppInfo(const std::vector<std::string>& code_paths,
-                              const std::string& profile_output_filename,
-                              const std::string& foreign_dex_profile_path,
-                              const std::string& app_dir) {
+                              const std::string& profile_output_filename) {
   if (jit_.get() == nullptr) {
     // We are not JITing. Nothing to do.
     return;
@@ -1987,18 +1985,7 @@
     return;
   }
 
-  jit_->StartProfileSaver(profile_output_filename,
-                          code_paths,
-                          foreign_dex_profile_path,
-                          app_dir);
-}
-
-void Runtime::NotifyDexLoaded(const std::string& dex_location) {
-  VLOG(profiler) << "Notify dex loaded: " << dex_location;
-  // We know that if the ProfileSaver is started then we can record profile information.
-  if (ProfileSaver::IsStarted()) {
-    ProfileSaver::NotifyDexUse(dex_location);
-  }
+  jit_->StartProfileSaver(profile_output_filename, code_paths);
 }
 
 // Transaction support.
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 4a0169d..d244a9b 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -454,10 +454,7 @@
   }
 
   void RegisterAppInfo(const std::vector<std::string>& code_paths,
-                       const std::string& profile_output_filename,
-                       const std::string& foreign_dex_profile_path,
-                       const std::string& app_dir);
-  void NotifyDexLoaded(const std::string& dex_location);
+                       const std::string& profile_output_filename);
 
   // Transaction support.
   bool IsActiveTransaction() const {
diff --git a/runtime/scoped_thread_state_change.h b/runtime/scoped_thread_state_change.h
index a3286ac..5f03741 100644
--- a/runtime/scoped_thread_state_change.h
+++ b/runtime/scoped_thread_state_change.h
@@ -141,6 +141,8 @@
   ALWAYS_INLINE explicit ScopedObjectAccessUnchecked(Thread* self)
       REQUIRES(!Locks::thread_suspend_count_lock_);
 
+  ALWAYS_INLINE ~ScopedObjectAccessUnchecked() REQUIRES(!Locks::thread_suspend_count_lock_) {}
+
   // Used when we want a scoped JNI thread state but have no thread/JNIEnv. Consequently doesn't
   // change into Runnable or acquire a share on the mutator_lock_.
   explicit ScopedObjectAccessUnchecked(JavaVM* vm) ALWAYS_INLINE
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 67f0b57..d936ce9 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -571,7 +571,7 @@
     }
   }
 
-  bool IsDexRegisterLive(uint16_t dex_register_number) const {
+  ALWAYS_INLINE bool IsDexRegisterLive(uint16_t dex_register_number) const {
     size_t live_bit_mask_offset_in_bits = GetLiveBitMaskOffset() * kBitsPerByte;
     return region_.LoadBit(live_bit_mask_offset_in_bits + dex_register_number);
   }
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 482e0e3..02a1e4d 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -94,9 +94,7 @@
           if (held_mutex != nullptr &&
               held_mutex != Locks::mutator_lock_ &&
               held_mutex != cond_var_mutex) {
-            std::vector<BaseMutex*>& expected_mutexes = Locks::expected_mutexes_on_weak_ref_access_;
-            CHECK(std::find(expected_mutexes.begin(), expected_mutexes.end(), held_mutex) !=
-                  expected_mutexes.end())
+            CHECK(Locks::IsExpectedOnWeakRefAccess(held_mutex))
                 << "Holding unexpected mutex " << held_mutex->GetName()
                 << " when accessing weak ref";
           }
diff --git a/runtime/thread.cc b/runtime/thread.cc
index ff66cc1..30a4046 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1934,7 +1934,6 @@
   wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
   tlsPtr_.instrumentation_stack = new std::deque<instrumentation::InstrumentationStackFrame>;
   tlsPtr_.name = new std::string(kThreadNameDuringStartup);
-  tlsPtr_.nested_signal_state = static_cast<jmp_buf*>(malloc(sizeof(jmp_buf)));
 
   static_assert((sizeof(Thread) % 4) == 0U,
                 "art::Thread has a size which is not a multiple of 4.");
@@ -2118,7 +2117,6 @@
   delete tlsPtr_.instrumentation_stack;
   delete tlsPtr_.name;
   delete tlsPtr_.deps_or_stack_trace_sample.stack_trace_sample;
-  free(tlsPtr_.nested_signal_state);
 
   Runtime::Current()->GetHeap()->AssertThreadLocalBuffersAreRevoked(this);
 
diff --git a/runtime/thread.h b/runtime/thread.h
index d5fd9e9..de0b892 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1115,21 +1115,12 @@
     return tlsPtr_.mterp_alt_ibase;
   }
 
-  // Notify that a signal is being handled. This is to protect us from doing recursive
-  // NPE handling after a SIGSEGV.
-  void NoteSignalBeingHandled() {
-    if (tls32_.handling_signal_) {
-      LOG(FATAL) << "Detected signal while processing a signal";
-    }
-    tls32_.handling_signal_ = true;
+  bool HandlingSignal() const {
+    return tls32_.handling_signal_;
   }
 
-  void NoteSignalHandlerDone() {
-    tls32_.handling_signal_ = false;
-  }
-
-  jmp_buf* GetNestedSignalState() {
-    return tlsPtr_.nested_signal_state;
+  void SetHandlingSignal(bool handling_signal) {
+    tls32_.handling_signal_ = handling_signal;
   }
 
   bool IsTransitioningToRunnable() const {
@@ -1460,7 +1451,7 @@
       thread_local_start(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr),
       thread_local_objects(0), mterp_current_ibase(nullptr), mterp_default_ibase(nullptr),
       mterp_alt_ibase(nullptr), thread_local_alloc_stack_top(nullptr),
-      thread_local_alloc_stack_end(nullptr), nested_signal_state(nullptr),
+      thread_local_alloc_stack_end(nullptr),
       flip_function(nullptr), method_verifier(nullptr), thread_local_mark_stack(nullptr) {
       std::fill(held_mutexes, held_mutexes + kLockLevelCount, nullptr);
     }
@@ -1606,9 +1597,6 @@
     // Support for Mutex lock hierarchy bug detection.
     BaseMutex* held_mutexes[kLockLevelCount];
 
-    // Recorded thread state for nested signals.
-    jmp_buf* nested_signal_state;
-
     // The function used for thread flip.
     Closure* flip_function;
 
diff --git a/runtime/type_lookup_table.h b/runtime/type_lookup_table.h
index 3f6f76f..fd68deb 100644
--- a/runtime/type_lookup_table.h
+++ b/runtime/type_lookup_table.h
@@ -148,7 +148,7 @@
     return mask_;
   }
 
-  // Attempt to set an entry on it's hash' slot. If there is alrady something there, return false.
+  // Attempt to set an entry on its hash's slot. If there is already something there, return false.
   // Otherwise return true.
   bool SetOnInitialPos(const Entry& entry, uint32_t hash);
 
diff --git a/runtime/utils/dex_cache_arrays_layout-inl.h b/runtime/utils/dex_cache_arrays_layout-inl.h
index f9a1405..95904af 100644
--- a/runtime/utils/dex_cache_arrays_layout-inl.h
+++ b/runtime/utils/dex_cache_arrays_layout-inl.h
@@ -51,7 +51,11 @@
     : DexCacheArraysLayout(pointer_size, dex_file->GetHeader(), dex_file->NumCallSiteIds()) {
 }
 
-constexpr size_t DexCacheArraysLayout::Alignment() {
+inline size_t DexCacheArraysLayout::Alignment() const {
+  return Alignment(pointer_size_);
+}
+
+inline constexpr size_t DexCacheArraysLayout::Alignment(PointerSize pointer_size) {
   // mirror::Type/String/MethodTypeDexCacheType alignment is 8,
   // i.e. higher than or equal to the pointer alignment.
   static_assert(alignof(mirror::TypeDexCacheType) == 8,
@@ -60,8 +64,8 @@
                 "Expecting alignof(StringDexCacheType) == 8");
   static_assert(alignof(mirror::MethodTypeDexCacheType) == 8,
                 "Expecting alignof(MethodTypeDexCacheType) == 8");
-  // This is the same as alignof(MethodTypeDexCacheType).
-  return alignof(mirror::StringDexCacheType);
+  // This is the same as alignof(FieldDexCacheType) for the given pointer size.
+  return 2u * static_cast<size_t>(pointer_size);
 }
 
 template <typename T>
@@ -100,8 +104,8 @@
 }
 
 inline size_t DexCacheArraysLayout::StringOffset(uint32_t string_idx) const {
-  return strings_offset_ + ElementOffset(PointerSize::k64,
-                                         string_idx % mirror::DexCache::kDexCacheStringCacheSize);
+  uint32_t string_hash = string_idx % mirror::DexCache::kDexCacheStringCacheSize;
+  return strings_offset_ + ElementOffset(PointerSize::k64, string_hash);
 }
 
 inline size_t DexCacheArraysLayout::StringsSize(size_t num_elements) const {
@@ -119,15 +123,20 @@
 }
 
 inline size_t DexCacheArraysLayout::FieldOffset(uint32_t field_idx) const {
-  return fields_offset_ + ElementOffset(pointer_size_, field_idx);
+  uint32_t field_hash = field_idx % mirror::DexCache::kDexCacheFieldCacheSize;
+  return fields_offset_ + 2u * static_cast<size_t>(pointer_size_) * field_hash;
 }
 
 inline size_t DexCacheArraysLayout::FieldsSize(size_t num_elements) const {
-  return ArraySize(pointer_size_, num_elements);
+  size_t cache_size = mirror::DexCache::kDexCacheFieldCacheSize;
+  if (num_elements < cache_size) {
+    cache_size = num_elements;
+  }
+  return 2u * static_cast<size_t>(pointer_size_) * cache_size;
 }
 
 inline size_t DexCacheArraysLayout::FieldsAlignment() const {
-  return static_cast<size_t>(pointer_size_);
+  return 2u * static_cast<size_t>(pointer_size_);
 }
 
 inline size_t DexCacheArraysLayout::MethodTypesSize(size_t num_elements) const {
diff --git a/runtime/utils/dex_cache_arrays_layout.h b/runtime/utils/dex_cache_arrays_layout.h
index ed677ed..377a374 100644
--- a/runtime/utils/dex_cache_arrays_layout.h
+++ b/runtime/utils/dex_cache_arrays_layout.h
@@ -57,7 +57,9 @@
     return size_;
   }
 
-  static constexpr size_t Alignment();
+  size_t Alignment() const;
+
+  static constexpr size_t Alignment(PointerSize pointer_size);
 
   size_t TypesOffset() const {
     return types_offset_;
@@ -125,8 +127,6 @@
   const size_t call_sites_offset_;
   const size_t size_;
 
-  static size_t Alignment(PointerSize pointer_size);
-
   static size_t ElementOffset(PointerSize element_size, uint32_t idx);
 
   static size_t ArraySize(PointerSize element_size, uint32_t num_elements);
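
With the hash-based field cache, each slot stores an <index, field> pair, so a slot is two pointers wide, the array alignment becomes 2 * pointer size, and a field index is reduced modulo the cache size before being scaled. A small worked example of that arithmetic (the cache size and base offset below are made-up illustration values, not ART's constants):

public class FieldSlotLayout {
  // fieldsOffset: start of the fields array; cacheSize: number of hash slots;
  // pointerSize: size of a pointer in bytes. Each slot is a two-pointer pair.
  static long fieldOffset(long fieldsOffset, int fieldIdx, int cacheSize, int pointerSize) {
    int hash = fieldIdx % cacheSize;                // slot within the fixed-size cache
    return fieldsOffset + 2L * pointerSize * hash;  // pair-sized slots
  }

  public static void main(String[] args) {
    // Example: 64-bit pointers, a 1024-slot cache, fields array starting at offset 4096.
    System.out.println(fieldOffset(4096, 5, 1024, 8));     // 4176 = 4096 + 16 * 5
    System.out.println(fieldOffset(4096, 1029, 1024, 8));  // 4176: index 1029 shares slot 5
  }
}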
diff --git a/runtime/vdex_file.h b/runtime/vdex_file.h
index 7daf2f8..5048bad 100644
--- a/runtime/vdex_file.h
+++ b/runtime/vdex_file.h
@@ -61,7 +61,7 @@
 
    private:
     static constexpr uint8_t kVdexMagic[] = { 'v', 'd', 'e', 'x' };
-    static constexpr uint8_t kVdexVersion[] = { '0', '0', '3', '\0' };  // Remove verify-profile
+    static constexpr uint8_t kVdexVersion[] = { '0', '0', '4', '\0' };  // dexlayout incompatibility
 
     uint8_t magic_[4];
     uint8_t version_[4];
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index 14a40ef..6632503 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -747,6 +747,69 @@
     return 1.0f;
   }
 
+  /// CHECK-START: TestClass2 Main.testStoreStore() load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+
+  /// CHECK-START: TestClass2 Main.testStoreStore() load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldSet
+
+  private static TestClass2 testStoreStore() {
+    TestClass2 obj = new TestClass2();
+    obj.i = 41;
+    obj.j = 42;
+    obj.i = 41;
+    obj.j = 43;
+    return obj;
+  }
+
+  /// CHECK-START: int Main.testStoreStoreWithDeoptimize(int[]) load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: Deoptimize
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK: ArrayGet
+  /// CHECK: ArrayGet
+  /// CHECK: ArrayGet
+  /// CHECK: ArrayGet
+
+  /// CHECK-START: int Main.testStoreStoreWithDeoptimize(int[]) load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK: Deoptimize
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK: ArraySet
+  /// CHECK-NOT: ArrayGet
+
+  private static int testStoreStoreWithDeoptimize(int[] arr) {
+    TestClass2 obj = new TestClass2();
+    obj.i = 41;
+    obj.j = 42;
+    obj.i = 41;
+    obj.j = 43;
+    arr[0] = 1;  // One HDeoptimize here.
+    arr[1] = 1;
+    arr[2] = 1;
+    arr[3] = 1;
+    return arr[0] + arr[1] + arr[2] + arr[3];
+  }
+
   /// CHECK-START: double Main.getCircleArea(double, boolean) load_store_elimination (before)
   /// CHECK: NewInstance
 
@@ -950,6 +1013,10 @@
     assertIntEquals(testAllocationEliminationOfArray2(), 11);
     assertIntEquals(testAllocationEliminationOfArray3(2), 4);
     assertIntEquals(testAllocationEliminationOfArray4(2), 6);
+
+    assertIntEquals(testStoreStore().i, 41);
+    assertIntEquals(testStoreStore().j, 43);
+    assertIntEquals(testStoreStoreWithDeoptimize(new int[4]), 4);
   }
 
   static boolean sFlag;
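
The new testStoreStore/testStoreStoreWithDeoptimize checks assert that when nothing reads the intermediate values, only one write per field needs to survive load-store elimination. A plain-Java illustration of the before/after shapes (TestClass2 is replaced by a local stand-in; this mirrors the intent of the checker expectations, not the compiler's actual output):

public class StoreStoreDemo {
  static class Point { int i; int j; }

  static Point before() {
    Point p = new Point();
    p.i = 41;
    p.j = 42;   // dead: overwritten below with no intervening read
    p.i = 41;   // redundant: stores the value already present
    p.j = 43;
    return p;
  }

  static Point after() {  // what the optimization is expected to reduce it to
    Point p = new Point();
    p.i = 41;
    p.j = 43;
    return p;
  }

  public static void main(String[] args) {
    System.out.println(before().i + " " + before().j);  // 41 43
    System.out.println(after().i + " " + after().j);    // 41 43
  }
}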
diff --git a/test/538-checker-embed-constants/src/Main.java b/test/538-checker-embed-constants/src/Main.java
index 4f34ec9..94aad9d 100644
--- a/test/538-checker-embed-constants/src/Main.java
+++ b/test/538-checker-embed-constants/src/Main.java
@@ -37,13 +37,20 @@
   }
 
   /// CHECK-START-ARM: int Main.and511(int) disassembly (after)
-  /// CHECK:                mov {{r\d+}}, #511
-  /// CHECK:                and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK:                ubfx {{r\d+}}, {{r\d+}}, #0, #9
 
   public static int and511(int arg) {
     return arg & 511;
   }
 
+  /// CHECK-START-ARM: int Main.andF00D(int) disassembly (after)
+  /// CHECK:                mov {{r\d+}}, #61453
+  /// CHECK:                and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+
+  public static int andF00D(int arg) {
+    return arg & 0xF00D;
+  }
+
   /// CHECK-START-ARM: int Main.andNot15(int) disassembly (after)
   /// CHECK-NOT:            mvn {{r\d+}}, #15
   /// CHECK:                bic {{r\d+}}, {{r\d+}}, #0xf
@@ -114,19 +121,31 @@
   }
 
   /// CHECK-START-ARM: long Main.and511(long) disassembly (after)
-  /// CHECK:                mov {{r\d+}}, #511
+  /// CHECK:                ubfx {{r\d+}}, {{r\d+}}, #0, #9
   /// CHECK-NEXT:           mov{{s?}} {{r\d+}}, #0
   /// CHECK-NOT:            and{{(\.w)?}}
   /// CHECK-NOT:            bic{{(\.w)?}}
-  /// CHECK:                and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
-  /// CHECK-NEXT:           and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
-  /// CHECK-NOT:            and{{(\.w)?}}
-  /// CHECK-NOT:            bic{{(\.w)?}}
 
   public static long and511(long arg) {
     return arg & 511L;
   }
 
+  /// CHECK-START-ARM: long Main.andF00D(long) disassembly (after)
+  /// CHECK:                mov {{r\d+}}, #61453
+  /// CHECK-NEXT:           mov{{s?}} {{r\d+}}, #0
+  /// CHECK-NOT:            and{{(\.w)?}}
+  /// CHECK-NOT:            bic{{(\.w)?}}
+  /// CHECK-NOT:            ubfx
+  /// CHECK:                and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NEXT:           and{{(\.w)?}} {{r\d+}}, {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:            and{{(\.w)?}}
+  /// CHECK-NOT:            bic{{(\.w)?}}
+  /// CHECK-NOT:            ubfx
+
+  public static long andF00D(long arg) {
+    return arg & 0xF00DL;
+  }
+
   /// CHECK-START-ARM: long Main.andNot15(long) disassembly (after)
   /// CHECK-NOT:            mvn {{r\d+}}, #15
   /// CHECK-NOT:            and{{(\.w)?}}
@@ -631,6 +650,7 @@
     int arg = 0x87654321;
     assertIntEquals(and255(arg), 0x21);
     assertIntEquals(and511(arg), 0x121);
+    assertIntEquals(andF00D(arg), 0x4001);
     assertIntEquals(andNot15(arg), 0x87654320);
     assertIntEquals(or255(arg), 0x876543ff);
     assertIntEquals(or511(arg), 0x876543ff);
@@ -642,6 +662,7 @@
     long longArg = 0x1234567887654321L;
     assertLongEquals(and255(longArg), 0x21L);
     assertLongEquals(and511(longArg), 0x121L);
+    assertLongEquals(andF00D(longArg), 0x4001L);
     assertLongEquals(andNot15(longArg), 0x1234567887654320L);
     assertLongEquals(and0xfffffff00000000f(longArg), 0x1234567000000001L);
     assertLongEquals(or255(longArg), 0x12345678876543ffL);
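
The new andF00D cases pin down when the ARM backend may use ubfx: 511 (0x1FF) is a contiguous low-bit mask, while 0xF00D is not, so the latter still needs the mov/movt-plus-and sequence. A quick sanity check of the constants asserted in main():

public class MaskCheck {
  public static void main(String[] args) {
    int arg = 0x87654321;
    System.out.println(Integer.toHexString(arg & 511));      // 121  (contiguous 9-bit mask)
    System.out.println(Integer.toHexString(arg & 0xF00D));   // 4001 (non-contiguous mask)
    long longArg = 0x1234567887654321L;
    System.out.println(Long.toHexString(longArg & 0xF00DL)); // 4001
  }
}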
diff --git a/test/570-checker-select/src/Main.java b/test/570-checker-select/src/Main.java
index e0a76ca..3ac6f89 100644
--- a/test/570-checker-select/src/Main.java
+++ b/test/570-checker-select/src/Main.java
@@ -371,6 +371,49 @@
     return a > b ? x : y;
   }
 
+  /// CHECK-START-ARM: long Main.$noinline$LongEqNonmatCond_LongVarVar(long, long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            cmp {{r\d+}}, {{r\d+}}
+  /// CHECK-NEXT:            it eq
+  /// CHECK-NEXT:            cmpeq {{r\d+}}, {{r\d+}}
+  /// CHECK-NEXT:            it eq
+
+  public static long $noinline$LongEqNonmatCond_LongVarVar(long a, long b, long x, long y) {
+    return a == b ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            mov ip, #52720
+  /// CHECK-NEXT:            movt ip, #35243
+  /// CHECK-NEXT:            cmp {{r\d+}}, ip
+  /// CHECK-NEXT:            sbcs ip, {{r\d+}}, #{{\d+}}
+  /// CHECK-NEXT:            it ge
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar(long a, long x, long y) {
+    return a > 0x89ABCDEFL ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar2(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            mov ip, #{{\d+}}
+  /// CHECK-NEXT:            movt ip, #{{\d+}}
+  /// CHECK-NEXT:            cmp {{r\d+}}, ip
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar2(long a, long x, long y) {
+    return a > 0x0123456789ABCDEFL ? x : y;
+  }
+
+  /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar3(long, long, long) disassembly (after)
+  /// CHECK:               Select
+  /// CHECK-NEXT:            cmp {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:             sbcs
+  /// CHECK-NOT:             cmp
+
+  public static long $noinline$LongNonmatCondCst_LongVarVar3(long a, long x, long y) {
+    return a > 0x7FFFFFFFFFFFFFFFL ? x : y;
+  }
+
   /// CHECK-START: long Main.LongMatCond_LongVarVar(long, long, long, long) register (after)
   /// CHECK:            <<Cond:z\d+>> LessThanOrEqual [{{j\d+}},{{j\d+}}]
   /// CHECK:            <<Sel1:j\d+>> Select [{{j\d+}},{{j\d+}},<<Cond>>]
@@ -612,6 +655,39 @@
     assertEqual(5, IntMatCond_IntVarVar(3, 2, 5, 7));
     assertEqual(8, IntMatCond_IntVarVar(2, 3, 5, 7));
 
+    assertEqual(0xAAAAAAAA55555555L,
+                LongNonmatCond_LongVarVar(3L, 2L, 0xAAAAAAAA55555555L, 0x8888888877777777L));
+    assertEqual(0x8888888877777777L,
+                LongNonmatCond_LongVarVar(2L, 2L, 0xAAAAAAAA55555555L, 0x8888888877777777L));
+    assertEqual(0x8888888877777777L,
+                LongNonmatCond_LongVarVar(2L, 3L, 0xAAAAAAAA55555555L, 0x8888888877777777L));
+    assertEqual(0xAAAAAAAA55555555L, LongNonmatCond_LongVarVar(0x0000000100000000L,
+                                                               0x00000000FFFFFFFFL,
+                                                               0xAAAAAAAA55555555L,
+                                                               0x8888888877777777L));
+    assertEqual(0x8888888877777777L, LongNonmatCond_LongVarVar(0x00000000FFFFFFFFL,
+                                                               0x0000000100000000L,
+                                                               0xAAAAAAAA55555555L,
+                                                               0x8888888877777777L));
+
+    assertEqual(0x8888888877777777L, $noinline$LongEqNonmatCond_LongVarVar(2L,
+                                                                           3L,
+                                                                           0xAAAAAAAA55555555L,
+                                                                           0x8888888877777777L));
+    assertEqual(0xAAAAAAAA55555555L, $noinline$LongEqNonmatCond_LongVarVar(2L,
+                                                                           2L,
+                                                                           0xAAAAAAAA55555555L,
+                                                                           0x8888888877777777L));
+    assertEqual(0x8888888877777777L, $noinline$LongEqNonmatCond_LongVarVar(0x10000000000L,
+                                                                           0L,
+                                                                           0xAAAAAAAA55555555L,
+                                                                           0x8888888877777777L));
+
+    assertEqual(5L, $noinline$LongNonmatCondCst_LongVarVar2(0x7FFFFFFFFFFFFFFFL, 5L, 7L));
+    assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar2(2L, 5L, 7L));
+
+    assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar3(2L, 5L, 7L));
+
     assertEqual(5, FloatLtNonmatCond_IntVarVar(3, 2, 5, 7));
     assertEqual(7, FloatLtNonmatCond_IntVarVar(2, 3, 5, 7));
     assertEqual(7, FloatLtNonmatCond_IntVarVar(Float.NaN, 2, 5, 7));
diff --git a/test/577-profile-foreign-dex/info.txt b/test/577-profile-foreign-dex/info.txt
deleted file mode 100644
index 090db3f..0000000
--- a/test/577-profile-foreign-dex/info.txt
+++ /dev/null
@@ -1 +0,0 @@
-Check that we record the use of foreign dex files when profiles are enabled.
diff --git a/test/577-profile-foreign-dex/src/Main.java b/test/577-profile-foreign-dex/src/Main.java
deleted file mode 100644
index ed7a625..0000000
--- a/test/577-profile-foreign-dex/src/Main.java
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Constructor;
-import java.util.HashMap;
-
-public class Main {
-
-  private static final String PROFILE_NAME = "primary.prof";
-  private static final String APP_DIR_PREFIX = "app_dir_";
-  private static final String FOREIGN_DEX_PROFILE_DIR = "foreign-dex";
-  private static final String TEMP_FILE_NAME_PREFIX = "dummy";
-  private static final String TEMP_FILE_NAME_SUFFIX = "-file";
-
-  public static void main(String[] args) throws Exception {
-    File tmpFile = null;
-    File appDir = null;
-    File profileFile = null;
-    File foreignDexProfileDir = null;
-
-    try {
-      // Create the necessary files layout.
-      tmpFile = createTempFile();
-      appDir = new File(tmpFile.getParent(), APP_DIR_PREFIX + tmpFile.getName());
-      appDir.mkdir();
-      foreignDexProfileDir = new File(tmpFile.getParent(), FOREIGN_DEX_PROFILE_DIR);
-      foreignDexProfileDir.mkdir();
-      profileFile = createTempFile();
-
-      String codePath = System.getenv("DEX_LOCATION") + "/577-profile-foreign-dex.jar";
-
-      // Register the app with the runtime
-      VMRuntime.registerAppInfo(profileFile.getPath(), appDir.getPath(),
-             new String[] { codePath }, foreignDexProfileDir.getPath());
-
-      testMarkerForForeignDex(foreignDexProfileDir);
-      testMarkerForCodePath(foreignDexProfileDir);
-      testMarkerForApplicationDexFile(foreignDexProfileDir, appDir);
-    } finally {
-      if (tmpFile != null) {
-        tmpFile.delete();
-      }
-      if (profileFile != null) {
-        profileFile.delete();
-      }
-      if (foreignDexProfileDir != null) {
-        foreignDexProfileDir.delete();
-      }
-      if (appDir != null) {
-        appDir.delete();
-      }
-    }
-  }
-
-  // Verify we actually create a marker on disk for foreign dex files.
-  private static void testMarkerForForeignDex(File foreignDexProfileDir) throws Exception {
-    String foreignDex = System.getenv("DEX_LOCATION") + "/577-profile-foreign-dex-ex.jar";
-    loadDexFile(foreignDex);
-    checkMarker(foreignDexProfileDir, foreignDex, /* exists */ true);
-  }
-
-  // Verify we do not create a marker on disk for dex files path of the code path.
-  private static void testMarkerForCodePath(File foreignDexProfileDir) throws Exception {
-    String codePath = System.getenv("DEX_LOCATION") + "/577-profile-foreign-dex.jar";
-    loadDexFile(codePath);
-    checkMarker(foreignDexProfileDir, codePath, /* exists */ false);
-  }
-
-  private static void testMarkerForApplicationDexFile(File foreignDexProfileDir, File appDir)
-      throws Exception {
-    // Copy the -ex jar to the application directory and load it from there.
-    // This will record duplicate class conflicts but we don't care for this use case.
-    File foreignDex = new File(System.getenv("DEX_LOCATION") + "/577-profile-foreign-dex-ex.jar");
-    File appDex = new File(appDir, "appDex.jar");
-    try {
-      copyFile(foreignDex, appDex);
-
-      loadDexFile(appDex.getAbsolutePath());
-      checkMarker(foreignDexProfileDir, appDex.getAbsolutePath(), /* exists */ false);
-    } finally {
-      if (appDex != null) {
-        appDex.delete();
-      }
-    }
-  }
-
-  private static void checkMarker(File foreignDexProfileDir, String dexFile, boolean exists) {
-    File marker = new File(foreignDexProfileDir, dexFile.replace('/', '@'));
-    boolean result_ok = exists ? marker.exists() : !marker.exists();
-    if (!result_ok) {
-      throw new RuntimeException("Marker test failed for:" + marker.getPath());
-    }
-  }
-
-  private static void loadDexFile(String dexFile) throws Exception {
-    Class<?> pathClassLoader = Class.forName("dalvik.system.PathClassLoader");
-    if (pathClassLoader == null) {
-        throw new RuntimeException("Couldn't find path class loader class");
-    }
-    Constructor<?> constructor =
-        pathClassLoader.getDeclaredConstructor(String.class, ClassLoader.class);
-    constructor.newInstance(
-            dexFile, ClassLoader.getSystemClassLoader());
-  }
-
-  private static class VMRuntime {
-    private static final Method registerAppInfoMethod;
-    static {
-      try {
-        Class<?> c = Class.forName("dalvik.system.VMRuntime");
-        registerAppInfoMethod = c.getDeclaredMethod("registerAppInfo",
-            String.class, String.class, String[].class, String.class);
-      } catch (Exception e) {
-        throw new RuntimeException(e);
-      }
-    }
-
-    public static void registerAppInfo(String pkgName, String appDir,
-        String[] codePath, String foreignDexProfileDir) throws Exception {
-      registerAppInfoMethod.invoke(null, pkgName, appDir, codePath, foreignDexProfileDir);
-    }
-  }
-
-  private static void copyFile(File fromFile, File toFile) throws Exception {
-    FileInputStream in = new FileInputStream(fromFile);
-    FileOutputStream out = new FileOutputStream(toFile);
-    try {
-      byte[] buffer = new byte[4096];
-      int bytesRead;
-      while ((bytesRead = in.read(buffer)) >= 0) {
-          out.write(buffer, 0, bytesRead);
-      }
-    } finally {
-      out.flush();
-      try {
-          out.getFD().sync();
-      } catch (IOException e) {
-      }
-      out.close();
-      in.close();
-    }
-  }
-
-  private static File createTempFile() throws Exception {
-    try {
-      return File.createTempFile(TEMP_FILE_NAME_PREFIX, TEMP_FILE_NAME_SUFFIX);
-    } catch (IOException e) {
-      System.setProperty("java.io.tmpdir", "/data/local/tmp");
-      try {
-        return File.createTempFile(TEMP_FILE_NAME_PREFIX, TEMP_FILE_NAME_SUFFIX);
-      } catch (IOException e2) {
-        System.setProperty("java.io.tmpdir", "/sdcard");
-        return File.createTempFile(TEMP_FILE_NAME_PREFIX, TEMP_FILE_NAME_SUFFIX);
-      }
-    }
-  }
-}
diff --git a/test/595-profile-saving/src/Main.java b/test/595-profile-saving/src/Main.java
index 039503f..faf94c4 100644
--- a/test/595-profile-saving/src/Main.java
+++ b/test/595-profile-saving/src/Main.java
@@ -29,9 +29,7 @@
       // String codePath = getDexBaseLocation();
       String codePath = System.getenv("DEX_LOCATION") + "/595-profile-saving.jar";
       VMRuntime.registerAppInfo(file.getPath(),
-                                System.getenv("DEX_LOCATION"),
-                                new String[] {codePath},
-                                /* foreignProfileDir */ null);
+                                new String[] {codePath});
 
       int methodIdx = $opt$noinline$testProfile();
       ensureProfileProcessing();
@@ -85,15 +83,15 @@
       try {
         Class<? extends Object> c = Class.forName("dalvik.system.VMRuntime");
         registerAppInfoMethod = c.getDeclaredMethod("registerAppInfo",
-            String.class, String.class, String[].class, String.class);
+            String.class, String[].class);
       } catch (Exception e) {
         throw new RuntimeException(e);
       }
     }
 
-    public static void registerAppInfo(String profile, String appDir,
-                                       String[] codePaths, String foreignDir) throws Exception {
-      registerAppInfoMethod.invoke(null, profile, appDir, codePaths, foreignDir);
+    public static void registerAppInfo(String profile, String[] codePaths)
+        throws Exception {
+      registerAppInfoMethod.invoke(null, profile, codePaths);
     }
   }
 }
diff --git a/test/618-checker-induction/src/Main.java b/test/618-checker-induction/src/Main.java
index ad3ff44..2d9daf1 100644
--- a/test/618-checker-induction/src/Main.java
+++ b/test/618-checker-induction/src/Main.java
@@ -21,6 +21,8 @@
 
   static int[] a = new int[10];
 
+  static int[] novec = new int[20];  // to prevent vectorization
+
   /// CHECK-START: void Main.deadSingleLoop() loop_optimization (before)
   /// CHECK-DAG: Phi loop:{{B\d+}} outer_loop:none
   //
@@ -132,16 +134,18 @@
   /// CHECK-START: void Main.deadInduction() loop_optimization (before)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.deadInduction() loop_optimization (after)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-NOT: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   static void deadInduction() {
     int dead = 0;
     for (int i = 0; i < a.length; i++) {
-      a[i] = 1;
+      a[i] = novec[2 * i] + 1;
       dead += 5;
     }
   }
@@ -151,17 +155,19 @@
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.deadManyInduction() loop_optimization (after)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-NOT: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   static void deadManyInduction() {
     int dead1 = 0, dead2 = 1, dead3 = 3;
     for (int i = 0; i < a.length; i++) {
       dead1 += 5;
-      a[i] = 2;
+      a[i] = novec[2 * i] + 2;
       dead2 += 10;
       dead3 += 100;
     }
@@ -170,16 +176,18 @@
   /// CHECK-START: void Main.deadSequence() loop_optimization (before)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.deadSequence() loop_optimization (after)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-NOT: Phi      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   static void deadSequence() {
     int dead = 0;
     for (int i = 0; i < a.length; i++) {
-      a[i] = 3;
+      a[i] = novec[2 * i] + 3;
       // Increment value defined inside loop,
       // but sequence itself not used anywhere.
       dead += i;
@@ -191,17 +199,19 @@
   /// CHECK-DAG: Phi      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-NOT: BoundsCheck
   //
   /// CHECK-START: void Main.deadCycleWithException(int) loop_optimization (after)
   /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-NOT: Phi      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-NOT: ArrayGet loop:<<Loop>>      outer_loop:none
   static void deadCycleWithException(int k) {
     int dead = 0;
     for (int i = 0; i < a.length; i++) {
-      a[i] = 4;
+      a[i] = novec[2 * i] + 4;
       // Increment value of dead cycle may throw exception. Dynamic
       // BCE takes care of the bounds check though, which enables
       // removing the ArrayGet after removing the dead cycle.
diff --git a/test/623-checker-loop-regressions/src/Main.java b/test/623-checker-loop-regressions/src/Main.java
index 7509d9b..eee90ab 100644
--- a/test/623-checker-loop-regressions/src/Main.java
+++ b/test/623-checker-loop-regressions/src/Main.java
@@ -213,6 +213,8 @@
   /// CHECK-START: long Main.geoLongDivLastValue(long) instruction_simplifier$after_bce (after)
   /// CHECK-DAG: <<Long:j\d+>> LongConstant 0    loop:none
   /// CHECK-DAG:               Return [<<Long>>] loop:none
+  //
+  // Tests overflow in the divisor (while updating intermediate result).
   static long geoLongDivLastValue(long x) {
     for (int i = 0; i < 10; i++) {
       x /= 1081788608;
@@ -220,6 +222,26 @@
     return x;
   }
 
+  /// CHECK-START: long Main.geoLongDivLastValue() loop_optimization (before)
+  /// CHECK-DAG: Phi loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: Phi loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START: long Main.geoLongDivLastValue() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  //
+  /// CHECK-START: long Main.geoLongDivLastValue() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Long:j\d+>> LongConstant 0    loop:none
+  /// CHECK-DAG:               Return [<<Long>>] loop:none
+  //
+  // Tests overflow in the divisor (while updating base).
+  static long geoLongDivLastValue() {
+    long x = -1;
+    for (int i2 = 0; i2 < 2; i2++) {
+      x /= (Long.MAX_VALUE);
+    }
+    return x;
+  }
+
   /// CHECK-START: long Main.geoLongMulLastValue(long) loop_optimization (before)
   /// CHECK-DAG: Phi loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: Phi loop:<<Loop>>      outer_loop:none
@@ -239,6 +261,15 @@
     return x;
   }
 
+  // If vectorized, the narrowing subscript should not cause
+  // type inconsistencies in the synthesized code.
+  static void narrowingSubscript(float[] a) {
+    float val = 2.0f;
+    for (long i = 0; i < a.length; i++) {
+      a[(int) i] += val;
+    }
+  }
+
   public static void main(String[] args) {
     expectEquals(10, earlyExitFirst(-1));
     for (int i = 0; i <= 10; i++) {
@@ -286,6 +317,8 @@
     expectEquals(0L, geoLongDivLastValue(9223372036854775807L));
     expectEquals(0L, geoLongDivLastValue(-9223372036854775808L));
 
+    expectEquals(0L, geoLongDivLastValue());
+
     expectEquals(                   0L, geoLongMulLastValue(0L));
     expectEquals(-8070450532247928832L, geoLongMulLastValue(1L));
     expectEquals( 2305843009213693952L, geoLongMulLastValue(2L));
@@ -296,6 +329,12 @@
     expectEquals( 8070450532247928832L, geoLongMulLastValue(9223372036854775807L));
     expectEquals(                   0L, geoLongMulLastValue(-9223372036854775808L));
 
+    float[] a = new float[16];
+    narrowingSubscript(a);
+    for (int i = 0; i < 16; i++) {
+      expectEquals(2.0f, a[i]);
+    }
+
     System.out.println("passed");
   }
 
@@ -310,4 +349,10 @@
       throw new Error("Expected: " + expected + ", found: " + result);
     }
   }
+
+  private static void expectEquals(float expected, float result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
 }
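
The two additions rest on scalar facts the checker expectations encode: dividing -1 by Long.MAX_VALUE reaches 0 on the first step (so the loop's last value is the constant 0), and the narrowing (int) cast of a long induction variable still visits every element exactly once. A direct check of both, independent of the compiler:

public class LoopRegressionCheck {
  public static void main(String[] args) {
    long x = -1;
    for (int i = 0; i < 2; i++) {
      x /= Long.MAX_VALUE;     // -1 / MAX_VALUE == 0, then 0 / MAX_VALUE == 0
    }
    System.out.println(x);     // 0

    float[] a = new float[16];
    for (long i = 0; i < a.length; i++) {
      a[(int) i] += 2.0f;      // narrowing subscript, one store per element
    }
    System.out.println(a[0] + " " + a[15]);  // 2.0 2.0
  }
}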
diff --git a/test/626-checker-arm64-scratch-register/src/Main.java b/test/626-checker-arm64-scratch-register/src/Main.java
index 6dd4374..1394917 100644
--- a/test/626-checker-arm64-scratch-register/src/Main.java
+++ b/test/626-checker-arm64-scratch-register/src/Main.java
@@ -70,7 +70,7 @@
   /// CHECK:  end_block
   /// CHECK: begin_block
   /// CHECK:   name "<<ElseBlock>>"
-  /// CHECK:                      ParallelMove moves:[#100->d17,32(sp)->d1,36(sp)->d2,d17->d3,d3->d4,d4->d5,d5->d6,d6->d7,d7->d18,d18->d19,d19->d20,d20->d21,d21->d22,d22->d23,d23->d10,d10->d11,d11->d12,24(sp)->d13,28(sp)->d14,d14->16(sp),d12->20(sp),d13->24(sp),d1->28(sp),d2->32(sp),16(sp)->36(sp),20(sp)->40(sp)]
+  /// CHECK:                      ParallelMove moves:[40(sp)->d0,24(sp)->32(sp),28(sp)->36(sp),d0->d3,d3->d4,d2->d5,d4->d6,d5->d7,d6->d18,d7->d19,d18->d20,d19->d21,d20->d22,d21->d23,d22->d10,d23->d11,16(sp)->24(sp),20(sp)->28(sp),d10->d14,d11->d12,d12->d13,d13->d1,d14->d2,32(sp)->16(sp),36(sp)->20(sp)]
   /// CHECK: end_block
 
   /// CHECK-START-ARM64: void Main.test() disassembly (after)
@@ -85,7 +85,7 @@
   /// CHECK:  end_block
   /// CHECK: begin_block
   /// CHECK:   name "<<ElseBlock>>"
-  /// CHECK:                      ParallelMove moves:[invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid]
+  /// CHECK:                      ParallelMove moves:[invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid,invalid->invalid]
   /// CHECK:                        fmov d31, d2
   /// CHECK:                        ldr s2, [sp, #36]
   /// CHECK:                        ldr w16, [sp, #16]
@@ -111,11 +111,10 @@
   /// CHECK:                        fmov d6, d5
   /// CHECK:                        fmov d5, d4
   /// CHECK:                        fmov d4, d3
-  /// CHECK:                        fmov d3, d17
-  /// CHECK:                        fmov d17, d13
+  /// CHECK:                        fmov d3, d13
   /// CHECK:                        ldr s13, [sp, #24]
-  /// CHECK:                        str s17, [sp, #24]
-  /// CHECK:                        ldr s17, pc+{{\d+}} (addr {{0x[0-9a-f]+}}) (100)
+  /// CHECK:                        str s3, [sp, #24]
+  /// CHECK:                        ldr s3, pc+{{\d+}} (addr {{0x[0-9a-f]+}}) (100)
   /// CHECK: end_block
 
   public void test() {
diff --git a/test/577-profile-foreign-dex/expected.txt b/test/638-checker-inline-caches/expected.txt
similarity index 100%
copy from test/577-profile-foreign-dex/expected.txt
copy to test/638-checker-inline-caches/expected.txt
diff --git a/test/638-checker-inline-caches/info.txt b/test/638-checker-inline-caches/info.txt
new file mode 100644
index 0000000..1fac628
--- /dev/null
+++ b/test/638-checker-inline-caches/info.txt
@@ -0,0 +1 @@
+Verify the use of inline caches in AOT mode.
diff --git a/test/638-checker-inline-caches/multidex.jpp b/test/638-checker-inline-caches/multidex.jpp
new file mode 100644
index 0000000..69a2cc1
--- /dev/null
+++ b/test/638-checker-inline-caches/multidex.jpp
@@ -0,0 +1,12 @@
+Main:
+  @@com.android.jack.annotations.ForceInMainDex
+  class Main
+Super:
+  @@com.android.jack.annotations.ForceInMainDex
+  class Super
+SubA:
+  @@com.android.jack.annotations.ForceInMainDex
+  class SubA
+SubB:
+  @@com.android.jack.annotations.ForceInMainDex
+  class SubB
diff --git a/test/638-checker-inline-caches/profile b/test/638-checker-inline-caches/profile
new file mode 100644
index 0000000..1ca6d7b
--- /dev/null
+++ b/test/638-checker-inline-caches/profile
@@ -0,0 +1,6 @@
+LMain;->inlineMonomorphicSubA(LSuper;)I+LSubA;
+LMain;->inlinePolymophicSubASubB(LSuper;)I+LSubA;,LSubB;
+LMain;->inlinePolymophicCrossDexSubASubC(LSuper;)I+LSubA;,LSubC;
+LMain;->inlineMegamorphic(LSuper;)I+LSubA;,LSubB;,LSubC;,LSubD;,LSubE;
+LMain;->inlineMissingTypes(LSuper;)I+missing_types
+LMain;->noInlineCache(LSuper;)I
diff --git a/test/577-profile-foreign-dex/run b/test/638-checker-inline-caches/run
similarity index 71%
copy from test/577-profile-foreign-dex/run
copy to test/638-checker-inline-caches/run
index ad57d14..146e180 100644
--- a/test/577-profile-foreign-dex/run
+++ b/test/638-checker-inline-caches/run
@@ -1,12 +1,12 @@
 #!/bin/bash
 #
-# Copyright 2016 The Android Open Source Project
+# Copyright (C) 2017 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,7 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-exec ${RUN} \
-  --runtime-option -Xjitsaveprofilinginfo \
-  --runtime-option -Xusejit:true \
-  "${@}"
+exec ${RUN} $@ --profile -Xcompiler-option --compiler-filter=speed-profile
diff --git a/test/577-profile-foreign-dex/src-ex/OtherDex.java b/test/638-checker-inline-caches/src-multidex/SubC.java
similarity index 80%
copy from test/577-profile-foreign-dex/src-ex/OtherDex.java
copy to test/638-checker-inline-caches/src-multidex/SubC.java
index cba73b3..f7e3c08 100644
--- a/test/577-profile-foreign-dex/src-ex/OtherDex.java
+++ b/test/638-checker-inline-caches/src-multidex/SubC.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 The Android Open Source Project
+ * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,5 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-public class OtherDex {
+
+public class SubC extends Super {
+  public int getValue() { return 24; }
 }
diff --git a/test/638-checker-inline-caches/src/Main.java b/test/638-checker-inline-caches/src/Main.java
new file mode 100644
index 0000000..2cee47e
--- /dev/null
+++ b/test/638-checker-inline-caches/src/Main.java
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class SubA extends Super {
+  int getValue() { return 42; }
+}
+
+class SubB extends Super {
+  int getValue() { return 38; }
+}
+
+class SubD extends Super {
+  int getValue() { return 10; }
+}
+
+class SubE extends Super {
+  int getValue() { return -4; }
+}
+
+public class Main {
+
+  /// CHECK-START: int Main.inlineMonomorphicSubA(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlineMonomorphicSubA(Super) inliner (after)
+  /// CHECK-NOT:   InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlineMonomorphicSubA(Super) inliner (after)
+  /// CHECK:  <<SubARet:i\d+>>      IntConstant 42
+  /// CHECK:  <<ObjClass:l\d+>>     InstanceFieldGet field_name:java.lang.Object.shadow$_klass_
+  /// CHECK:  <<InlineClass:l\d+>>  LoadClass class_name:SubA
+  /// CHECK:  <<Test:z\d+>>         NotEqual [<<InlineClass>>,<<ObjClass>>]
+  /// CHECK:                        Deoptimize [<<Test>>]
+  /// CHECK:                        Return [<<SubARet>>]
+  public static int inlineMonomorphicSubA(Super a) {
+    return a.getValue();
+  }
+
+  /// CHECK-START: int Main.inlinePolymophicSubASubB(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlinePolymophicSubASubB(Super) inliner (after)
+  /// CHECK-NOT:   InvokeVirtual method_name:Super.getValue
+
+  // Note that the order in which the types are added to the inline cache in the profile matters.
+
+  /// CHECK-START: int Main.inlinePolymophicSubASubB(Super) inliner (after)
+  /// CHECK-DAG:  <<SubARet:i\d+>>          IntConstant 42
+  /// CHECK-DAG:  <<SubBRet:i\d+>>          IntConstant 38
+  /// CHECK:      <<ObjClassSubA:l\d+>>     InstanceFieldGet field_name:java.lang.Object.shadow$_klass_
+  /// CHECK:      <<InlineClassSubA:l\d+>>  LoadClass class_name:SubA
+  /// CHECK:      <<TestSubA:z\d+>>         NotEqual [<<InlineClassSubA>>,<<ObjClassSubA>>]
+  /// CHECK:                                If [<<TestSubA>>]
+
+  /// CHECK:      <<ObjClassSubB:l\d+>>     InstanceFieldGet field_name:java.lang.Object.shadow$_klass_
+  /// CHECK:      <<InlineClassSubB:l\d+>>  LoadClass class_name:SubB
+  /// CHECK:      <<TestSubB:z\d+>>         NotEqual [<<InlineClassSubB>>,<<ObjClassSubB>>]
+  /// CHECK:                                Deoptimize [<<TestSubB>>]
+
+  /// CHECK:      <<Ret:i\d+>>              Phi [<<SubARet>>,<<SubBRet>>]
+  /// CHECK:                                Return [<<Ret>>]
+  public static int inlinePolymophicSubASubB(Super a) {
+    return a.getValue();
+  }
+
+  /// CHECK-START: int Main.inlinePolymophicCrossDexSubASubC(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlinePolymophicCrossDexSubASubC(Super) inliner (after)
+  /// CHECK-NOT:   InvokeVirtual method_name:Super.getValue
+
+  // Note that the order in which the types are added to the inline cache in the profile matters.
+
+  /// CHECK-START: int Main.inlinePolymophicCrossDexSubASubC(Super) inliner (after)
+  /// CHECK-DAG:  <<SubARet:i\d+>>          IntConstant 42
+  /// CHECK-DAG:  <<SubCRet:i\d+>>          IntConstant 24
+  /// CHECK:      <<ObjClassSubA:l\d+>>     InstanceFieldGet field_name:java.lang.Object.shadow$_klass_
+  /// CHECK:      <<InlineClassSubA:l\d+>>  LoadClass class_name:SubA
+  /// CHECK:      <<TestSubA:z\d+>>         NotEqual [<<InlineClassSubA>>,<<ObjClassSubA>>]
+  /// CHECK:                                If [<<TestSubA>>]
+
+  /// CHECK:      <<ObjClassSubC:l\d+>>     InstanceFieldGet field_name:java.lang.Object.shadow$_klass_
+  /// CHECK:      <<InlineClassSubC:l\d+>>  LoadClass class_name:SubC
+  /// CHECK:      <<TestSubC:z\d+>>         NotEqual [<<InlineClassSubC>>,<<ObjClassSubC>>]
+  /// CHECK:                                Deoptimize [<<TestSubC>>]
+
+  /// CHECK:      <<Ret:i\d+>>              Phi [<<SubARet>>,<<SubCRet>>]
+  /// CHECK:                                Return [<<Ret>>]
+  public static int inlinePolymophicCrossDexSubASubC(Super a) {
+    return a.getValue();
+  }
+
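+  // The profile lists five types for this method, so its inline cache is
+  // megamorphic and the inliner leaves the virtual call in place.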
+  /// CHECK-START: int Main.inlineMegamorphic(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlineMegamorphic(Super) inliner (after)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+  public static int inlineMegamorphic(Super a) {
+    return a.getValue();
+  }
+
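+  // The profile entry for this method is marked missing_types, so the
+  // inliner leaves the virtual call in place.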
+  /// CHECK-START: int Main.inlineMissingTypes(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.inlineMissingTypes(Super) inliner (after)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+  public static int inlineMissingTypes(Super a) {
+    return a.getValue();
+  }
+
+  /// CHECK-START: int Main.noInlineCache(Super) inliner (before)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+
+  /// CHECK-START: int Main.noInlineCache(Super) inliner (after)
+  /// CHECK:       InvokeVirtual method_name:Super.getValue
+  public static int noInlineCache(Super a) {
+    return a.getValue();
+  }
+
+  public static void testInlineMonomorphic() {
+    if (inlineMonomorphicSubA(new SubA()) != 42) {
+      throw new Error("Expected 42");
+    }
+
+    // Call with a different type than the one from the inline cache.
+    if (inlineMonomorphicSubA(new SubB()) != 38) {
+      throw new Error("Expected 38");
+    }
+  }
+
+  public static void testInlinePolymorhic() {
+    if (inlinePolymophicSubASubB(new SubA()) != 42) {
+      throw new Error("Expected 42");
+    }
+
+    if (inlinePolymophicSubASubB(new SubB()) != 38) {
+      throw new Error("Expected 38");
+    }
+
+    // Call with a different type than the one from the inline cache.
+    if (inlinePolymophicSubASubB(new SubC()) != 24) {
+      throw new Error("Expected 25");
+    }
+
+    if (inlinePolymophicCrossDexSubASubC(new SubA()) != 42) {
+      throw new Error("Expected 42");
+    }
+
+    if (inlinePolymophicCrossDexSubASubC(new SubC()) != 24) {
+      throw new Error("Expected 24");
+    }
+
+    // Call with a different type than the one from the inline cache.
+    if (inlinePolymophicCrossDexSubASubC(new SubB()) != 38) {
+      throw new Error("Expected 38");
+    }
+  }
+
+  public static void testInlineMegamorphic() {
+    if (inlineMegamorphic(new SubA()) != 42) {
+      throw new Error("Expected 42");
+    }
+  }
+
+  public static void testNoInlineCache() {
+    if (noInlineCache(new SubA()) != 42) {
+      throw new Error("Expected 42");
+    }
+  }
+
+  public static void main(String[] args) {
+    testInlineMonomorphic();
+    testInlinePolymorhic();
+    testInlineMegamorphic();
+    testNoInlineCache();
+  }
+
+}
diff --git a/test/577-profile-foreign-dex/src-ex/OtherDex.java b/test/638-checker-inline-caches/src/Super.java
similarity index 82%
rename from test/577-profile-foreign-dex/src-ex/OtherDex.java
rename to test/638-checker-inline-caches/src/Super.java
index cba73b3..30cdf30 100644
--- a/test/577-profile-foreign-dex/src-ex/OtherDex.java
+++ b/test/638-checker-inline-caches/src/Super.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 The Android Open Source Project
+ * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,5 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-public class OtherDex {
+
+public abstract class Super {
+  abstract int getValue();
 }
diff --git a/test/577-profile-foreign-dex/run b/test/641-checker-arraycopy/build
similarity index 70%
copy from test/577-profile-foreign-dex/run
copy to test/641-checker-arraycopy/build
index ad57d14..9abc618 100644
--- a/test/577-profile-foreign-dex/run
+++ b/test/641-checker-arraycopy/build
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright 2016 The Android Open Source Project
+# Copyright 2017 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-exec ${RUN} \
-  --runtime-option -Xjitsaveprofilinginfo \
-  --runtime-option -Xusejit:true \
-  "${@}"
+# make us exit on a failure
+set -e
+
+# Don't use jack for this test, to ensure we don't use
+# the typed System.arraycopy versions directly.
+export USE_JACK=false
+
+./default-build
diff --git a/test/577-profile-foreign-dex/expected.txt b/test/641-checker-arraycopy/expected.txt
similarity index 100%
rename from test/577-profile-foreign-dex/expected.txt
rename to test/641-checker-arraycopy/expected.txt
diff --git a/test/641-checker-arraycopy/info.txt b/test/641-checker-arraycopy/info.txt
new file mode 100644
index 0000000..1a1111e
--- /dev/null
+++ b/test/641-checker-arraycopy/info.txt
@@ -0,0 +1,2 @@
+Checker test for the arraycopy optimization in the
+instruction simplifier.
diff --git a/test/641-checker-arraycopy/src/Main.java b/test/641-checker-arraycopy/src/Main.java
new file mode 100644
index 0000000..f0fcf28
--- /dev/null
+++ b/test/641-checker-arraycopy/src/Main.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  // Note that this tests that we haven't intrinsified the byte[] arraycopy version.
+  // Once we eventually start doing so, we will need to re-adjust this test.
+
+  /// CHECK-START-X86: void Main.typedCopy(java.lang.Object, byte[]) disassembly (after)
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
+  /// CHECK-NOT:    call
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
+  /// CHECK:        call
+  /// CHECK: ReturnVoid
+  public static void typedCopy(Object o, byte[] foo) {
+    System.arraycopy(o, 1, o, 0, 1);
+    System.arraycopy(foo, 1, foo, 0, 1);
+  }
+
+  public static void untypedCopy(Object o, Object foo) {
+    System.arraycopy(o, 1, o, 0, 1);
+    System.arraycopy(foo, 1, foo, 0, 1);
+  }
+
+  // Test that we still do the optimization after inlining.
+
+  /// CHECK-START-X86: void Main.untypedCopyCaller(java.lang.Object, byte[]) disassembly (after)
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
+  /// CHECK-NOT:    call
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
+  /// CHECK:        call
+  /// CHECK: ReturnVoid
+  public static void untypedCopyCaller(Object o, byte[] array) {
+    untypedCopy(o, array);
+  }
+
+  public static void assertEquals(Object one, Object two) {
+    if (one != two) {
+      throw new Error("Expected " + one + ", got " + two);
+    }
+  }
+
+  public static void main(String[] args) {
+    // Simple sanity checks.
+    byte[] a = new byte[2];
+    Object[] o = new Object[2];
+
+    o[0] = a;
+    o[1] = o;
+    a[0] = 1;
+    a[1] = 2;
+
+    untypedCopyCaller(o, a);
+    assertEquals(o[0], o);
+    assertEquals(o[1], o);
+    assertEquals(a[0], (byte)2);
+    assertEquals(a[1], (byte)2);
+
+    o[0] = a;
+    o[1] = o;
+    a[0] = 1;
+    a[1] = 2;
+
+    typedCopy(o, a);
+    assertEquals(o[0], o);
+    assertEquals(o[1], o);
+    assertEquals(a[0], (byte)2);
+    assertEquals(a[1], (byte)2);
+  }
+}
diff --git a/test/641-irreducible-inline/expected.txt b/test/641-irreducible-inline/expected.txt
new file mode 100644
index 0000000..d81cc07
--- /dev/null
+++ b/test/641-irreducible-inline/expected.txt
@@ -0,0 +1 @@
+42
diff --git a/test/641-irreducible-inline/info.txt b/test/641-irreducible-inline/info.txt
new file mode 100644
index 0000000..ec6d0d2
--- /dev/null
+++ b/test/641-irreducible-inline/info.txt
@@ -0,0 +1,2 @@
+Regression test for the optimizing compiler in the presence of
+inlining a method that throws in an irreducible loop.
diff --git a/test/641-irreducible-inline/smali/IrreducibleLoop.smali b/test/641-irreducible-inline/smali/IrreducibleLoop.smali
new file mode 100644
index 0000000..3e6c1f1
--- /dev/null
+++ b/test/641-irreducible-inline/smali/IrreducibleLoop.smali
@@ -0,0 +1,54 @@
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LIrreducibleLoop;
+
+.super Ljava/lang/Object;
+
+.method public static simpleLoop(I)I
+   .registers 3
+   const/16 v0, 42
+   if-eqz p0, :loop_entry
+   goto :other_loop_pre_entry
+
+   # The then part: beginning of the irreducible loop.
+   :loop_entry
+   if-nez p0, :exit
+   invoke-static {v0},LIrreducibleLoop;->foo(I)V
+   :other_loop_entry
+   goto :loop_entry
+
+   # The else part.
+   :other_loop_pre_entry
+   if-eqz p0, :other_loop_entry
+   invoke-static {v0},LIrreducibleLoop;->foo(I)V
+   goto :other_loop_entry
+
+   :exit
+   return v0
+.end method
+
+.method public static foo(I)V
+   .registers 3
+   const/16 v0, 0
+   sget-boolean v1,LIrreducibleLoop;->doThrow:Z
+   if-eqz v1, :exit
+   # Inlining a method that throws requires re-computing loop information
+   # which is unsupported when the caller has an irreducible loop.
+   throw v0
+   :exit
+   return-void
+.end method
+
+.field public static doThrow:Z
diff --git a/test/641-irreducible-inline/src/Main.java b/test/641-irreducible-inline/src/Main.java
new file mode 100644
index 0000000..53244f7
--- /dev/null
+++ b/test/641-irreducible-inline/src/Main.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  public static void main(String[] args) throws Exception {
+    Class<?> c = Class.forName("IrreducibleLoop");
+    Method m = c.getMethod("simpleLoop", int.class);
+    Object[] arguments = { 42 };
+    System.out.println(m.invoke(null, arguments));
+  }
+}
diff --git a/test/641-iterations/expected.txt b/test/641-iterations/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/641-iterations/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/641-iterations/info.txt b/test/641-iterations/info.txt
new file mode 100644
index 0000000..fd80595
--- /dev/null
+++ b/test/641-iterations/info.txt
@@ -0,0 +1 @@
+Tests on varying trip counts (to validate vector/cleanup loops).
diff --git a/test/641-iterations/src/Main.java b/test/641-iterations/src/Main.java
new file mode 100644
index 0000000..6a27f80
--- /dev/null
+++ b/test/641-iterations/src/Main.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests of varying trip counts. Focused on testing the
+ * core loop and the cleanup loop after vectorization.
+ */
+public class Main {
+
+  static int[] sA;
+
+  static void init() {
+    for (int i = 0; i < sA.length; i++)
+      sA[i] = 100;
+  }
+
+  static void doitTo(int n) {
+    for (int i = 0; i < n; i++)
+      sA[i] += 1;
+  }
+
+  static void doitFrom(int n) {
+    for (int i = n; i < sA.length; i++)
+      sA[i] += 1;
+  }
+
+  static void verify(int n) {
+    for (int i = 0; i < n; i++)
+      if (sA[i] != 101)
+        throw new Error("failed inside loop");
+    for (int i = n; i < sA.length; i++)
+      if (sA[i] != 100)
+        throw new Error("failed outside loop");
+  }
+
+  static void verify() {
+    for (int i = 0; i < sA.length; i++)
+      if (sA[i] != 101)
+        throw new Error("failed inside loop");
+  }
+
+  static void driver() {
+    for (int n = 0; n <= sA.length; n++) {
+      init();
+      doitTo(n);
+      verify(n);
+      doitFrom(n);
+      verify();
+    }
+  }
+
+  public static void main(String[] args) {
+    sA = new int[17];
+    driver();
+    sA = new int[32];
+    driver();
+    System.out.println("passed");
+  }
+}
+
diff --git a/test/921-hello-failure/expected.txt b/test/921-hello-failure/expected.txt
index a5dc10d..fdbfbe2 100644
--- a/test/921-hello-failure/expected.txt
+++ b/test/921-hello-failure/expected.txt
@@ -50,3 +50,6 @@
 hello there again - FieldChange
 Transformation error : java.lang.Exception(Failed to redefine class <LTransform4;> due to JVMTI_ERROR_UNSUPPORTED_REDEFINITION_SCHEMA_CHANGED)
 hello there again - FieldChange
+hello - Unmodifiable
+Transformation error : java.lang.Exception(Failed to redefine class <[LTransform;> due to JVMTI_ERROR_UNMODIFIABLE_CLASS)
+hello - Unmodifiable
diff --git a/test/921-hello-failure/src/Main.java b/test/921-hello-failure/src/Main.java
index 5bbe2b5..6779ed8 100644
--- a/test/921-hello-failure/src/Main.java
+++ b/test/921-hello-failure/src/Main.java
@@ -32,6 +32,7 @@
     NewField.doTest(new Transform());
     MissingField.doTest(new Transform4("there"));
     FieldChange.doTest(new Transform4("there again"));
+    Unmodifiable.doTest(new Transform[] { new Transform(), });
   }
 
   // Transforms the class. This throws an exception if something goes wrong.
diff --git a/test/921-hello-failure/src/Unmodifiable.java b/test/921-hello-failure/src/Unmodifiable.java
new file mode 100644
index 0000000..ad05f51
--- /dev/null
+++ b/test/921-hello-failure/src/Unmodifiable.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Base64;
+
+class Unmodifiable {
+  // The following is a base64 encoding of a valid class file.
+  private static final byte[] CLASS_BYTES = Base64.getDecoder().decode(
+    "yv66vgAAADQAFQoABgAPBwAQCAARCgACABIHABMHABQBAAY8aW5pdD4BAAMoKVYBAARDb2RlAQAP" +
+    "TGluZU51bWJlclRhYmxlAQAFc2F5SGkBABUoTGphdmEvbGFuZy9TdHJpbmc7KVYBAApTb3VyY2VG" +
+    "aWxlAQAOVHJhbnNmb3JtLmphdmEMAAcACAEAD2phdmEvbGFuZy9FcnJvcgEAFVNob3VsZCBub3Qg" +
+    "YmUgY2FsbGVkIQwABwAMAQAJVHJhbnNmb3JtAQAQamF2YS9sYW5nL09iamVjdAAgAAUABgAAAAAA" +
+    "AgAAAAcACAABAAkAAAAdAAEAAQAAAAUqtwABsQAAAAEACgAAAAYAAQAAAAIAAAALAAwAAQAJAAAA" +
+    "IgADAAIAAAAKuwACWRIDtwAEvwAAAAEACgAAAAYAAQAAAAQAAQANAAAAAgAO");
+  private static final byte[] DEX_BYTES = Base64.getDecoder().decode(
+    "ZGV4CjAzNQCrV81cy4Q+YKMMMqc0bZEO5Y1X5u7irPeQAgAAcAAAAHhWNBIAAAAAAAAAAPwBAAAL" +
+    "AAAAcAAAAAUAAACcAAAAAgAAALAAAAAAAAAAAAAAAAQAAADIAAAAAQAAAOgAAACIAQAACAEAAEoB" +
+    "AABSAQAAXwEAAHIBAACGAQAAmgEAALEBAADBAQAAxAEAAMgBAADcAQAAAQAAAAIAAAADAAAABAAA" +
+    "AAcAAAAHAAAABAAAAAAAAAAIAAAABAAAAEQBAAAAAAAAAAAAAAAAAQAKAAAAAQABAAAAAAACAAAA" +
+    "AAAAAAAAAAAAAAAAAgAAAAAAAAAGAAAAAAAAAO4BAAAAAAAAAQABAAEAAADjAQAABAAAAHAQAwAA" +
+    "AA4ABAACAAIAAADoAQAACQAAACIAAQAbAQUAAABwIAIAEAAnAAAAAQAAAAMABjxpbml0PgALTFRy" +
+    "YW5zZm9ybTsAEUxqYXZhL2xhbmcvRXJyb3I7ABJMamF2YS9sYW5nL09iamVjdDsAEkxqYXZhL2xh" +
+    "bmcvU3RyaW5nOwAVU2hvdWxkIG5vdCBiZSBjYWxsZWQhAA5UcmFuc2Zvcm0uamF2YQABVgACVkwA" +
+    "EmVtaXR0ZXI6IGphY2stNC4yNAAFc2F5SGkAAgAHDgAEAQAHDgAAAAEBAICABIgCAQCgAgwAAAAA" +
+    "AAAAAQAAAAAAAAABAAAACwAAAHAAAAACAAAABQAAAJwAAAADAAAAAgAAALAAAAAFAAAABAAAAMgA" +
+    "AAAGAAAAAQAAAOgAAAABIAAAAgAAAAgBAAABEAAAAQAAAEQBAAACIAAACwAAAEoBAAADIAAAAgAA" +
+    "AOMBAAAAIAAAAQAAAO4BAAAAEAAAAQAAAPwBAAA=");
+
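+  // Transform[] is an array class and therefore unmodifiable; the redefinition
+  // below is expected to fail with JVMTI_ERROR_UNMODIFIABLE_CLASS.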
+  public static void doTest(Transform[] ts) {
+    ts[0].sayHi("Unmodifiable");
+    try {
+      Main.doCommonClassRedefinition(Transform[].class, CLASS_BYTES, DEX_BYTES);
+    } catch (Exception e) {
+      System.out.println(
+          "Transformation error : " + e.getClass().getName() + "(" + e.getMessage() + ")");
+    }
+    ts[0].sayHi("Unmodifiable");
+  }
+}
diff --git a/test/924-threads/src/Main.java b/test/924-threads/src/Main.java
index f18d70e..716f59e 100644
--- a/test/924-threads/src/Main.java
+++ b/test/924-threads/src/Main.java
@@ -135,8 +135,12 @@
     synchronized(cdl3_2) {
       cdl3_1.countDown();
       cdl3_2.await();
-      Thread.yield();
-      Thread.sleep(100);
+      // While the latch improves the chances to make good progress, scheduling might still be
+      // messy. Wait till we get the right Java-side Thread state.
+      do {
+        Thread.yield();
+      } while (t.getState() != Thread.State.BLOCKED);
+      Thread.sleep(10);
       printThreadState(t);
     }
 
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index ca1fac6..01eb14e 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -242,10 +242,8 @@
 
 
 # Disable 149-suspend-all-stress, its output is flaky (b/28988206).
-# Disable 577-profile-foreign-dex (b/27454772).
 TEST_ART_BROKEN_ALL_TARGET_TESTS := \
   149-suspend-all-stress \
-  577-profile-foreign-dex \
 
 ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
     $(COMPILER_TYPES), $(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
@@ -626,16 +624,18 @@
 TEST_ART_BROKEN_JIT_READ_BARRIER_RUN_TESTS :=
 
 # Tests failing in non-Baker read barrier configurations with the Optimizing compiler (AOT).
-# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+# 537 and 641: Expect an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
 #      handled in non-Baker read barrier configurations.
 TEST_ART_BROKEN_OPTIMIZING_NON_BAKER_READ_BARRIER_RUN_TESTS := \
-  537-checker-arraycopy
+  537-checker-arraycopy \
+  641-checker-arraycopy
 
 # Tests failing in non-Baker read barrier configurations with JIT (Optimizing compiler).
-# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+# 537 and 641: Expect an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
 #      handled in non-Baker read barrier configurations.
 TEST_ART_BROKEN_JIT_NON_BAKER_READ_BARRIER_RUN_TESTS := \
-  537-checker-arraycopy
+  537-checker-arraycopy \
+  641-checker-arraycopy
 
 ifeq ($(ART_USE_READ_BARRIER),true)
   ifneq (,$(filter interpreter,$(COMPILER_TYPES)))
diff --git a/test/ProfileTestMultiDex/Main.java b/test/ProfileTestMultiDex/Main.java
index 73fbb00..a8ced54 100644
--- a/test/ProfileTestMultiDex/Main.java
+++ b/test/ProfileTestMultiDex/Main.java
@@ -39,6 +39,10 @@
     return s.getValue();
   }
 
+  public int inlineMissingTypes(Super s) {
+    return s.getValue();
+  }
+
   public int noInlineCache(Super s) {
     return s.getValue();
   }
diff --git a/test/577-profile-foreign-dex/run b/test/VerifierDeps/MySub1SoftVerificationFailure.smali
similarity index 76%
rename from test/577-profile-foreign-dex/run
rename to test/VerifierDeps/MySub1SoftVerificationFailure.smali
index ad57d14..8123394 100644
--- a/test/577-profile-foreign-dex/run
+++ b/test/VerifierDeps/MySub1SoftVerificationFailure.smali
@@ -1,6 +1,4 @@
-#!/bin/bash
-#
-# Copyright 2016 The Android Open Source Project
+# Copyright (C) 2017 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-exec ${RUN} \
-  --runtime-option -Xjitsaveprofilinginfo \
-  --runtime-option -Xusejit:true \
-  "${@}"
+.class public LMySub1SoftVerificationFailure;
+.super LMySoftVerificationFailure;
diff --git a/test/577-profile-foreign-dex/run b/test/VerifierDeps/MySub2SoftVerificationFailure.smali
similarity index 76%
copy from test/577-profile-foreign-dex/run
copy to test/VerifierDeps/MySub2SoftVerificationFailure.smali
index ad57d14..8d00323 100644
--- a/test/577-profile-foreign-dex/run
+++ b/test/VerifierDeps/MySub2SoftVerificationFailure.smali
@@ -1,6 +1,4 @@
-#!/bin/bash
-#
-# Copyright 2016 The Android Open Source Project
+# Copyright (C) 2017 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-exec ${RUN} \
-  --runtime-option -Xjitsaveprofilinginfo \
-  --runtime-option -Xusejit:true \
-  "${@}"
+.class public LMySub2SoftVerificationFailure;
+.super LMySoftVerificationFailure;
diff --git a/test/577-profile-foreign-dex/run b/test/VerifierDepsMulti/MySoftVerificationFailure.smali
similarity index 64%
copy from test/577-profile-foreign-dex/run
copy to test/VerifierDepsMulti/MySoftVerificationFailure.smali
index ad57d14..6b56a3b 100644
--- a/test/577-profile-foreign-dex/run
+++ b/test/VerifierDepsMulti/MySoftVerificationFailure.smali
@@ -1,6 +1,4 @@
-#!/bin/bash
-#
-# Copyright 2016 The Android Open Source Project
+# Copyright (C) 2017 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-exec ${RUN} \
-  --runtime-option -Xjitsaveprofilinginfo \
-  --runtime-option -Xusejit:true \
-  "${@}"
+.class public LMySoftVerificationFailure;
+.super Ljava/lang/Object;
+
+.method public final foo()V
+  .registers 1
+  sget-object v0, LMySoftVerificationFailure;->error:LUnknownType;
+  throw v0
+.end method
+
+.field public static error:LUnknownType;
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 0ac5481..e4e571c 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -63,6 +63,7 @@
 TEST_IS_NDEBUG="n"
 APP_IMAGE="y"
 VDEX_FILTER=""
+PROFILE="n"
 
 # if "y", run 'sync' before dalvikvm to make sure all files from
 # build step (e.g. dex2oat) were finished writing.
@@ -269,6 +270,9 @@
     elif [ "x$1" = "x--sync" ]; then
         SYNC_BEFORE_RUN="y"
         shift
+    elif [ "x$1" = "x--profile" ]; then
+        PROFILE="y"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         exit 1
@@ -441,8 +445,8 @@
 JNI_OPTS="-Xjnigreflimit:512 -Xcheck:jni"
 
 if [ "$RELOCATE" = "y" ]; then
-    COMPILE_FLAGS="${COMPILE_FLAGS} --include-patch-information --runtime-arg -Xnorelocate"
-    FLAGS="${FLAGS} -Xrelocate -Xcompiler-option --include-patch-information"
+    COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xnorelocate"
+    FLAGS="${FLAGS} -Xrelocate"
     if [ "$HOST" = "y" ]; then
         # Run test sets a fairly draconian ulimit that we will likely blow right over
         # since we are relocating. Get the total size of the /system/framework directory
@@ -455,11 +459,12 @@
 else
     FLAGS="$FLAGS -Xnorelocate"
     COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xnorelocate"
-    if [ "$HOST" = "y" ]; then
-        # Increase ulimit to 128MB in case we are running hprof test,
-        # or string append test with art-debug-gc.
-        ulimit -S 128000 || exit 1
-    fi
+fi
+
+if [ "$HOST" = "y" ]; then
+    # Increase ulimit to 128MB in case we are running hprof test,
+    # or string append test with art-debug-gc.
+    ulimit -S 128000 || exit 1
 fi
 
 if [ "$HOST" = "n" ]; then
@@ -505,16 +510,28 @@
 DEX_LOCATION_STRIPPED="${DEX_LOCATION#/}"
 VDEX_NAME="${DEX_LOCATION_STRIPPED//\//@}@$TEST_NAME.jar@classes.vdex"
 if [ ${#VDEX_NAME} -gt $max_filename_size ]; then
-    echo  "Dex location path too long."
+    echo "Dex location path too long:"
+    echo "$VDEX_NAME is ${#VDEX_NAME} character long, and the limit is $max_filename_size."
     exit 1
 fi
 
+profman_cmdline="true"
 dex2oat_cmdline="true"
 vdex_cmdline="true"
 mkdir_locations="${DEX_LOCATION}/dalvik-cache/$ISA"
 strip_cmdline="true"
 sync_cmdline="true"
 
+if [ "$PROFILE" = "y" ]; then
+  profman_cmdline="${ANDROID_ROOT}/bin/profman  \
+    --apk=$DEX_LOCATION/$TEST_NAME.jar \
+    --dex-location=$DEX_LOCATION/$TEST_NAME.jar \
+    --create-profile-from=$DEX_LOCATION/profile \
+    --reference-profile-file=$DEX_LOCATION/$TEST_NAME.prof"
+  COMPILE_FLAGS="${COMPILE_FLAGS} --profile-file=$DEX_LOCATION/$TEST_NAME.prof"
+  FLAGS="${FLAGS} -Xcompiler-option --profile-file=$DEX_LOCATION/$TEST_NAME.prof"
+fi
+
 
 if [ "$PREBUILD" = "y" ]; then
   mkdir_locations="${mkdir_locations} ${DEX_LOCATION}/oat/$ISA"
@@ -592,6 +609,7 @@
 dex2oat_cmdline=$(echo $dex2oat_cmdline)
 dalvikvm_cmdline=$(echo $dalvikvm_cmdline)
 vdex_cmdline=$(echo $vdex_cmdline)
+profman_cmdline=$(echo $profman_cmdline)
 
 if [ "$HOST" = "n" ]; then
     adb root > /dev/null
@@ -601,11 +619,18 @@
       adb shell mkdir -p $DEX_LOCATION
       adb push $TEST_NAME.jar $DEX_LOCATION
       adb push $TEST_NAME-ex.jar $DEX_LOCATION
+      if [ "$PROFILE" = "y" ]; then
+        adb push profile $DEX_LOCATION
+      fi
     else
       adb shell rm -r $DEX_LOCATION >/dev/null 2>&1
       adb shell mkdir -p $DEX_LOCATION >/dev/null 2>&1
       adb push $TEST_NAME.jar $DEX_LOCATION >/dev/null 2>&1
       adb push $TEST_NAME-ex.jar $DEX_LOCATION >/dev/null 2>&1
+      if [ "$PROFILE" = "y" ]; then
+        adb push profile $DEX_LOCATION >/dev/null 2>&1
+      fi
+
     fi
 
     LD_LIBRARY_PATH=/data/$TEST_DIRECTORY/art/$ISA
@@ -632,6 +657,7 @@
              mkdir -p ${mkdir_locations} && \
              export LD_LIBRARY_PATH=$LD_LIBRARY_PATH && \
              export PATH=$ANDROID_ROOT/bin:$PATH && \
+             $profman_cmdline && \
              $dex2oat_cmdline && \
              $vdex_cmdline && \
              $strip_cmdline && \
@@ -708,13 +734,14 @@
     fi
 
     if [ "$DEV_MODE" = "y" ]; then
-      echo "mkdir -p ${mkdir_locations} && $dex2oat_cmdline && $vdex_cmdline && $strip_cmdline && $sync_cmdline && $cmdline"
+      echo "mkdir -p ${mkdir_locations} && $profman_cmdline && $dex2oat_cmdline && $vdex_cmdline && $strip_cmdline && $sync_cmdline && $cmdline"
     fi
 
     cd $ANDROID_BUILD_TOP
 
     rm -rf ${DEX_LOCATION}/dalvik-cache/
     mkdir -p ${mkdir_locations} || exit 1
+    $profman_cmdline || { echo "Profman failed." >&2 ; exit 2; }
     $dex2oat_cmdline || { echo "Dex2oat failed." >&2 ; exit 2; }
     $vdex_cmdline || { echo "Dex2oat failed." >&2 ; exit 2; }
     $strip_cmdline || { echo "Strip failed." >&2 ; exit 3; }
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 8681815..50d70f1 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -26,11 +26,6 @@
         "bug": "http://b/28988206"
     },
     {
-        "test": "577-profile-foreign-dex",
-        "description": "Disable 577-profile-foreign-dex",
-        "bug": "http://b/27454772"
-    },
-    {
         "tests": ["002-sleep",
                   "053-wait-some",
                   "055-enum-performance",
@@ -302,7 +297,7 @@
         "tests": ["000-nop",
                   "134-nodex2oat-nofallback",
                   "147-stripped-dex-fallback",
-                 "595-profile-saving"],
+                  "595-profile-saving"],
         "description": "The doesn't compile anything",
         "env_vars": {"ART_TEST_BISECTION": "true"},
         "variant": "optimizing | regalloc_gc"
@@ -327,7 +322,7 @@
     },
     {
         "tests": ["115-native-bridge",
-                 "088-monitor-verification"],
+                  "088-monitor-verification"],
         "description": "The test assume they are always compiled.",
         "env_vars": {"ART_TEST_BISECTION": "true"},
         "variant": "optimizing | regalloc_gc"
@@ -340,7 +335,8 @@
         "variant": "optimizing | regalloc_gc"
     },
     {
-        "test": "537-checker-arraycopy",
+        "tests": ["537-checker-arraycopy",
+                  "641-checker-arraycopy"],
         "env_vars": {"ART_USE_READ_BARRIER": "true"},
         "variant": "interpreter | optimizing | regalloc_gc | jit"
     }
diff --git a/test/testrunner/run_build_test_target.py b/test/testrunner/run_build_test_target.py
new file mode 100755
index 0000000..0cd1dde
--- /dev/null
+++ b/test/testrunner/run_build_test_target.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#
+# Copyright 2017, The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
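+# Build or run the ART tests for the configuration named by --build-target, as
+# defined in target_config.py: entries with a 'target' key are built with make,
+# the others are passed to testrunner.py.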
+import argparse
+import os
+import subprocess
+
+from target_config import target_config
+import env
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--build-target', required=True, dest='build_target')
+parser.add_argument('-j', default='1', dest='n_threads')
+options = parser.parse_args()
+
+target = target_config[options.build_target]
+n_threads = options.n_threads
+custom_env = target.get('env', {})
+custom_env['SOONG_ALLOW_MISSING_DEPENDENCIES'] = 'true'
+print custom_env
+os.environ.update(custom_env)
+
+
+if target.get('target'):
+  build_command = 'make'
+  build_command += ' -j' + str(n_threads)
+  build_command += ' -C ' + env.ANDROID_BUILD_TOP
+  build_command += ' ' + target.get('target')
+  print build_command.split()
+  if subprocess.call(build_command.split()):
+    sys.exit(1)
+
+else:
+  run_test_command = [os.path.join(env.ANDROID_BUILD_TOP,
+                                   'art/test/testrunner/testrunner.py')]
+  run_test_command += target.get('flags', [])
+  run_test_command += ['-j', str(n_threads)]
+  run_test_command += ['-b']
+  run_test_command += ['--verbose']
+
+  print run_test_command
+  if subprocess.call(run_test_command):
+    sys.exit(1)
+
+sys.exit(0)
diff --git a/test/testrunner/target_config.py b/test/testrunner/target_config.py
new file mode 100644
index 0000000..5387d6a
--- /dev/null
+++ b/test/testrunner/target_config.py
@@ -0,0 +1,257 @@
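+# Maps a --build-target name (see run_build_test_target.py and testrunner.py)
+# to the testrunner flags and environment variables used for that
+# configuration. Entries with a 'target' key are built with make instead.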
+target_config = {
+    'art-test' : {
+        'flags' : [],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-interpreter' : {
+        'flags' : ['--interpreter'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-interpreter-access-checks' : {
+        'flags' : ['--interp-ac'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-jit' : {
+        'flags' : ['--jit'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gcstress-gcverify': {
+        'flags' : ['--gcstress',
+                   '--gcverify'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+            'ART_DEFAULT_GC_TYPE' : 'SS'
+        }
+    },
+    'art-interpreter-gcstress' : {
+        'flags': ['--interpreter',
+                  '--gcstress'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+            'ART_DEFAULT_GC_TYPE' : 'SS'
+        }
+    },
+    'art-optimizing-gcstress' : {
+        'flags': ['--gcstress',
+                  '--optimizing'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+            'ART_DEFAULT_GC_TYPE' : 'SS'
+        }
+    },
+    'art-jit-gcstress' : {
+        'flags': ['--jit',
+                  '--gcstress'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-read-barrier' : {
+        'flags': ['--interpreter',
+                  '--optimizing'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'true',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-read-barrier-gcstress' : {
+        'flags' : ['--interpreter',
+                   '--optimizing',
+                   '--gcstress'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'true',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-read-barrier-table-lookup' : {
+        'flags' : ['--interpreter',
+                   '--optimizing'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'true',
+            'ART_READ_BARRIER_TYPE' : 'TABLELOOKUP',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-debug-gc' : {
+        'flags' : ['--interpreter',
+                   '--optimizing'],
+        'env' : {
+            'ART_TEST_DEBUG_GC' : 'true',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-ss-gc' : {
+        'flags' : ['--interpreter',
+                   '--optimizing',
+                   '--jit'],
+        'env' : {
+            'ART_DEFAULT_GC_TYPE' : 'SS',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gss-gc' : {
+        'flags' : ['--interpreter',
+                   '--optimizing',
+                   '--jit'],
+        'env' : {
+            'ART_DEFAULT_GC_TYPE' : 'GSS',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-ss-gc-tlab' : {
+        'flags' : ['--interpreter',
+                   '--optimizing',
+                   '--jit'],
+        'env' : {
+            'ART_DEFAULT_GC_TYPE' : 'SS',
+            'ART_USE_TLAB' : 'true',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gss-gc-tlab' : {
+        'flags' : ['--interpreter',
+                   '--optimizing',
+                   '--jit'],
+        'env' : {
+            'ART_DEFAULT_GC_TYPE' : 'GSS',
+            'ART_USE_TLAB' : 'true',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-tracing' : {
+        'flags' : ['--trace'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-interpreter-tracing' : {
+        'flags' : ['--interpreter',
+                   '--trace'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-forcecopy' : {
+        'flags' : ['--forcecopy'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-no-prebuild' : {
+        'flags' : ['--no-prebuild'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-no-image' : {
+        'flags' : ['--no-image'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-interpreter-no-image' : {
+        'flags' : ['--interpreter',
+                   '--no-image'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-relocate-no-patchoat' : {
+        'flags' : ['--relocate-npatchoat'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-no-dex2oat' : {
+        'flags' : ['--no-dex2oat'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-heap-poisoning' : {
+        'flags' : ['--interpreter',
+                   '--optimizing'],
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'false',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-gtest' : {
+        'target' :  'test-art-gtest',
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'true'
+        }
+    },
+    'art-gtest-read-barrier': {
+        'target' :  'test-art-gtest',
+        'env' : {
+            'ART_USE_READ_BARRIER' : 'true',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-gtest-read-barrier-table-lookup': {
+        'target' :  'test-art-gtest',
+        'env': {
+            'ART_USE_READ_BARRIER' : 'true',
+            'ART_READ_BARRIER_TYPE' : 'TABLELOOKUP',
+            'ART_HEAP_POISONING' : 'true'
+        }
+    },
+    'art-gtest-ss-gc': {
+        'target' :  'test-art-gtest',
+        'env': {
+            'ART_DEFAULT_GC_TYPE' : 'SS',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gtest-gss-gc': {
+        'target' :  'test-art-gtest',
+        'env' : {
+            'ART_DEFAULT_GC_TYPE' : 'GSS',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gtest-ss-gc-tlab': {
+        'target' :  'test-art-gtest',
+        'env': {
+            'ART_DEFAULT_GC_TYPE' : 'SS',
+            'ART_USE_TLAB' : 'true',
+            'ART_USE_READ_BARRIER' : 'false',
+        }
+    },
+    'art-gtest-gss-gc-tlab': {
+        'target' :  'test-art-gtest',
+        'env': {
+            'ART_DEFAULT_GC_TYPE' : 'GSS',
+            'ART_USE_TLAB' : 'true',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gtest-valgrind32': {
+        'target' : 'valgrind-test-art-host32',
+        'env': {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gtest-valgrind64': {
+        'target' : 'valgrind-test-art-host64',
+        'env': {
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    },
+    'art-gtest-heap-poisoning': {
+        'target' : 'valgrind-test-art-host64',
+        'env' : {
+            'ART_HEAP_POISONING' : 'true',
+            'ART_USE_READ_BARRIER' : 'false'
+        }
+    }
+}
diff --git a/test/testrunner/testrunner.py b/test/testrunner/testrunner.py
index 9c8d3b8..f77e9ad 100755
--- a/test/testrunner/testrunner.py
+++ b/test/testrunner/testrunner.py
@@ -56,6 +56,7 @@
 import time
 
 import env
+from target_config import target_config
 
 TARGET_TYPES = set()
 RUN_TYPES = set()
@@ -704,6 +705,25 @@
     return {match.group(12)}
   raise ValueError(test_name + " is not a valid test")
 
+
+def setup_env_for_build_target(build_target, parser, options):
+  """Setup environment for the build target
+
+  The method setup environment for the master-art-host targets.
+  """
+  os.environ.update(build_target['env'])
+  os.environ['SOONG_ALLOW_MISSING_DEPENDENCIES'] = 'true'
+  print_text('%s\n' % (str(os.environ)))
+
+  target_options = vars(parser.parse_args(build_target['flags']))
+  target_options['host'] = True
+  target_options['verbose'] = True
+  target_options['build'] = True
+  target_options['n_thread'] = options['n_thread']
+  target_options['dry_run'] = options['dry_run']
+
+  return target_options
+
 def parse_option():
   global verbose
   global dry_run
@@ -733,90 +753,95 @@
                       action='store_true', dest='build',
                       help="Build dependencies under all circumstances. By default we will " +
                            "not build dependencies unless ART_TEST_RUN_TEST_BUILD=true.")
+  parser.add_argument('--build-target', dest='build_target', help='master-art-host targets')
   parser.set_defaults(build = env.ART_TEST_RUN_TEST_BUILD)
   parser.add_argument('--gdb', action='store_true', dest='gdb')
   parser.add_argument('--gdb-arg', dest='gdb_arg')
 
-  options = parser.parse_args()
+  options = vars(parser.parse_args())
+  if options['build_target']:
+    options = setup_env_for_build_target(target_config[options['build_target']],
+                                         parser, options)
+
   test = ''
-  env.EXTRA_DISABLED_TESTS.update(set(options.skips))
-  if options.test:
-    test = parse_test_name(options.test)
-  if options.pictest:
+  env.EXTRA_DISABLED_TESTS.update(set(options['skips']))
+  if options['test']:
+    test = parse_test_name(options['test'])
+  if options['pictest']:
     PICTEST_TYPES.add('pictest')
-  if options.ndebug:
+  if options['ndebug']:
     RUN_TYPES.add('ndebug')
-  if options.interp_ac:
+  if options['interp_ac']:
     COMPILER_TYPES.add('interp-ac')
-  if options.picimage:
+  if options['picimage']:
     IMAGE_TYPES.add('picimage')
-  if options.n64:
+  if options['n64']:
     ADDRESS_SIZES.add('64')
-  if options.interpreter:
+  if options['interpreter']:
     COMPILER_TYPES.add('interpreter')
-  if options.jni:
+  if options['jni']:
     JNI_TYPES.add('jni')
-  if options.relocate_npatchoat:
+  if options['relocate_npatchoat']:
     RELOCATE_TYPES.add('relocate-npatchoat')
-  if options.no_prebuild:
+  if options['no_prebuild']:
     PREBUILD_TYPES.add('no-prebuild')
-  if options.npictest:
+  if options['npictest']:
     PICTEST_TYPES.add('npictest')
-  if options.no_dex2oat:
+  if options['no_dex2oat']:
     PREBUILD_TYPES.add('no-dex2oat')
-  if options.jit:
+  if options['jit']:
     COMPILER_TYPES.add('jit')
-  if options.relocate:
+  if options['relocate']:
     RELOCATE_TYPES.add('relocate')
-  if options.ndebuggable:
+  if options['ndebuggable']:
     DEBUGGABLE_TYPES.add('ndebuggable')
-  if options.no_image:
+  if options['no_image']:
     IMAGE_TYPES.add('no-image')
-  if options.optimizing:
+  if options['optimizing']:
     COMPILER_TYPES.add('optimizing')
-  if options.trace:
+  if options['trace']:
     TRACE_TYPES.add('trace')
-  if options.gcstress:
+  if options['gcstress']:
     GC_TYPES.add('gcstress')
-  if options.no_relocate:
+  if options['no_relocate']:
     RELOCATE_TYPES.add('no-relocate')
-  if options.target:
+  if options['target']:
     TARGET_TYPES.add('target')
-  if options.forcecopy:
+  if options['forcecopy']:
     JNI_TYPES.add('forcecopy')
-  if options.n32:
+  if options['n32']:
     ADDRESS_SIZES.add('32')
-  if options.host:
+  if options['host']:
     TARGET_TYPES.add('host')
-  if options.gcverify:
+  if options['gcverify']:
     GC_TYPES.add('gcverify')
-  if options.debuggable:
+  if options['debuggable']:
     DEBUGGABLE_TYPES.add('debuggable')
-  if options.prebuild:
+  if options['prebuild']:
     PREBUILD_TYPES.add('prebuild')
-  if options.debug:
+  if options['debug']:
     RUN_TYPES.add('debug')
-  if options.checkjni:
+  if options['checkjni']:
     JNI_TYPES.add('checkjni')
-  if options.ntrace:
+  if options['ntrace']:
     TRACE_TYPES.add('ntrace')
-  if options.cms:
+  if options['cms']:
     GC_TYPES.add('cms')
-  if options.multipicimage:
+  if options['multipicimage']:
     IMAGE_TYPES.add('multipicimage')
-  if options.verbose:
+  if options['verbose']:
     verbose = True
-  if options.n_thread:
-    n_thread = max(1, options.n_thread)
-  if options.dry_run:
+  if options['n_thread']:
+    n_thread = max(1, options['n_thread'])
+  if options['dry_run']:
     dry_run = True
     verbose = True
-  build = options.build
-  if options.gdb:
+  build = options['build']
+  if options['gdb']:
     n_thread = 1
     gdb = True
-    if options.gdb_arg:
-      gdb_arg = options.gdb_arg
+    if options['gdb_arg']:
+      gdb_arg = options['gdb_arg']
 
   return test