Merge "Fix an occasional ThreadStress crash."
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index ef5819d..22e6df4 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -59,6 +59,7 @@
 	runtime/verifier/method_verifier_test.cc \
 	runtime/verifier/reg_type_test.cc \
 	runtime/zip_archive_test.cc \
+	runtime/stack_indirect_reference_table_test.cc
 
 COMPILER_GTEST_COMMON_SRC_FILES := \
 	runtime/jni_internal_test.cc \
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 9a21da0..fdf09a5 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -299,7 +299,7 @@
 
       // for ARM, do a runtime check to make sure that the features we are passed from
       // the build match the features we actually determine at runtime.
-      ASSERT_EQ(instruction_set_features, runtime_features);
+      ASSERT_LE(instruction_set_features, runtime_features);
 #elif defined(__aarch64__)
       instruction_set = kArm64;
       // TODO: arm64 compilation support.
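Assuming InstructionSetFeatures orders feature sets by inclusion, the relaxed assertion above means the features baked in at build time only need to be a subset of what the device reports at runtime, e.g.:

      // Build features "div" vs. runtime features "div,lpae": ASSERT_LE
      // passes, while the old ASSERT_EQ failed even though the runtime is
      // strictly more capable.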
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index b66082d..2b20c6f 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -2036,7 +2036,7 @@
 
 bool CompilerDriver::SkipCompilation(const std::string& method_name) {
   if (!profile_ok_) {
-    return true;
+    return false;
   }
   // Methods that comprise topKPercentThreshold % of the total samples will be compiled.
   double topKPercentThreshold = 90.0;
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index ab39d6b..ae18d2e 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -145,7 +145,7 @@
   // Method*, LR and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
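All four backends now defer to the same SIRT size helper. A minimal sketch of what it computes, assuming one pointer-sized slot per reference and a header of link pointer plus uint32_t count (matching the x86-64 comment further down); the real definition lives in runtime/stack_indirect_reference_table.h:

  static size_t GetAlignedSirtSizeTarget(size_t pointer_size, uint32_t num_refs) {
    size_t header_size = pointer_size + sizeof(uint32_t);  // link_ + number_of_references_
    size_t data_size = num_refs * pointer_size;            // one slot per reference (assumed width)
    return RoundUp(header_size + data_size, pointer_size);
  }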
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index c408fa9..6212a23 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -21,14 +21,29 @@
 namespace art {
 namespace arm64 {
 
-// Calling convention
+static const Register kCoreArgumentRegisters[] = {
+  X0, X1, X2, X3, X4, X5, X6, X7
+};
 
+static const WRegister kWArgumentRegisters[] = {
+  W0, W1, W2, W3, W4, W5, W6, W7
+};
+
+static const DRegister kDArgumentRegisters[] = {
+  D0, D1, D2, D3, D4, D5, D6, D7
+};
+
+static const SRegister kSArgumentRegisters[] = {
+  S0, S1, S2, S3, S4, S5, S6, S7
+};
+
+// Calling convention
 ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // saved on entry, restored on exit
 }
 
 ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromCoreRegister(IP0);  // X16
+  return Arm64ManagedRegister::FromCoreRegister(X20);  // saved on entry, restored on exit
 }
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
@@ -79,64 +94,64 @@
 FrameOffset Arm64ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   FrameOffset result =
-      FrameOffset(displacement_.Int32Value() +         // displacement
+      FrameOffset(displacement_.Int32Value() +   // displacement
                   kFramePointerSize +                 // Method*
-                  (itr_slots_ * kFramePointerSize));  // offset into in args
+                  (itr_slots_ * sizeof(uint32_t)));  // offset into in args
   return result;
 }
 
 const ManagedRegisterEntrySpills& Arm64ManagedRuntimeCallingConvention::EntrySpills() {
   // We spill the argument registers on ARM64 to free them up for scratch use; we then
   // assume all arguments are on the stack.
-  if (entry_spills_.size() == 0) {
-    // TODO Need fp regs spilled too.
-    //
-    size_t num_spills = NumArgs();
+  if ((entry_spills_.size() == 0) && (NumArgs() > 0)) {
+    int gp_reg_index = 1;   // we start from X1/W1, X0 holds ArtMethod*.
+    int fp_reg_index = 0;   // D0/S0.
 
-    // TODO Floating point need spilling too.
-    if (num_spills > 0) {
-      entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X1));
-      if (num_spills > 1) {
-        entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X2));
-        if (num_spills > 2) {
-          entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X3));
-          if (num_spills > 3) {
-            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X5));
-            if (num_spills > 4) {
-              entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X6));
-              if (num_spills > 5) {
-                entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(X7));
-              }
+    // We need to choose the correct register (D/S or X/W) since the managed
+    // stack uses 32-bit stack slots.
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      if (IsCurrentParamAFloatOrDouble()) {  // FP regs.
+          if (fp_reg_index < 8) {
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[fp_reg_index]));
+            } else {
+              entry_spills_.push_back(Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[fp_reg_index]));
             }
+            fp_reg_index++;
+          } else {  // just increase the stack offset.
+            if (!IsCurrentParamADouble()) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
+            } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+            }
+          }
+      } else {  // GP regs.
+        if (gp_reg_index < 8) {
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+            entry_spills_.push_back(Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg_index]));
+          } else {
+            entry_spills_.push_back(Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg_index]));
+          }
+          gp_reg_index++;
+        } else {  // just increase the stack offset.
+          if (IsCurrentParamALong() && (!IsCurrentParamAReference())) {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
+          } else {
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           }
         }
       }
+      Next();
     }
   }
-
   return entry_spills_;
 }
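A worked trace of the new spill logic for a hypothetical instance method void m(int, long, float, double), shorty "VIJFD", starting from gp_reg_index = 1 and fp_reg_index = 0:

  //   this (reference, 32-bit slot)  -> W1
  //   I    (int)                     -> W2
  //   J    (long)                    -> X3
  //   F    (float)                   -> S0
  //   D    (double)                  -> D1
  // Arguments beyond X7/D7 take the NoRegister() branches and only advance
  // the stack offset by 4 or 8 bytes.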
-// JNI calling convention
 
+// JNI calling convention
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
                                                      const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  // TODO This needs to be converted to 64bit.
-  // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
-  // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
-//  size_t padding = 0;
-//  for (size_t cur_arg = IsStatic() ? 0 : 1, cur_reg = 2; cur_arg < NumArgs(); cur_arg++) {
-//    if (IsParamALongOrDouble(cur_arg)) {
-//      if ((cur_reg & 1) != 0) {
-//        padding += 4;
-//        cur_reg++;  // additional bump to ensure alignment
-//      }
-//      cur_reg++;  // additional bump to skip extra long word
-//    }
-//    cur_reg++;  // bump the iterator for every argument
-//  }
-  padding_ =0;
-
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X19));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X20));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X21));
@@ -162,83 +177,87 @@
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
   uint32_t result = 0;
-  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 | 1 << X25
-      | 1 << X26 | 1 << X27 | 1 << X28 | 1<< X29 | 1 << LR;
+  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 |
+            1 << X25 | 1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
+  return result;
+}
+
+uint32_t Arm64JniCallingConvention::FpSpillMask() const {
+  // Compute spill mask to agree with callee saves initialized in the constructor
+  uint32_t result = 0;
+  result = 1 << D8 | 1 << D9 | 1 << D10 | 1 << D11 | 1 << D12 | 1 << D13 |
+           1 << D14 | 1 << D15;
   return result;
 }
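For reference, both masks evaluate to fixed constants, assuming the register enums carry their architectural numbers (X19 = 19 ... LR = X30 = 30, D8 = 8 ... D15 = 15):

  // CoreSpillMask() = bits 19..30 set = 0x7ff80000
  // FpSpillMask()   = bits  8..15 set = 0x0000ff00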
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
-  return Arm64ManagedRegister::FromCoreRegister(X9);
+  return ManagedRegister::NoRegister();
 }
 
 size_t Arm64JniCallingConvention::FrameSize() {
-  // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
+  // Method*, callee save area size, local reference segment state
+  size_t frame_data_size = ((1 + CalleeSaveRegisters().size()) * kFramePointerSize) + sizeof(uint32_t);
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
 
 size_t Arm64JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
-                 kStackAlignment);
-}
-
-// JniCallingConvention ABI follows AAPCS where longs and doubles must occur
-// in even register numbers and stack slots
-void Arm64JniCallingConvention::Next() {
-  JniCallingConvention::Next();
-  size_t arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  if ((itr_args_ >= 2) &&
-      (arg_pos < NumArgs()) &&
-      IsParamALongOrDouble(arg_pos)) {
-    // itr_slots_ needs to be an even number, according to AAPCS.
-    if ((itr_slots_ & 0x1u) != 0) {
-      itr_slots_++;
-    }
-  }
+  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
-  return itr_slots_ < 4;
+  if (IsCurrentParamAFloatOrDouble()) {
+    return (itr_float_and_doubles_ < 8);
+  } else {
+    return ((itr_args_ - itr_float_and_doubles_) < 8);
+  }
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamOnStack() {
   return !IsCurrentParamInRegister();
 }
 
-// TODO and floating point?
-
-static const Register kJniArgumentRegisters[] = {
-  X0, X1, X2, X3, X4, X5, X6, X7
-};
 ManagedRegister Arm64JniCallingConvention::CurrentParamRegister() {
-  CHECK_LT(itr_slots_, 4u);
-  int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  // TODO Floating point & 64bit registers.
-  if ((itr_args_ >= 2) && IsParamALongOrDouble(arg_pos)) {
-    CHECK_EQ(itr_slots_, 2u);
-    return Arm64ManagedRegister::FromCoreRegister(X1);
+  CHECK(IsCurrentParamInRegister());
+  if (IsCurrentParamAFloatOrDouble()) {
+    CHECK_LT(itr_float_and_doubles_, 8u);
+    if (IsCurrentParamADouble()) {
+      return Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[itr_float_and_doubles_]);
+    } else {
+      return Arm64ManagedRegister::FromSRegister(kSArgumentRegisters[itr_float_and_doubles_]);
+    }
   } else {
-    return
-      Arm64ManagedRegister::FromCoreRegister(kJniArgumentRegisters[itr_slots_]);
+    int gp_reg = itr_args_ - itr_float_and_doubles_;
+    CHECK_LT(static_cast<unsigned int>(gp_reg), 8u);
+    if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv())  {
+      return Arm64ManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gp_reg]);
+    } else {
+      return Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg]);
+    }
   }
 }
 
 FrameOffset Arm64JniCallingConvention::CurrentParamStackOffset() {
-  CHECK_GE(itr_slots_, 4u);
-  size_t offset = displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
+  CHECK(IsCurrentParamOnStack());
+  size_t args_on_stack = itr_args_
+                  - std::min(8u, itr_float_and_doubles_)
+                  - std::min(8u, (itr_args_ - itr_float_and_doubles_));
+  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
 
 size_t Arm64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* less arguments in registers
-  return static_args + param_args + 1 - 4;
+  // all arguments including JNI args
+  size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
+
+  size_t all_stack_args = all_args -
+            std::min(8u, static_cast<unsigned int>(NumFloatOrDoubleArgs())) -
+            std::min(8u, static_cast<unsigned int>((all_args - NumFloatOrDoubleArgs())));
+
+  return all_stack_args;
 }
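A worked count for a hypothetical static native method with 9 int and 9 float parameters (NumArgs() = 18, plus JNIEnv* and jclass as the extra JNI arguments):

  // all_args       = 18 + 2         = 20
  // FP in regs     = min(8, 9)      = 8
  // GP in regs     = min(8, 20 - 9) = 8
  // all_stack_args = 20 - 8 - 8     = 4   (one float and three GP args spill)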
 
 }  // namespace arm64
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index c18cd2b..92f547c 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -55,7 +55,6 @@
   ManagedRegister IntReturnRegister() OVERRIDE;
   ManagedRegister InterproceduralScratchRegister() OVERRIDE;
   // JNI calling convention
-  void Next() OVERRIDE;  // Override default behavior for AAPCS
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
   const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
@@ -63,9 +62,7 @@
   }
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;  // Floats aren't spilled in JNI down call
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -78,9 +75,6 @@
   // TODO: these values aren't unique and can be shared amongst instances
   std::vector<ManagedRegister> callee_save_regs_;
 
-  // Padding to ensure longs and doubles are not split in AAPCS
-  size_t padding_;
-
   DISALLOW_COPY_AND_ASSIGN(Arm64JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index 8efdcda..a99a4c2 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -90,6 +90,14 @@
   return IsParamAFloatOrDouble(itr_args_);
 }
 
+bool ManagedRuntimeCallingConvention::IsCurrentParamADouble() {
+  return IsParamADouble(itr_args_);
+}
+
+bool ManagedRuntimeCallingConvention::IsCurrentParamALong() {
+  return IsParamALong(itr_args_);
+}
+
 // JNI calling convention
 
 JniCallingConvention* JniCallingConvention::Create(bool is_static, bool is_synchronized,
@@ -168,6 +176,10 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamJniEnv() {
+  return (itr_args_ == kJniEnv);
+}
+
 bool JniCallingConvention::IsCurrentParamAFloatOrDouble() {
   switch (itr_args_) {
     case kJniEnv:
@@ -181,6 +193,32 @@
   }
 }
 
+bool JniCallingConvention::IsCurrentParamADouble() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;   // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamADouble(arg_pos);
+    }
+  }
+}
+
+bool JniCallingConvention::IsCurrentParamALong() {
+  switch (itr_args_) {
+    case kJniEnv:
+      return false;  // JNIEnv*
+    case kObjectOrClass:
+      return false;   // jobject or jclass
+    default: {
+      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+      return IsParamALong(arg_pos);
+    }
+  }
+}
+
 // Return position of SIRT entry holding reference at the current iterator
 // position
 FrameOffset JniCallingConvention::CurrentParamSirtEntryOffset() {
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 7e1cf63..4d25d1c 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -126,6 +126,24 @@
     char ch = shorty_[param];
     return (ch == 'F' || ch == 'D');
   }
+  bool IsParamADouble(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return false;  // this argument
+    }
+    return shorty_[param] == 'D';
+  }
+  bool IsParamALong(unsigned int param) const {
+    DCHECK_LT(param, NumArgs());
+    if (IsStatic()) {
+      param++;  // 0th argument must skip return value at start of the shorty
+    } else if (param == 0) {
+      return true;  // this argument
+    }
+    return shorty_[param] == 'J';
+  }
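A quick shorty recap for the two new predicates, using a hypothetical jlong f(jint, jdouble), shorty "JID" (return type first, then one character per declared argument):

  // Static:   param 0 -> shorty_[1] ('I'), param 1 -> shorty_[2] ('D').
  // Instance: param 0 is the implicit 'this' reference (special-cased above);
  //           declared arguments start at param 1.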
   bool IsParamAReference(unsigned int param) const {
     DCHECK_LT(param, NumArgs());
     if (IsStatic()) {
@@ -214,6 +232,8 @@
   void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
   bool IsCurrentArgExplicit();  // i.e. a non-implicit argument such as 'this'
   bool IsCurrentArgPossiblyNull();
   size_t CurrentParamSize();
@@ -283,6 +303,9 @@
   virtual void Next();
   bool IsCurrentParamAReference();
   bool IsCurrentParamAFloatOrDouble();
+  bool IsCurrentParamADouble();
+  bool IsCurrentParamALong();
+  bool IsCurrentParamJniEnv();
   size_t CurrentParamSize();
   virtual bool IsCurrentParamInRegister() = 0;
   virtual bool IsCurrentParamOnStack() = 0;
@@ -299,17 +322,17 @@
 
   FrameOffset SirtLinkOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::LinkOffset());
+                       StackIndirectReferenceTable::LinkOffset(frame_pointer_size_));
   }
 
   FrameOffset SirtNumRefsOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::NumberOfReferencesOffset());
+                       StackIndirectReferenceTable::NumberOfReferencesOffset(frame_pointer_size_));
   }
 
   FrameOffset SirtReferencesOffset() const {
     return FrameOffset(SirtOffset().Int32Value() +
-                       StackIndirectReferenceTable::ReferencesOffset());
+                       StackIndirectReferenceTable::ReferencesOffset(frame_pointer_size_));
   }
 
   virtual ~JniCallingConvention() {}
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index dcdcdd1..64508d1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -29,6 +29,7 @@
 #include "utils/assembler.h"
 #include "utils/managed_register.h"
 #include "utils/arm/managed_register_arm.h"
+#include "utils/arm64/managed_register_arm64.h"
 #include "utils/mips/managed_register_mips.h"
 #include "utils/x86/managed_register_x86.h"
 #include "thread.h"
@@ -73,11 +74,17 @@
 
   // Calling conventions to call into JNI method "end" possibly passing a returned reference, the
   //     method and the current thread.
-  size_t jni_end_arg_count = 0;
-  if (reference_return) { jni_end_arg_count++; }
-  if (is_synchronized) { jni_end_arg_count++; }
-  const char* jni_end_shorty = jni_end_arg_count == 0 ? "I"
-                                                        : (jni_end_arg_count == 1 ? "II" : "III");
+  const char* jni_end_shorty;
+  if (reference_return && is_synchronized) {
+    jni_end_shorty = "ILL";
+  } else if (reference_return) {
+    jni_end_shorty = "IL";
+  } else if (is_synchronized) {
+    jni_end_shorty = "VL";
+  } else {
+    jni_end_shorty = "V";
+  }
+
   UniquePtr<JniCallingConvention> end_jni_conv(
       JniCallingConvention::Create(is_static, is_synchronized, jni_end_shorty, instruction_set));
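Reading the new shorties: the first character is the return type of the JniMethodEnd* variant (I when a reference comes back, V otherwise), and each trailing L is one reference argument, i.e. the returned jobject and/or the lock object for synchronized methods. Unlike the old "I"/"II"/"III" strings, typed characters let the 64-bit conventions pick the correct register class and width per parameter; the saved local-reference cookie and the Thread* still ride on the convention's remaining slots. Sketch of the mapping (argument lists abridged):

  // "V"   -> JniMethodEnd(...)
  // "VL"  -> JniMethodEndSynchronized(..., jobject locked, ...)
  // "IL"  -> JniMethodEndWithReference(jobject result, ...)
  // "ILL" -> JniMethodEndWithReferenceSynchronized(jobject result, ..., jobject locked, ...)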
 
@@ -101,12 +108,22 @@
   __ StoreImmediateToFrame(main_jni_conv->SirtNumRefsOffset(),
                            main_jni_conv->ReferenceCount(),
                            mr_conv->InterproceduralScratchRegister());
-  __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
-                          Thread::TopSirtOffset<4>(),
-                          mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
-                              main_jni_conv->SirtOffset(),
-                              mr_conv->InterproceduralScratchRegister());
+
+  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+    __ CopyRawPtrFromThread64(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<8>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread64(Thread::TopSirtOffset<8>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ CopyRawPtrFromThread32(main_jni_conv->SirtLinkOffset(),
+                            Thread::TopSirtOffset<4>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread32(Thread::TopSirtOffset<4>(),
+                                main_jni_conv->SirtOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+  }
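This 4/8 dispatch recurs through the rest of the function; the duplication buys type safety, since ThreadOffset<4> and ThreadOffset<8> are distinct types computed against the 32-bit and 64-bit Thread layouts. A minimal sketch:

  ThreadOffset<4> sirt32 = Thread::TopSirtOffset<4>();  // offset within a 32-bit Thread
  ThreadOffset<8> sirt64 = Thread::TopSirtOffset<8>();  // offset within a 64-bit Thread; not interchangeable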
 
   // 3. Place incoming reference arguments into SIRT
   main_jni_conv->Next();  // Skip JNIEnv*
@@ -154,9 +171,15 @@
   }
 
   // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
-  __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
-                            mr_conv->InterproceduralScratchRegister());
+  if (instruction_set == kArm64 || instruction_set == kX86_64) {
+    __ StoreStackPointerToThread64(Thread::TopOfManagedStackOffset<8>());
+    __ StoreImmediateToThread64(Thread::TopOfManagedStackPcOffset<8>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  } else {
+    __ StoreStackPointerToThread32(Thread::TopOfManagedStackOffset<4>());
+    __ StoreImmediateToThread32(Thread::TopOfManagedStackPcOffset<4>(), 0,
+                              mr_conv->InterproceduralScratchRegister());
+  }
 
   // 5. Move frame down to allow space for out going args.
   const size_t main_out_arg_size = main_jni_conv->OutArgSize();
@@ -164,13 +187,14 @@
   const size_t max_out_arg_size = std::max(main_out_arg_size, end_out_arg_size);
   __ IncreaseFrameSize(max_out_arg_size);
 
-
   // 6. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset<4> jni_start = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
-                                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<4> jni_start32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodStart);
+  ThreadOffset<8> jni_start64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStartSynchronized)
+                                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodStart);
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_sirt_offset(0);
   if (is_synchronized) {
@@ -192,12 +216,21 @@
   }
   if (main_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-    __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start),
-            main_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start64),
+             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start32),
+             main_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
                         main_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(jni_start, main_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CallFromThread64(jni_start64, main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(jni_start32, main_jni_conv->InterproceduralScratchRegister());
+    }
   }
   if (is_synchronized) {  // Check for exceptions from monitor enter.
     __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
@@ -259,11 +292,20 @@
   if (main_jni_conv->IsCurrentParamInRegister()) {
     ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
     DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ LoadRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>());
+    } else {
+      __ LoadRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>());
+    }
   } else {
     FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CopyRawPtrFromThread64(jni_env, Thread::JniEnvOffset<8>(),
                             main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CopyRawPtrFromThread32(jni_env, Thread::JniEnvOffset<4>(),
+                            main_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 9. Plant call to native code associated with method.
@@ -295,19 +337,23 @@
     __ Store(return_save_location, main_jni_conv->ReturnRegister(), main_jni_conv->SizeOfReturnValue());
   }
 
-  // 12. Call into JNI method end possibly passing a returned reference, the method and the current
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
-  ThreadOffset<4> jni_end(-1);
+  ThreadOffset<4> jni_end32(-1);
+  ThreadOffset<8> jni_end64(-1);
   if (reference_return) {
     // Pass result.
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndWithReference);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReferenceSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndWithReference);
     SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
     end_jni_conv->Next();
   } else {
-    jni_end = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
-                              : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end32 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(4, pJniMethodEnd);
+    jni_end64 = is_synchronized ? QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEndSynchronized)
+                                : QUICK_ENTRYPOINT_OFFSET(8, pJniMethodEnd);
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -334,12 +380,21 @@
   }
   if (end_jni_conv->IsCurrentParamInRegister()) {
     __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
-    __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end),
-            end_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end64),
+              end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end32),
+              end_jni_conv->InterproceduralScratchRegister());
+    }
   } else {
     __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
                         end_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread32(ThreadOffset<4>(jni_end), end_jni_conv->InterproceduralScratchRegister());
+    if (instruction_set == kArm64 || instruction_set == kX86_64) {
+      __ CallFromThread64(ThreadOffset<8>(jni_end64), end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ CallFromThread32(ThreadOffset<4>(jni_end32), end_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 13. Reload return value
@@ -360,6 +415,10 @@
   // 17. Finalize code generation
   __ EmitSlowPaths();
   size_t cs = __ CodeSize();
+  if (instruction_set == kArm64) {
+    // Test that we do not exceed the buffer size.
+    CHECK(cs < arm64::kBufferSizeArm64);
+  }
   std::vector<uint8_t> managed_code(cs);
   MemoryRegion code(&managed_code[0], managed_code.size());
   __ FinalizeInstructions(code);
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index 51a3f54..8e1c0c7 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -149,7 +149,7 @@
   // Method*, LR and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 8b440ed..153f953 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -126,7 +126,7 @@
   // Method*, return address and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus 2 words for SIRT header
-  size_t sirt_size = (ReferenceCount() + 2) * sirt_pointer_size_;
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 21e0bd7..4dfa29a 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -141,7 +141,7 @@
   // Method*, return address and callee save area size, local reference segment state
   size_t frame_data_size = (3 + CalleeSaveRegisters().size()) * kFramePointerSize;
   // References plus link_ (pointer) and number_of_references_ (uint32_t) for SIRT header
-  size_t sirt_size = kFramePointerSize + sizeof(uint32_t) + (ReferenceCount() * sirt_pointer_size_);
+  size_t sirt_size = StackIndirectReferenceTable::GetAlignedSirtSizeTarget(kFramePointerSize, ReferenceCount());
   // Plus return value spill area size
   return RoundUp(frame_data_size + sirt_size + SizeOfReturnValue(), kStackAlignment);
 }
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 2d45a2f..eff2425 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -345,6 +345,36 @@
   return offset;
 }
 
+static void DCheckCodeAlignment(size_t offset, InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      DCHECK_ALIGNED(offset, kArmAlignment);
+      break;
+
+    case kArm64:
+      DCHECK_ALIGNED(offset, kArm64Alignment);
+      break;
+
+    case kMips:
+      DCHECK_ALIGNED(offset, kMipsAlignment);
+      break;
+
+    case kX86_64:
+      // Fall-through.
+    case kX86:
+      DCHECK_ALIGNED(offset, kX86Alignment);
+      break;
+
+    case kNone:
+      // Use a DCHECK instead of FATAL so that in the non-debug case the whole switch can
+      // be optimized away.
+      DCHECK(false);
+      break;
+  }
+}
+
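For reference, the alignment constants this helper dispatches on (assumed values, see runtime/globals.h):

  // kArmAlignment = 8, kArm64Alignment = 16, kMipsAlignment = 8, kX86Alignment = 16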
 size_t OatWriter::InitOatCodeMethod(size_t offset, size_t oat_class_index,
                                     size_t __attribute__((unused)) class_def_index,
                                     size_t class_def_method_index,
@@ -376,7 +406,8 @@
     } else {
       CHECK(quick_code != nullptr);
       offset = compiled_method->AlignCode(offset);
-      DCHECK_ALIGNED(offset, kArmAlignment);
+      DCheckCodeAlignment(offset, compiled_method->GetInstructionSet());
+
       uint32_t code_size = quick_code->size() * sizeof(uint8_t);
       CHECK_NE(code_size, 0U);
       uint32_t thumb_offset = compiled_method->CodeDelta();
@@ -826,7 +857,8 @@
         relative_offset += aligned_code_delta;
         DCHECK_OFFSET();
       }
-      DCHECK_ALIGNED(relative_offset, kArmAlignment);
+      DCheckCodeAlignment(relative_offset, compiled_method->GetInstructionSet());
+
       uint32_t code_size = quick_code->size() * sizeof(uint8_t);
       CHECK_NE(code_size, 0U);
 
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index a11c2da..1d87eaa 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -50,11 +50,11 @@
 }
 
 void Arm64Assembler::GetCurrentThread(ManagedRegister tr) {
-  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR));
+  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR1));
 }
 
 void Arm64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister /* scratch */) {
-  StoreToOffset(TR, SP, offset.Int32Value());
+  StoreToOffset(TR1, SP, offset.Int32Value());
 }
 
 // See Arm64 PCS Section 5.2.2.1.
@@ -138,7 +138,8 @@
 void Arm64Assembler::StoreRef(FrameOffset offs, ManagedRegister m_src) {
   Arm64ManagedRegister src = m_src.AsArm64();
   CHECK(src.IsCoreRegister()) << src;
-  StoreToOffset(src.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, src.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
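Managed references live in 32-bit stack slots, so the reference stores and loads below switch to the W view of the holding X register. A sketch of the overlap helper being leaned on (assumed shape; the real one is in utils/arm64/managed_register_arm64.h):

  WRegister Arm64ManagedRegister::AsOverlappingCoreRegisterLow() const {
    // Wn is architecturally the low 32 bits of Xn, so the enum values map 1:1.
    return static_cast<WRegister>(AsCoreRegister());
  }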
 
 void Arm64Assembler::StoreRawPtr(FrameOffset offs, ManagedRegister m_src) {
@@ -152,30 +153,31 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                 offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread64(ThreadOffset<8> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   AddConstant(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset<8> tr_offs) {
   // Arm64 does not support "str sp, [dest]", so we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
-  StoreToOffset(IP1, TR, tr_offs.Int32Value());
+  StoreToOffset(IP1, TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::StoreSpanning(FrameOffset dest_off, ManagedRegister m_source,
@@ -254,9 +256,13 @@
     CHECK_EQ(4u, size) << dest;
     ___ Ldr(reg_w(dest.AsWRegister()), MEM_OP(reg_x(base), offset));
   } else if (dest.IsCoreRegister()) {
-    CHECK_EQ(8u, size) << dest;
     CHECK_NE(dest.AsCoreRegister(), SP) << dest;
-    ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    if (size == 4u) {
+      ___ Ldr(reg_w(dest.AsOverlappingCoreRegisterLow()), MEM_OP(reg_x(base), offset));
+    } else {
+      CHECK_EQ(8u, size) << dest;
+      ___ Ldr(reg_x(dest.AsCoreRegister()), MEM_OP(reg_x(base), offset));
+    }
   } else if (dest.IsSRegister()) {
     ___ Ldr(reg_s(dest.AsSRegister()), MEM_OP(reg_x(base), offset));
   } else {
@@ -269,14 +275,14 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
-  return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
+void Arm64Assembler::LoadFromThread64(ManagedRegister m_dst, ThreadOffset<8> src, size_t size) {
+  return Load(m_dst.AsArm64(), TR1, src.Int32Value(), size);
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, FrameOffset offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), SP, offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), SP, offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, ManagedRegister m_base,
@@ -284,7 +290,8 @@
   Arm64ManagedRegister dst = m_dst.AsArm64();
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsCoreRegister() && base.IsCoreRegister());
-  LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  LoadWFromOffset(kLoadWord, dst.AsOverlappingCoreRegisterLow(), base.AsCoreRegister(),
+                  offs.Int32Value());
 }
 
 void Arm64Assembler::LoadRawPtr(ManagedRegister m_dst, ManagedRegister m_base, Offset offs) {
@@ -294,10 +301,10 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
+void Arm64Assembler::LoadRawPtrFromThread64(ManagedRegister m_dst, ThreadOffset<8> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
+  LoadFromOffset(dst.AsCoreRegister(), TR1, offs.Int32Value());
 }
 
 // Copying routines.
@@ -306,8 +313,16 @@
   Arm64ManagedRegister src = m_src.AsArm64();
   if (!dst.Equals(src)) {
     if (dst.IsCoreRegister()) {
-      CHECK(src.IsCoreRegister()) << src;
-      ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+      if (size == 4) {
+        CHECK(src.IsWRegister());
+        ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+      } else {
+        if (src.IsCoreRegister()) {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_x(src.AsCoreRegister()));
+        } else {
+          ___ Mov(reg_x(dst.AsCoreRegister()), reg_w(src.AsWRegister()));
+        }
+      }
     } else if (dst.IsWRegister()) {
       CHECK(src.IsWRegister()) << src;
       ___ Mov(reg_w(dst.AsWRegister()), reg_w(src.AsWRegister()));
@@ -322,40 +337,42 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
-                                          ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                          ThreadOffset<8> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
+void Arm64Assembler::CopyRawPtrToThread64(ThreadOffset<8> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadFromOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::CopyRef(FrameOffset dest, FrameOffset src,
                              ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
+  LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(),
+                  SP, src.Int32Value());
+  StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(),
+                 SP, dest.Int32Value());
 }
 
 void Arm64Assembler::Copy(FrameOffset dest, FrameOffset src,
                           ManagedRegister m_scratch, size_t size) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
-  CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
+  CHECK(scratch.IsCoreRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), SP, src.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), SP, dest.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP, src.Int32Value());
+    StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), SP, dest.Int32Value());
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), SP, src.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), SP, dest.Int32Value());
@@ -418,10 +435,17 @@
   CHECK(scratch.IsCoreRegister() || scratch.IsWRegister()) << scratch;
   CHECK(size == 4 || size == 8) << size;
   if (size == 4) {
-    LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
+    if (scratch.IsWRegister()) {
+      LoadWFromOffset(kLoadWord, scratch.AsWRegister(), src.AsCoreRegister(),
                     src_offset.Int32Value());
-    StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
+      StoreWToOffset(kStoreWord, scratch.AsWRegister(), dest.AsCoreRegister(),
                    dest_offset.Int32Value());
+    } else {
+      LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), src.AsCoreRegister(),
+                    src_offset.Int32Value());
+      StoreWToOffset(kStoreWord, scratch.AsOverlappingCoreRegisterLow(), dest.AsCoreRegister(),
+                   dest_offset.Int32Value());
+    }
   } else if (size == 8) {
     LoadFromOffset(scratch.AsCoreRegister(), src.AsCoreRegister(), src_offset.Int32Value());
     StoreToOffset(scratch.AsCoreRegister(), dest.AsCoreRegister(), dest_offset.Int32Value());
@@ -486,7 +510,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread64(ThreadOffset<8> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -502,10 +526,11 @@
     // the address in the SIRT holding the reference.
     // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
     if (in_reg.IsNoRegister()) {
-      LoadFromOffset(out_reg.AsCoreRegister(), SP, sirt_offs.Int32Value());
+      LoadWFromOffset(kLoadWord, out_reg.AsOverlappingCoreRegisterLow(), SP,
+                      sirt_offs.Int32Value());
       in_reg = out_reg;
     }
-    ___ Cmp(reg_x(in_reg.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(in_reg.AsOverlappingCoreRegisterLow()), 0);
     if (!out_reg.Equals(in_reg)) {
       LoadImmediate(out_reg.AsCoreRegister(), 0, EQ);
     }
@@ -520,11 +545,12 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   if (null_allowed) {
-    LoadFromOffset(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value());
+    LoadWFromOffset(kLoadWord, scratch.AsOverlappingCoreRegisterLow(), SP,
+                    sirt_offset.Int32Value());
     // Null values get a SIRT entry value of 0.  Otherwise, the sirt entry is
     // the address in the SIRT holding the reference.
     // e.g. scratch = (scratch == 0) ? 0 : (SP+sirt_offset)
-    ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
+    ___ Cmp(reg_w(scratch.AsOverlappingCoreRegisterLow()), 0);
+    // TODO: Move this logic into AddConstant with flags.
     AddConstant(scratch.AsCoreRegister(), SP, sirt_offset.Int32Value(), NE);
   } else {
@@ -555,7 +581,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<4>().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR1, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,7 +595,11 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR1, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
@@ -590,6 +620,9 @@
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
   ___ PushCalleeSavedRegisters();
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR1), reg_x(TR));
+
   // Increase frame to required size - must be at least space to push Method*.
   CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
@@ -598,11 +631,27 @@
   // Write Method*.
   StoreToOffset(X0, SP, 0);
 
-  // Write out entry spills, treated as X regs.
-  // TODO: we can implement a %2 STRP variant of StoreToOffset.
+  // Write out entry spills
+  int32_t offset = frame_size + kFramePointerSize;
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Register reg = entry_spills.at(i).AsArm64().AsCoreRegister();
-    StoreToOffset(reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize));
+    Arm64ManagedRegister reg = entry_spills.at(i).AsArm64();
+    if (reg.IsNoRegister()) {
+      // only increment stack offset.
+      ManagedRegisterSpill spill = entry_spills.at(i);
+      offset += spill.getSize();
+    } else if (reg.IsCoreRegister()) {
+      StoreToOffset(reg.AsCoreRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsWRegister()) {
+      StoreWToOffset(kStoreWord, reg.AsWRegister(), SP, offset);
+      offset += 4;
+    } else if (reg.IsDRegister()) {
+      StoreDToOffset(reg.AsDRegister(), SP, offset);
+      offset += 8;
+    } else if (reg.IsSRegister()) {
+      StoreSToOffset(reg.AsSRegister(), SP, offset);
+      offset += 4;
+    }
   }
 }
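Note how this loop pairs with EntrySpills() in the arm64 calling convention:

  // W/S spills advance the offset by 4 bytes, X/D spills by 8, and
  // NoRegister placeholders advance by the size recorded in the spill entry,
  // keeping the layout in lock-step with the convention's slot accounting.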
 
@@ -618,6 +667,9 @@
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
+  // FIXME: Temporary fix for TR (XSELF).
+  ___ Mov(reg_x(TR), reg_x(TR1));
+
   // Pop callee saved and return to LR.
   ___ PopCalleeSavedRegisters();
   ___ Ret();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 8acd1f9..97fb93a 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -81,8 +81,8 @@
 
 class Arm64Assembler FINAL : public Assembler {
  public:
-  Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
-  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
+  Arm64Assembler() : vixl_buf_(new byte[kBufferSizeArm64]),
+  vixl_masm_(new vixl::MacroAssembler(vixl_buf_, kBufferSizeArm64)) {}
 
   virtual ~Arm64Assembler() {
     delete[] vixl_buf_;
@@ -114,27 +114,27 @@
   void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
   void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
-  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
       OVERRIDE;
-  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
                                   ManagedRegister scratch) OVERRIDE;
-  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
   void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
                      ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
   void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
-  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
   void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
   void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
   void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
-  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines.
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
-  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
                               ManagedRegister scratch) OVERRIDE;
-  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
       OVERRIDE;
   void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
   void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
@@ -183,7 +183,7 @@
   // Call to address held at [base+offset].
   void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
   void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
-  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
@@ -234,9 +234,6 @@
   void AddConstant(Register rd, int32_t value, Condition cond = AL);
   void AddConstant(Register rd, Register rn, int32_t value, Condition cond = AL);
 
-  // Vixl buffer size.
-  static constexpr size_t BUF_SIZE = 4096;
-
   // Vixl buffer.
   byte* vixl_buf_;
 
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index ecf9fbe..2a08c95 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -31,6 +31,9 @@
 
 constexpr unsigned int kCalleeSavedRegsSize = 20;
 
+// Vixl buffer size.
+constexpr size_t kBufferSizeArm64 = 4096 * 2;
+
 }  // arm64
 }  // art
 
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index f007d28..06ce3b4 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -89,6 +89,9 @@
   explicit ManagedRegisterSpill(const ManagedRegister& other)
       : ManagedRegister(other), size_(-1), spill_offset_(-1) { }
 
+  explicit ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
+      : ManagedRegister(other), size_(size), spill_offset_(-1) { }
+
   int32_t getSpillOffset() {
     return spill_offset_;
   }
@@ -111,6 +114,11 @@
     std::vector<ManagedRegisterSpill>::push_back(spill);
   }
 
+  void push_back(ManagedRegister __x, int32_t __size) {
+    ManagedRegisterSpill spill(__x, __size);
+    std::vector<ManagedRegisterSpill>::push_back(spill);
+  }
+
   void push_back(ManagedRegisterSpill __x) {
     std::vector<ManagedRegisterSpill>::push_back(__x);
   }
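Usage, as in the arm64 EntrySpills() above: record a stack-only slot of a given size without naming a register.

  entry_spills_.push_back(ManagedRegister::NoRegister(), 8);  // 64-bit slot, no register assigned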
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index c6e448e..ac76c35 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -139,7 +139,7 @@
   UsageError("      Example: --android-root=out/host/linux-x86");
   UsageError("      Default: $ANDROID_ROOT");
   UsageError("");
-  UsageError("  --instruction-set=(arm|mips|x86|x86_64): compile for a particular instruction");
+  UsageError("  --instruction-set=(arm|arm64|mips|x86|x86_64): compile for a particular instruction");
   UsageError("      set.");
   UsageError("      Example: --instruction-set=x86");
   UsageError("      Default: arm");
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index d2ed692..ba783ab 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -20,11 +20,76 @@
      * Jni dlsym lookup stub.
      */
     .extern artFindNativeMethod
-UNIMPLEMENTED art_jni_dlsym_lookup_stub
+
+    // TODO: Add CFI directives.
+ENTRY art_jni_dlsym_lookup_stub
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  bl  artFindNativeMethod
+  mov  x17, x0    // store result in scratch reg.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  cbz   x17, 1f   // is method code null?
+  br    x17       // if non-null, tail call to method's code.
+
+1:
+  ret             // regs already restored; return to caller to handle the exception.
+END art_jni_dlsym_lookup_stub
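Spill-area arithmetic for the stub above: x0-x7 and d0-d7 are exactly the AAPCS64 integer and FP argument registers that must survive the artFindNativeMethod call.

  // 9 stp pushes x 16 bytes = 144 bytes of spill space; sp stays 16-byte
  // aligned throughout, as AAPCS64 requires.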
 
     /*
      * Entry point of native methods when JNI bug compatibility is enabled.
      */
     .extern artWorkAroundAppJniBugs
-UNIMPLEMENTED art_work_around_app_jni_bugs
+ENTRY art_work_around_app_jni_bugs
+  // spill regs.
+  stp   x29, x30, [sp, #-16]!
+  mov   x29, sp
+  stp   d6, d7,   [sp, #-16]!
+  stp   d4, d5,   [sp, #-16]!
+  stp   d2, d3,   [sp, #-16]!
+  stp   d0, d1,   [sp, #-16]!
+  stp   x6, x7,   [sp, #-16]!
+  stp   x4, x5,   [sp, #-16]!
+  stp   x2, x3,   [sp, #-16]!
+  stp   x0, x1,   [sp, #-16]!
+
+  mov   x0, x19   // Thread::Current.
+  mov   x1, sp    // SP.
+  bl    artWorkAroundAppJniBugs   // (Thread*, SP).
+  mov   x17, x0   // save returned target code pointer.
+
+  // load spill regs.
+  ldp   x0, x1,   [sp], #16
+  ldp   x2, x3,   [sp], #16
+  ldp   x4, x5,   [sp], #16
+  ldp   x6, x7,   [sp], #16
+  ldp   d0, d1,   [sp], #16
+  ldp   d2, d3,   [sp], #16
+  ldp   d4, d5,   [sp], #16
+  ldp   d6, d7,   [sp], #16
+  ldp   x29, x30, [sp], #16
+
+  // tail call into JNI routine.
+  br    x17
+END art_work_around_app_jni_bugs
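
Both stubs follow the same shape: spill the AAPCS64 argument registers for the pending native call (x0-x7 and d0-d7) along with the frame pointer and link register, call into the runtime, reload the arguments, and branch to the code the runtime returned. A standalone C++ model of the dlsym stub's control flow (not the real stub; the register save/restore is implicit here):

    #include <iostream>

    using NativeCode = long (*)(long);

    // Stand-in for artFindNativeMethod(): returns the method's code, or null on failure.
    NativeCode FindNativeMethodModel(bool found) {
      return found ? [](long x) { return x * 2; } : static_cast<NativeCode>(nullptr);
    }

    // Models art_jni_dlsym_lookup_stub: arguments are notionally preserved across the lookup.
    long DlsymLookupStubModel(long arg, bool found) {
      NativeCode code = FindNativeMethodModel(found);  // "bl artFindNativeMethod; mov x17, x0"
      if (code != nullptr) {
        return code(arg);  // "br x17": tail call with the original arguments
      }
      return -1;           // "ret": back to the caller, which raises the pending exception
    }

    int main() {
      std::cout << DlsymLookupStubModel(21, true) << " "
                << DlsymLookupStubModel(21, false) << "\n";  // 42 -1
    }
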
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index dd34583..6ce5d06 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -332,9 +332,14 @@
 END \c_name
 .endm
 
+// FIXME: Temporary fix for TR(XSELF).
 .macro ONE_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context.
+    mov x1, x19                       // pass Thread::Current.
+    mov x2, sp                        // pass SP.
+    b   \cxx_name                     // \cxx_name(Thread*, SP).
     brk 0
 END \c_name
 .endm
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index 43c0ad6..2503918 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -56,7 +56,8 @@
   X29 = 29,
   X30 = 30,
   X31 = 31,
-  TR  = 18,     // ART Thread Register.
+  TR  = 18,     // ART Thread Register - Needs to be one of the callee saved regs.
+  TR1 = 19,     // FIXME: Temporary second thread register; the stubs above pass Thread::Current in x19.
   IP0 = 16,     // Used as scratch by VIXL.
   IP1 = 17,     // Used as scratch by ART JNI Assembler.
   FP  = 29,
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index e82d533..6e31cb7 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -63,7 +63,6 @@
 namespace collector {
 
 static constexpr bool kProtectFromSpace = true;
-static constexpr bool kClearFromSpace = true;
 static constexpr bool kStoreStackTraces = false;
 static constexpr bool kUseBytesPromoted = true;
 static constexpr size_t kBytesPromotedThreshold = 4 * MB;
@@ -122,6 +121,7 @@
   // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
   heap_->PreGcVerification(this);
+  CHECK(from_space_->CanMoveObjects()) << "Attempting to move from " << *from_space_;
   // Set the initial bitmap.
   to_space_live_bitmap_ = to_space_->GetLiveBitmap();
 }
@@ -182,9 +182,6 @@
   Locks::mutator_lock_->AssertExclusiveHeld(self_);
 
   TimingLogger::ScopedSplit split("MarkingPhase", &timings_);
-  // Need to do this with mutators paused so that somebody doesn't accidentally allocate into the
-  // wrong space.
-  heap_->SwapSemiSpaces();
   if (generational_) {
     // If last_gc_to_space_end_ is out of the bounds of the from-space
     // (the to-space from last GC), then point it to the beginning of
@@ -414,11 +411,7 @@
     TimingLogger::ScopedSplit split("UnBindBitmaps", &timings_);
     GetHeap()->UnBindBitmaps();
   }
-  if (kClearFromSpace) {
-    // Release the memory used by the from space.
-    from_space_->Clear();
-  }
-  from_space_->Reset();
+  from_space_->Clear();
   // Protect the from space.
   VLOG(heap) << "Protecting space " << *from_space_;
   if (kProtectFromSpace) {
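
With kClearFromSpace gone, the separate Clear()/Reset() pair collapses into a single Clear() that both releases the pages and rewinds the allocation pointer (the BumpPointerSpace change further below). A minimal standalone model of the merged contract:

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    // Toy bump pointer space; Clear() models the merged "release + reset" semantics.
    struct BumpSpaceModel {
      char storage[1024];
      char* end;
      BumpSpaceModel() : end(storage) {}
      char* Alloc(std::size_t n) { char* p = end; end += n; return p; }
      void Clear() {
        std::memset(storage, 0, sizeof(storage));  // stands in for madvise(MADV_DONTNEED)
        end = storage;                             // the old Reset(): rewind end back to begin
      }
    };

    int main() {
      BumpSpaceModel space;
      space.Alloc(64);
      space.Clear();  // one call now suffices in SemiSpace::FinishPhase
      assert(space.end == space.storage);
      return 0;
    }
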
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 35ec297..e44ec6a 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -81,10 +81,15 @@
 // relative to partial/full GC. This is desirable since sticky GCs interfere less with mutator
 // threads (lower pauses, use less memory bandwidth).
 static constexpr double kStickyGcThroughputAdjustment = 1.25;
+// Whether or not we use the free list large object space.
+static constexpr bool kUseFreeListSpaceForLOS = false;
+// Whether or not we compact the zygote in PreZygoteFork.
+static constexpr bool kCompactZygote = kMovingCollector;
+static constexpr size_t kNonMovingSpaceCapacity = 64 * MB;
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
            double target_utilization, size_t capacity, const std::string& image_file_name,
-           CollectorType post_zygote_collector_type, CollectorType background_collector_type,
+           CollectorType foreground_collector_type, CollectorType background_collector_type,
            size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
            size_t long_pause_log_threshold, size_t long_gc_log_threshold,
            bool ignore_max_footprint, bool use_tlab, bool verify_pre_gc_heap,
@@ -95,9 +100,9 @@
       dlmalloc_space_(nullptr),
       main_space_(nullptr),
       collector_type_(kCollectorTypeNone),
-      post_zygote_collector_type_(post_zygote_collector_type),
+      foreground_collector_type_(foreground_collector_type),
       background_collector_type_(background_collector_type),
-      desired_collector_type_(collector_type_),
+      desired_collector_type_(foreground_collector_type_),
       heap_trim_request_lock_(nullptr),
       last_trim_time_(0),
       heap_transition_target_time_(0),
@@ -162,15 +167,11 @@
   // If we aren't the zygote, switch to the default non zygote allocator. This may update the
   // entrypoints.
   if (!is_zygote) {
-    desired_collector_type_ = post_zygote_collector_type_;
     large_object_threshold_ = kDefaultLargeObjectThreshold;
-  } else {
-    if (kMovingCollector) {
-      // We are the zygote, use bump pointer allocation + semi space collector.
-      bool generational = post_zygote_collector_type_ == kCollectorTypeGSS;
-      desired_collector_type_ = generational ? kCollectorTypeGSS : kCollectorTypeSS;
-    } else {
-      desired_collector_type_ = post_zygote_collector_type_;
+    // Background compaction is currently not supported for command line runs.
+    if (background_collector_type_ != foreground_collector_type_) {
+      LOG(WARNING) << "Disabling background compaction for non zygote";
+      background_collector_type_ = foreground_collector_type_;
     }
   }
   ChangeCollector(desired_collector_type_);
@@ -187,73 +188,56 @@
     // isn't going to get in the middle
     byte* oat_file_end_addr = image_space->GetImageHeader().GetOatFileEnd();
     CHECK_GT(oat_file_end_addr, image_space->End());
-    if (oat_file_end_addr > requested_alloc_space_begin) {
-      requested_alloc_space_begin = AlignUp(oat_file_end_addr, kPageSize);
-    }
+    requested_alloc_space_begin = AlignUp(oat_file_end_addr, kPageSize);
   }
-  MemMap* malloc_space_mem_map = nullptr;
-  const char* malloc_space_name = is_zygote ? "zygote space" : "alloc space";
   if (is_zygote) {
-    // Allocate a single mem map that is split into the malloc space
-    // and the post zygote non-moving space to put them adjacent.
-    size_t post_zygote_non_moving_space_size = 64 * MB;
-    size_t non_moving_spaces_size = capacity + post_zygote_non_moving_space_size;
+    // Reserve the address range before we create the non moving space to make sure bitmaps don't
+    // take it.
     std::string error_str;
-    malloc_space_mem_map = MemMap::MapAnonymous(malloc_space_name, requested_alloc_space_begin,
-                                                non_moving_spaces_size, PROT_READ | PROT_WRITE,
-                                                true, &error_str);
-    CHECK(malloc_space_mem_map != nullptr) << error_str;
-    post_zygote_non_moving_space_mem_map_.reset(malloc_space_mem_map->RemapAtEnd(
-        malloc_space_mem_map->Begin() + capacity, "post zygote non-moving space",
-        PROT_READ | PROT_WRITE, &error_str));
-    CHECK(post_zygote_non_moving_space_mem_map_.get() != nullptr) << error_str;
-    VLOG(heap) << "malloc space mem map : " << malloc_space_mem_map;
-    VLOG(heap) << "post zygote non-moving space mem map : "
-               << post_zygote_non_moving_space_mem_map_.get();
+    MemMap* mem_map = MemMap::MapAnonymous(
+        "main space", requested_alloc_space_begin + kNonMovingSpaceCapacity, capacity,
+        PROT_READ | PROT_WRITE, true, &error_str);
+    CHECK(mem_map != nullptr) << error_str;
+    // Non moving space is always dlmalloc since we currently don't have support for multiple
+    // rosalloc spaces.
+    non_moving_space_ = space::DlMallocSpace::Create(
+        "zygote / non moving space", initial_size, kNonMovingSpaceCapacity, kNonMovingSpaceCapacity,
+        requested_alloc_space_begin, false);
+    non_moving_space_->SetGrowthLimit(non_moving_space_->Capacity());
+    CreateMainMallocSpace(mem_map, initial_size, growth_limit, capacity);
   } else {
-    // Allocate a mem map for the malloc space.
     std::string error_str;
-    malloc_space_mem_map = MemMap::MapAnonymous(malloc_space_name, requested_alloc_space_begin,
-                                                capacity, PROT_READ | PROT_WRITE, true, &error_str);
-    CHECK(malloc_space_mem_map != nullptr) << error_str;
-    VLOG(heap) << "malloc space mem map : " << malloc_space_mem_map;
+    MemMap* mem_map = MemMap::MapAnonymous("main/non-moving space", requested_alloc_space_begin,
+                                           capacity, PROT_READ | PROT_WRITE, true, &error_str);
+    CHECK(mem_map != nullptr) << error_str;
+    // Create the main free list space, which doubles as the non moving space. We can do this since
+    // non zygote means that we won't have any background compaction.
+    CreateMainMallocSpace(mem_map, initial_size, growth_limit, capacity);
+    non_moving_space_ = main_space_;
   }
-  CHECK(malloc_space_mem_map != nullptr);
-  space::MallocSpace* malloc_space;
-  if (kUseRosAlloc) {
-    malloc_space = space::RosAllocSpace::CreateFromMemMap(malloc_space_mem_map, malloc_space_name,
-                                                          kDefaultStartingSize, initial_size,
-                                                          growth_limit, capacity, low_memory_mode_);
-    CHECK(malloc_space != nullptr) << "Failed to create rosalloc space";
-  } else {
-    malloc_space = space::DlMallocSpace::CreateFromMemMap(malloc_space_mem_map, malloc_space_name,
-                                                          kDefaultStartingSize, initial_size,
-                                                          growth_limit, capacity);
-    CHECK(malloc_space != nullptr) << "Failed to create dlmalloc space";
-  }
-  VLOG(heap) << "malloc_space : " << malloc_space;
+  CHECK(non_moving_space_ != nullptr);
+
+  // We need to create the bump pointer space if the foreground collector is a compacting GC.
+  // When the foreground collector is not moving but the background collector is, the heap
+  // transition code will create the temp space by recycling the bitmap from the main space.
   if (kMovingCollector) {
     // TODO: Place bump-pointer spaces somewhere to minimize size of card table.
-    // TODO: Having 3+ spaces as big as the large heap size can cause virtual memory fragmentation
-    // issues.
-    const size_t bump_pointer_space_size = std::min(malloc_space->Capacity(), 128 * MB);
-    bump_pointer_space_ = space::BumpPointerSpace::Create("Bump pointer space",
-                                                          bump_pointer_space_size, nullptr);
+    bump_pointer_space_ = space::BumpPointerSpace::Create("Bump pointer space", capacity, nullptr);
     CHECK(bump_pointer_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(bump_pointer_space_);
-    temp_space_ = space::BumpPointerSpace::Create("Bump pointer space 2", bump_pointer_space_size,
-                                                  nullptr);
+    temp_space_ = space::BumpPointerSpace::Create("Bump pointer space 2", capacity, nullptr);
     CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space";
     AddSpace(temp_space_);
-    VLOG(heap) << "bump_pointer_space : " << bump_pointer_space_;
-    VLOG(heap) << "temp_space : " << temp_space_;
   }
-  non_moving_space_ = malloc_space;
-  malloc_space->SetFootprintLimit(malloc_space->Capacity());
-  AddSpace(malloc_space);
+  if (non_moving_space_ != main_space_) {
+    AddSpace(non_moving_space_);
+  }
+  if (main_space_ != nullptr) {
+    AddSpace(main_space_);
+  }
 
   // Allocate the large object space.
-  constexpr bool kUseFreeListSpaceForLOS = false;
   if (kUseFreeListSpaceForLOS) {
     large_object_space_ = space::FreeListSpace::Create("large object space", nullptr, capacity);
   } else {
@@ -268,11 +252,6 @@
   // Relies on the spaces being sorted.
   byte* heap_begin = continuous_spaces_.front()->Begin();
   byte* heap_end = continuous_spaces_.back()->Limit();
-  if (is_zygote) {
-    CHECK(post_zygote_non_moving_space_mem_map_.get() != nullptr);
-    heap_begin = std::min(post_zygote_non_moving_space_mem_map_->Begin(), heap_begin);
-    heap_end = std::max(post_zygote_non_moving_space_mem_map_->End(), heap_end);
-  }
   size_t heap_capacity = heap_end - heap_begin;
 
   // Allocate the card table.
@@ -292,6 +271,12 @@
         new accounting::RememberedSet("Non-moving space remembered set", this, non_moving_space_);
     CHECK(non_moving_space_rem_set != nullptr) << "Failed to create non-moving space remembered set";
     AddRememberedSet(non_moving_space_rem_set);
+    if (main_space_ != nullptr && main_space_ != non_moving_space_) {
+      accounting::RememberedSet* main_space_rem_set =
+          new accounting::RememberedSet("Main space remembered set", this, main_space_);
+      CHECK(main_space_rem_set != nullptr) << "Failed to create main space remembered set";
+      AddRememberedSet(main_space_rem_set);
+    }
   }
 
   // TODO: Count objects in the image space here.
@@ -329,7 +314,7 @@
   }
   if (kMovingCollector) {
     // TODO: Clean this up.
-    bool generational = post_zygote_collector_type_ == kCollectorTypeGSS;
+    bool generational = foreground_collector_type_ == kCollectorTypeGSS;
     semi_space_collector_ = new collector::SemiSpace(this, generational,
                                                      generational ? "generational" : "");
     garbage_collectors_.push_back(semi_space_collector_);
@@ -347,6 +332,37 @@
   }
 }
 
+void Heap::CreateMainMallocSpace(MemMap* mem_map, size_t initial_size, size_t growth_limit,
+                                 size_t capacity) {
+  // Is background compaction enabled?
+  bool can_move_objects = IsMovingGc(background_collector_type_) !=
+      IsMovingGc(foreground_collector_type_);
+  // If we are the zygote and don't yet have a zygote space, it means that the zygote fork will
+  // happen in the future. If this happens and we have kCompactZygote enabled, we wish to compact
+  // from the main space to the zygote space. If background compaction is enabled, always pass in
+  // that we can move objects.
+  if (kCompactZygote && Runtime::Current()->IsZygote() && !can_move_objects) {
+    // After the zygote we want this to be false if we don't have background compaction enabled so
+    // that getting primitive array elements is faster.
+    can_move_objects = !have_zygote_space_;
+  }
+  if (kUseRosAlloc) {
+    main_space_ = space::RosAllocSpace::CreateFromMemMap(mem_map, "main rosalloc space",
+                                                          kDefaultStartingSize, initial_size,
+                                                          growth_limit, capacity, low_memory_mode_,
+                                                          can_move_objects);
+    CHECK(main_space_ != nullptr) << "Failed to create rosalloc space";
+  } else {
+    main_space_ = space::DlMallocSpace::CreateFromMemMap(mem_map, "main dlmalloc space",
+                                                          kDefaultStartingSize, initial_size,
+                                                          growth_limit, capacity,
+                                                          can_move_objects);
+    CHECK(main_space_ != nullptr) << "Failed to create dlmalloc space";
+  }
+  main_space_->SetFootprintLimit(main_space_->Capacity());
+  VLOG(heap) << "Created main space " << main_space_;
+}
+
 void Heap::ChangeAllocator(AllocatorType allocator) {
   if (current_allocator_ != allocator) {
     // These two allocators are only used internally and don't have any entrypoints.
@@ -360,13 +376,13 @@
 }
 
 void Heap::DisableCompaction() {
-  if (IsCompactingGC(post_zygote_collector_type_)) {
-    post_zygote_collector_type_ = kCollectorTypeCMS;
+  if (IsMovingGc(foreground_collector_type_)) {
+    foreground_collector_type_  = kCollectorTypeCMS;
   }
-  if (IsCompactingGC(background_collector_type_)) {
-    background_collector_type_ = post_zygote_collector_type_;
+  if (IsMovingGc(background_collector_type_)) {
+    background_collector_type_ = foreground_collector_type_;
   }
-  TransitionCollector(post_zygote_collector_type_);
+  TransitionCollector(foreground_collector_type_);
 }
 
 std::string Heap::SafeGetClassDescriptor(mirror::Class* klass) {
@@ -428,14 +444,6 @@
         break;
       }
     }
-    if (space == nullptr) {
-      if (allocator_mem_map_.get() == nullptr || !allocator_mem_map_->HasAddress(obj)) {
-        stream << "obj " << obj << " not a valid heap address";
-        return;
-      } else if (allocator_mem_map_.get() != nullptr) {
-        allocator_mem_map_->Protect(PROT_READ | PROT_WRITE);
-      }
-    }
     // Unprotect all the spaces.
     for (const auto& space : continuous_spaces_) {
       mprotect(space->Begin(), space->Capacity(), PROT_READ | PROT_WRITE);
@@ -478,7 +486,7 @@
   ScopedThreadStateChange tsc(self, kWaitingForGcToComplete);
   MutexLock mu(self, *gc_complete_lock_);
   ++disable_moving_gc_count_;
-  if (IsCompactingGC(collector_type_running_)) {
+  if (IsMovingGc(collector_type_running_)) {
     WaitForGcToCompleteLocked(self);
   }
 }
@@ -496,12 +504,12 @@
       // Start at index 1 to avoid "is always false" warning.
       // Have iteration 1 always transition the collector.
       TransitionCollector((((i & 1) == 1) == (process_state_ == kProcessStateJankPerceptible))
-                          ? post_zygote_collector_type_ : background_collector_type_);
+                          ? foreground_collector_type_ : background_collector_type_);
       usleep(kCollectorTransitionStressWait);
     }
     if (process_state_ == kProcessStateJankPerceptible) {
       // Transition back to foreground right away to prevent jank.
-      RequestCollectorTransition(post_zygote_collector_type_, 0);
+      RequestCollectorTransition(foreground_collector_type_, 0);
     } else {
       // Don't delay for debug builds since we may want to stress test the GC.
       RequestCollectorTransition(background_collector_type_, kIsDebugBuild ? 0 :
@@ -626,6 +634,10 @@
     }
     if (continuous_space == main_space_) {
       main_space_ = nullptr;
+    } else if (continuous_space == bump_pointer_space_) {
+      bump_pointer_space_ = nullptr;
+    } else if (continuous_space == temp_space_) {
+      temp_space_ = nullptr;
     }
   } else {
     DCHECK(space->IsDiscontinuousSpace());
@@ -967,8 +979,10 @@
       managed_reclaimed += alloc_space->Trim();
     }
   }
-  total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated() -
-      bump_pointer_space_->Size();
+  total_alloc_space_allocated = GetBytesAllocated() - large_object_space_->GetBytesAllocated();
+  if (bump_pointer_space_ != nullptr) {
+    total_alloc_space_allocated -= bump_pointer_space_->Size();
+  }
   const float managed_utilization = static_cast<float>(total_alloc_space_allocated) /
       static_cast<float>(total_alloc_space_size);
   uint64_t gc_heap_end_ns = NanoTime();
@@ -1400,7 +1414,7 @@
   ScopedThreadStateChange tsc(self, kWaitingPerformingGc);
   Locks::mutator_lock_->AssertNotHeld(self);
   const bool copying_transition =
-      IsCompactingGC(background_collector_type_) || IsCompactingGC(post_zygote_collector_type_);
+      IsMovingGc(background_collector_type_) || IsMovingGc(foreground_collector_type_);
   // Busy wait until we can GC (StartGC can fail if we have a non-zero
   // compacting_gc_disable_count_; this should rarely occur).
   for (;;) {
@@ -1409,6 +1423,13 @@
       MutexLock mu(self, *gc_complete_lock_);
       // Ensure there is only one GC at a time.
       WaitForGcToCompleteLocked(self);
+      // If someone else beat us to it and changed the collector before we could, exit.
+      // This is safe to do before the suspend all since we set the collector_type_running_ before
+      // we exit the loop. If another thread attempts to do the heap transition before we exit,
+      // then it would get blocked on WaitForGcToCompleteLocked.
+      if (collector_type == collector_type_) {
+        return;
+      }
       // GC can be disabled if someone has a used GetPrimitiveArrayCritical but not yet released.
       if (!copying_transition || disable_moving_gc_count_ == 0) {
         // TODO: Not hard code in semi-space collector?
@@ -1424,42 +1445,20 @@
     case kCollectorTypeSS:
       // Fall-through.
     case kCollectorTypeGSS: {
-      mprotect(temp_space_->Begin(), temp_space_->Capacity(), PROT_READ | PROT_WRITE);
-      CHECK(main_space_ != nullptr);
-      Compact(temp_space_, main_space_);
-      DCHECK(allocator_mem_map_.get() == nullptr);
-      allocator_mem_map_.reset(main_space_->ReleaseMemMap());
-      madvise(main_space_->Begin(), main_space_->Size(), MADV_DONTNEED);
-      // RemoveSpace does not delete the removed space.
-      space::Space* old_space = main_space_;
-      RemoveSpace(old_space);
-      delete old_space;
+      if (!IsMovingGc(collector_type_)) {
+        // We are transitioning from a non-moving GC to a moving GC. Since we copied from the
+        // bump pointer space during the last transition, it will be protected.
+        bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+        Compact(bump_pointer_space_, main_space_);
+      }
       break;
     }
     case kCollectorTypeMS:
       // Fall through.
     case kCollectorTypeCMS: {
-      if (IsCompactingGC(collector_type_)) {
-        // TODO: Use mem-map from temp space?
-        MemMap* mem_map = allocator_mem_map_.release();
-        CHECK(mem_map != nullptr);
-        size_t starting_size = kDefaultStartingSize;
-        size_t initial_size = kDefaultInitialSize;
-        mprotect(mem_map->Begin(), initial_size, PROT_READ | PROT_WRITE);
-        CHECK(main_space_ == nullptr);
-        if (kUseRosAlloc) {
-          main_space_ =
-              space::RosAllocSpace::CreateFromMemMap(mem_map, "alloc space", starting_size,
-                                                     initial_size, mem_map->Size(),
-                                                     mem_map->Size(), low_memory_mode_);
-        } else {
-          main_space_ =
-              space::DlMallocSpace::CreateFromMemMap(mem_map, "alloc space", starting_size,
-                                                     initial_size, mem_map->Size(),
-                                                     mem_map->Size());
-        }
-        main_space_->SetFootprintLimit(main_space_->Capacity());
-        AddSpace(main_space_);
+      if (IsMovingGc(collector_type_)) {
+        // Compact to the main space from the bump pointer space; no need to swap semi-spaces.
+        main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(main_space_, bump_pointer_space_);
       }
       break;
@@ -1655,11 +1654,12 @@
   VLOG(heap) << "Starting PreZygoteFork";
   // Trim the pages at the end of the non moving space.
   non_moving_space_->Trim();
+  // The end of the non-moving space may be protected; unprotect it so that we can copy the zygote
+  // there.
   non_moving_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
   // Change the collector to the post zygote one.
-  ChangeCollector(post_zygote_collector_type_);
-  // TODO: Delete bump_pointer_space_ and temp_pointer_space_?
-  if (semi_space_collector_ != nullptr) {
+  if (kCompactZygote) {
+    DCHECK(semi_space_collector_ != nullptr);
     // Temporarily disable rosalloc verification because the zygote
     // compaction will mess up the rosalloc internal metadata.
     ScopedDisableRosAllocVerification disable_rosalloc_verif(this);
@@ -1669,18 +1669,47 @@
     space::BumpPointerSpace target_space("zygote bump space", non_moving_space_->End(),
                                          non_moving_space_->Limit());
     // Compact the bump pointer space to a new zygote bump pointer space.
-    temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
-    zygote_collector.SetFromSpace(bump_pointer_space_);
+    bool reset_main_space = false;
+    if (IsMovingGc(collector_type_)) {
+      zygote_collector.SetFromSpace(bump_pointer_space_);
+    } else {
+      CHECK(main_space_ != nullptr);
+      // Copy from the main space.
+      zygote_collector.SetFromSpace(main_space_);
+      reset_main_space = true;
+    }
     zygote_collector.SetToSpace(&target_space);
+
+    Runtime::Current()->GetThreadList()->SuspendAll();
     zygote_collector.Run(kGcCauseCollectorTransition, false);
-    CHECK(temp_space_->IsEmpty());
+    if (IsMovingGc(collector_type_)) {
+      SwapSemiSpaces();
+    }
+    Runtime::Current()->GetThreadList()->ResumeAll();
+
+    if (reset_main_space) {
+      main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+      madvise(main_space_->Begin(), main_space_->Capacity(), MADV_DONTNEED);
+      MemMap* mem_map = main_space_->ReleaseMemMap();
+      RemoveSpace(main_space_);
+      delete main_space_;
+      main_space_ = nullptr;
+      CreateMainMallocSpace(mem_map, kDefaultInitialSize, mem_map->Size(), mem_map->Size());
+      AddSpace(main_space_);
+    } else {
+      bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
+    }
+    if (temp_space_ != nullptr) {
+      CHECK(temp_space_->IsEmpty());
+    }
     total_objects_freed_ever_ += semi_space_collector_->GetFreedObjects();
     total_bytes_freed_ever_ += semi_space_collector_->GetFreedBytes();
     // Update the end and write out image.
     non_moving_space_->SetEnd(target_space.End());
     non_moving_space_->SetLimit(target_space.Limit());
-    VLOG(heap) << "Zygote size " << non_moving_space_->Size() << " bytes";
+    VLOG(heap) << "Zygote space size " << non_moving_space_->Size() << " bytes";
   }
+  ChangeCollector(foreground_collector_type_);
   // Save the old space so that we can remove it after we complete creating the zygote space.
   space::MallocSpace* old_alloc_space = non_moving_space_;
   // Turn the current alloc space into a zygote space and obtain the new alloc space composed of
@@ -1700,18 +1729,12 @@
   }
   space::ZygoteSpace* zygote_space = old_alloc_space->CreateZygoteSpace("alloc space",
                                                                         low_memory_mode_,
-                                                                        &main_space_);
+                                                                        &non_moving_space_);
   delete old_alloc_space;
   CHECK(zygote_space != nullptr) << "Failed creating zygote space";
   AddSpace(zygote_space, false);
-  CHECK(main_space_ != nullptr);
-  if (main_space_->IsRosAllocSpace()) {
-    rosalloc_space_ = main_space_->AsRosAllocSpace();
-  } else if (main_space_->IsDlMallocSpace()) {
-    dlmalloc_space_ = main_space_->AsDlMallocSpace();
-  }
-  main_space_->SetFootprintLimit(main_space_->Capacity());
-  AddSpace(main_space_);
+  non_moving_space_->SetFootprintLimit(non_moving_space_->Capacity());
+  AddSpace(non_moving_space_);
   have_zygote_space_ = true;
   // Enable large object space allocations.
   large_object_threshold_ = kDefaultLargeObjectThreshold;
@@ -1721,23 +1744,6 @@
   CHECK(mod_union_table != nullptr) << "Failed to create zygote space mod-union table";
   AddModUnionTable(mod_union_table);
   if (collector::SemiSpace::kUseRememberedSet) {
-    // Add a new remembered set for the new main space.
-    accounting::RememberedSet* main_space_rem_set =
-        new accounting::RememberedSet("Main space remembered set", this, main_space_);
-    CHECK(main_space_rem_set != nullptr) << "Failed to create main space remembered set";
-    AddRememberedSet(main_space_rem_set);
-  }
-  // Can't use RosAlloc for non moving space due to thread local buffers.
-  // TODO: Non limited space for non-movable objects?
-  MemMap* mem_map = post_zygote_non_moving_space_mem_map_.release();
-  space::MallocSpace* new_non_moving_space =
-      space::DlMallocSpace::CreateFromMemMap(mem_map, "Non moving dlmalloc space", kPageSize,
-                                             2 * MB, mem_map->Size(), mem_map->Size());
-  AddSpace(new_non_moving_space, false);
-  CHECK(new_non_moving_space != nullptr) << "Failed to create new non-moving space";
-  new_non_moving_space->SetFootprintLimit(new_non_moving_space->Capacity());
-  non_moving_space_ = new_non_moving_space;
-  if (collector::SemiSpace::kUseRememberedSet) {
     // Add a new remembered set for the post-zygote non-moving space.
     accounting::RememberedSet* post_zygote_non_moving_space_rem_set =
         new accounting::RememberedSet("Post-zygote non-moving space remembered set", this,
@@ -1775,9 +1781,9 @@
 }
 
 void Heap::SwapSemiSpaces() {
-  // Swap the spaces so we allocate into the space which we just evacuated.
+  CHECK(bump_pointer_space_ != nullptr);
+  CHECK(temp_space_ != nullptr);
   std::swap(bump_pointer_space_, temp_space_);
-  bump_pointer_space_->Clear();
 }
 
 void Heap::Compact(space::ContinuousMemMapAllocSpace* target_space,
@@ -1820,7 +1826,7 @@
     MutexLock mu(self, *gc_complete_lock_);
     // Ensure there is only one GC at a time.
     WaitForGcToCompleteLocked(self);
-    compacting_gc = IsCompactingGC(collector_type_);
+    compacting_gc = IsMovingGc(collector_type_);
     // GC can be disabled if someone has a used GetPrimitiveArrayCritical.
     if (compacting_gc && disable_moving_gc_count_ != 0) {
       LOG(WARNING) << "Skipping GC due to disable moving GC count " << disable_moving_gc_count_;
@@ -1875,7 +1881,14 @@
       << "Could not find garbage collector with collector_type="
       << static_cast<size_t>(collector_type_) << " and gc_type=" << gc_type;
   ATRACE_BEGIN(StringPrintf("%s %s GC", PrettyCause(gc_cause), collector->GetName()).c_str());
-  collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
+  if (compacting_gc) {
+    runtime->GetThreadList()->SuspendAll();
+    collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
+    SwapSemiSpaces();
+    runtime->GetThreadList()->ResumeAll();
+  } else {
+    collector->Run(gc_cause, clear_soft_references || runtime->IsZygote());
+  }
   total_objects_freed_ever_ += collector->GetFreedObjects();
   total_bytes_freed_ever_ += collector->GetFreedBytes();
   RequestHeapTrim();
@@ -2478,25 +2491,11 @@
 
 bool Heap::IsMovableObject(const mirror::Object* obj) const {
   if (kMovingCollector) {
-    DCHECK(!IsInTempSpace(obj));
-    if (bump_pointer_space_->HasAddress(obj)) {
-      return true;
+    space::Space* space = FindContinuousSpaceFromObject(obj, true);
+    if (space != nullptr) {
+      // TODO: Check large object?
+      return space->CanMoveObjects();
     }
-    // TODO: Refactor this logic into the space itself?
-    // Objects in the main space are only copied during background -> foreground transitions or
-    // visa versa.
-    if (main_space_ != nullptr && main_space_->HasAddress(obj) &&
-        (IsCompactingGC(background_collector_type_) ||
-            IsCompactingGC(post_zygote_collector_type_))) {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool Heap::IsInTempSpace(const mirror::Object* obj) const {
-  if (temp_space_->HasAddress(obj) && !temp_space_->Contains(obj)) {
-    return true;
   }
   return false;
 }
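
Worth noting in the TransitionCollector change: after waiting for a running GC, another thread may already have performed the same transition, so the current collector type is re-checked under gc_complete_lock_ before proceeding. A standalone model of that double-check (locking simplified; names mirror the diff):

    #include <iostream>
    #include <mutex>

    enum CollectorType { kCollectorTypeCMS, kCollectorTypeSS };

    std::mutex gc_complete_lock;                       // models gc_complete_lock_
    CollectorType collector_type = kCollectorTypeCMS;  // models collector_type_

    // Returns true if this call performed the transition, false if another thread beat us to it.
    bool TransitionCollectorModel(CollectorType desired) {
      std::lock_guard<std::mutex> mu(gc_complete_lock);  // WaitForGcToCompleteLocked happens here
      if (desired == collector_type) {
        return false;  // someone else already changed the collector; nothing to do
      }
      collector_type = desired;  // perform the transition
      return true;
    }

    int main() {
      std::cout << TransitionCollectorModel(kCollectorTypeSS)   // 1: we transitioned
                << TransitionCollectorModel(kCollectorTypeSS)   // 0: already done
                << "\n";
    }
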
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index a8989ec..912cf7d 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -150,7 +150,7 @@
   explicit Heap(size_t initial_size, size_t growth_limit, size_t min_free,
                 size_t max_free, double target_utilization, size_t capacity,
                 const std::string& original_image_file_name,
-                CollectorType post_zygote_collector_type, CollectorType background_collector_type,
+                CollectorType foreground_collector_type, CollectorType background_collector_type,
                 size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
                 size_t long_pause_threshold, size_t long_gc_threshold,
                 bool ignore_max_footprint, bool use_tlab, bool verify_pre_gc_heap,
@@ -196,8 +196,6 @@
   void VisitObjects(ObjectCallback callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
-  void SwapSemiSpaces() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   void CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void ThrowOutOfMemoryError(size_t byte_count, bool large_object_allocation);
@@ -249,10 +247,6 @@
   // Returns true if there is any chance that the object (obj) will move.
   bool IsMovableObject(const mirror::Object* obj) const;
 
-  // Returns true if an object is in the temp space, if this happens its usually indicative of
-  // compaction related errors.
-  bool IsInTempSpace(const mirror::Object* obj) const;
-
   // Enables us to prevent a compacting GC from occurring until objects are released.
   void IncrementDisableMovingGC(Thread* self);
   void DecrementDisableMovingGC(Thread* self);
@@ -568,7 +562,8 @@
 
  private:
   void Compact(space::ContinuousMemMapAllocSpace* target_space,
-               space::ContinuousMemMapAllocSpace* source_space);
+               space::ContinuousMemMapAllocSpace* source_space)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void FinishGC(Thread* self, collector::GcType gc_type) LOCKS_EXCLUDED(gc_complete_lock_);
 
@@ -580,7 +575,7 @@
   static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
     return AllocatorHasAllocationStack(allocator_type);
   }
-  static bool IsCompactingGC(CollectorType collector_type) {
+  static bool IsMovingGc(CollectorType collector_type) {
     return collector_type == kCollectorTypeSS || collector_type == kCollectorTypeGSS ||
         collector_type == kCollectorTypeCC;
   }
@@ -609,6 +604,10 @@
                                size_t bytes)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Need to do this with mutators paused so that somebody doesn't accidentally allocate into the
+  // wrong space.
+  void SwapSemiSpaces() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Try to allocate a number of bytes, this function never does any GCs. Needs to be inlined so
   // that the switch statement is constant optimized in the entrypoints.
   template <const bool kInstrumented, const bool kGrow>
@@ -668,6 +667,10 @@
   // Find a collector based on GC type.
   collector::GarbageCollector* FindCollectorByGcType(collector::GcType gc_type);
 
+  // Create the main free list space, typically either a RosAlloc space or DlMalloc space.
+  void CreateMainMallocSpace(MemMap* mem_map, size_t initial_size, size_t growth_limit,
+                             size_t capacity);
+
   // Given the current contents of the alloc space, increase the allowed heap footprint to match
   // the target utilization ratio.  This should only be called immediately after a full garbage
   // collection.
@@ -737,17 +740,10 @@
   // A remembered set remembers all of the references from its space to the target space.
   SafeMap<space::Space*, accounting::RememberedSet*> remembered_sets_;
 
-  // Keep the free list allocator mem map lying around when we transition to background so that we
-  // don't have to worry about virtual address space fragmentation.
-  UniquePtr<MemMap> allocator_mem_map_;
-
-  // The mem-map which we will use for the non-moving space after the zygote is done forking:
-  UniquePtr<MemMap> post_zygote_non_moving_space_mem_map_;
-
   // The current collector type.
   CollectorType collector_type_;
-  // Which collector we will switch to after zygote fork.
-  CollectorType post_zygote_collector_type_;
+  // Which collector we use when the app is in the foreground.
+  CollectorType foreground_collector_type_;
   // Which collector we will use when the app is notified of a transition to background.
   CollectorType background_collector_type_;
   // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_.
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index a955cc8..6bd0526 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -38,6 +38,10 @@
   return new BumpPointerSpace(name, mem_map.release());
 }
 
+BumpPointerSpace* BumpPointerSpace::CreateFromMemMap(const std::string& name, MemMap* mem_map) {
+  return new BumpPointerSpace(name, mem_map);
+}
+
 BumpPointerSpace::BumpPointerSpace(const std::string& name, byte* begin, byte* limit)
     : ContinuousMemMapAllocSpace(name, nullptr, begin, begin, limit,
                                  kGcRetentionPolicyAlwaysCollect),
@@ -61,9 +65,6 @@
 void BumpPointerSpace::Clear() {
   // Release the pages back to the operating system.
   CHECK_NE(madvise(Begin(), Limit() - Begin(), MADV_DONTNEED), -1) << "madvise failed";
-}
-
-void BumpPointerSpace::Reset() {
   // Reset the end of the space back to the beginning, we move the end forward as we allocate
   // objects.
   SetEnd(Begin());
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 3ab5df4..ecfeae5 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -43,6 +43,7 @@
   // guaranteed to be granted; if it is required, the caller should call Begin on the returned
   // space to confirm the request was granted.
   static BumpPointerSpace* Create(const std::string& name, size_t capacity, byte* requested_begin);
+  static BumpPointerSpace* CreateFromMemMap(const std::string& name, MemMap* mem_map);
 
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -92,11 +93,8 @@
     return nullptr;
   }
 
-  // Madvise the memory back to the OS.
-  void Clear() OVERRIDE;
-
-  // Reset the pointer to the start of the space.
-  void Reset() OVERRIDE LOCKS_EXCLUDED(block_lock_);
+  // Reset the space to empty.
+  void Clear() OVERRIDE LOCKS_EXCLUDED(block_lock_);
 
   void Dump(std::ostream& os) const;
 
@@ -113,6 +111,9 @@
     return Begin() == End();
   }
 
+  bool CanMoveObjects() const OVERRIDE {
+    return true;
+  }
 
   bool Contains(const mirror::Object* obj) const {
     const byte* byte_obj = reinterpret_cast<const byte*>(obj);
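
Movability is now a virtual property of the space itself: true for bump pointer spaces, false for image and large object spaces, and fixed at creation time for malloc spaces. This is what the simplified Heap::IsMovableObject above queries. A standalone model of the hook (not ART code):

    #include <iostream>
    #include <memory>
    #include <vector>

    struct SpaceModel {
      virtual ~SpaceModel() {}
      virtual bool CanMoveObjects() const = 0;
      virtual const char* Name() const = 0;
    };

    struct BumpPointerSpaceModel : SpaceModel {
      bool CanMoveObjects() const override { return true; }   // semi-space copies out of here
      const char* Name() const override { return "bump pointer"; }
    };

    struct ImageSpaceModel : SpaceModel {
      bool CanMoveObjects() const override { return false; }  // image objects never move
      const char* Name() const override { return "image"; }
    };

    struct MallocSpaceModel : SpaceModel {
      explicit MallocSpaceModel(bool movable) : movable_(movable) {}
      bool CanMoveObjects() const override { return movable_; }  // fixed at creation time
      const char* Name() const override { return "malloc"; }
      const bool movable_;
    };

    int main() {
      std::vector<std::unique_ptr<SpaceModel>> spaces;
      spaces.emplace_back(new BumpPointerSpaceModel);
      spaces.emplace_back(new ImageSpaceModel);
      spaces.emplace_back(new MallocSpaceModel(true));
      for (const auto& s : spaces) {
        std::cout << s->Name() << " movable=" << s->CanMoveObjects() << "\n";
      }
    }
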
diff --git a/runtime/gc/space/dlmalloc_space-inl.h b/runtime/gc/space/dlmalloc_space-inl.h
index 02d8b54..4c8a35e 100644
--- a/runtime/gc/space/dlmalloc_space-inl.h
+++ b/runtime/gc/space/dlmalloc_space-inl.h
@@ -52,7 +52,7 @@
 inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(Thread* /*self*/, size_t num_bytes,
                                                                size_t* bytes_allocated,
                                                                size_t* usable_size) {
-  mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_for_alloc_, num_bytes));
+  mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_, num_bytes));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
       CHECK(Contains(result)) << "Allocation (" << reinterpret_cast<void*>(result)
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index 60f566c..be88b33 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -36,15 +36,19 @@
 template class ValgrindMallocSpace<DlMallocSpace, void*>;
 
 DlMallocSpace::DlMallocSpace(const std::string& name, MemMap* mem_map, void* mspace, byte* begin,
-                             byte* end, byte* limit, size_t growth_limit)
-    : MallocSpace(name, mem_map, begin, end, limit, growth_limit),
-      mspace_(mspace), mspace_for_alloc_(mspace) {
+                             byte* end, byte* limit, size_t growth_limit,
+                             bool can_move_objects, size_t starting_size,
+                             size_t initial_size)
+    : MallocSpace(name, mem_map, begin, end, limit, growth_limit, true, can_move_objects,
+                  starting_size, initial_size),
+      mspace_(mspace) {
   CHECK(mspace != NULL);
 }
 
 DlMallocSpace* DlMallocSpace::CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                                size_t starting_size, size_t initial_size,
-                                               size_t growth_limit, size_t capacity) {
+                                               size_t growth_limit, size_t capacity,
+                                               bool can_move_objects) {
   DCHECK(mem_map != nullptr);
   void* mspace = CreateMspace(mem_map->Begin(), starting_size, initial_size);
   if (mspace == nullptr) {
@@ -62,14 +66,17 @@
   byte* begin = mem_map->Begin();
   if (Runtime::Current()->RunningOnValgrind()) {
     return new ValgrindMallocSpace<DlMallocSpace, void*>(
-        name, mem_map, mspace, begin, end, begin + capacity, growth_limit, initial_size);
+        name, mem_map, mspace, begin, end, begin + capacity, growth_limit, initial_size,
+        can_move_objects, starting_size);
   } else {
-    return new DlMallocSpace(name, mem_map, mspace, begin, end, begin + capacity, growth_limit);
+    return new DlMallocSpace(name, mem_map, mspace, begin, end, begin + capacity, growth_limit,
+                             can_move_objects, starting_size, initial_size);
   }
 }
 
-DlMallocSpace* DlMallocSpace::Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                                     size_t capacity, byte* requested_begin) {
+DlMallocSpace* DlMallocSpace::Create(const std::string& name, size_t initial_size,
+                                     size_t growth_limit, size_t capacity, byte* requested_begin,
+                                     bool can_move_objects) {
   uint64_t start_time = 0;
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     start_time = NanoTime();
@@ -93,7 +100,7 @@
     return nullptr;
   }
   DlMallocSpace* space = CreateFromMemMap(mem_map, name, starting_size, initial_size,
-                                          growth_limit, capacity);
+                                          growth_limit, capacity, can_move_objects);
   // We start out with only the initial size possibly containing objects.
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "DlMallocSpace::Create exiting (" << PrettyDuration(NanoTime() - start_time)
@@ -143,8 +150,10 @@
 
 MallocSpace* DlMallocSpace::CreateInstance(const std::string& name, MemMap* mem_map,
                                            void* allocator, byte* begin, byte* end,
-                                           byte* limit, size_t growth_limit) {
-  return new DlMallocSpace(name, mem_map, allocator, begin, end, limit, growth_limit);
+                                           byte* limit, size_t growth_limit,
+                                           bool can_move_objects) {
+  return new DlMallocSpace(name, mem_map, allocator, begin, end, limit, growth_limit,
+                           can_move_objects, starting_size_, initial_size_);
 }
 
 size_t DlMallocSpace::Free(Thread* self, mirror::Object* ptr) {
@@ -280,13 +289,13 @@
 }
 
 void DlMallocSpace::Clear() {
+  size_t footprint_limit = GetFootprintLimit();
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
-  GetLiveBitmap()->Clear();
-  GetMarkBitmap()->Clear();
-}
-
-void DlMallocSpace::Reset() {
-  // TODO: Delete and create new mspace here.
+  live_bitmap_->Clear();
+  mark_bitmap_->Clear();
+  end_ = Begin() + starting_size_;
+  mspace_ = CreateMspace(mem_map_->Begin(), starting_size_, initial_size_);
+  SetFootprintLimit(footprint_limit);
 }
 
 #ifndef NDEBUG
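
DlMallocSpace::Clear() (and RosAllocSpace::Clear() further below) now rebuilds the allocator rather than deferring to a separate Reset(). Since the footprint limit lives inside the allocator being destroyed, it is captured first and re-applied to the fresh one. A standalone model of that save/rebuild/restore ordering:

    #include <cstddef>
    #include <iostream>

    struct AllocatorModel { std::size_t footprint_limit; };

    struct MallocSpaceModel {
      AllocatorModel* allocator;
      MallocSpaceModel() : allocator(new AllocatorModel{64 * 1024}) {}
      std::size_t GetFootprintLimit() const { return allocator->footprint_limit; }
      void SetFootprintLimit(std::size_t limit) { allocator->footprint_limit = limit; }
      void Clear() {
        std::size_t footprint_limit = GetFootprintLimit();  // capture before the allocator dies
        delete allocator;                                   // pages released, metadata discarded
        allocator = new AllocatorModel{0};                  // rebuilt over the starting size
        SetFootprintLimit(footprint_limit);                 // caller-visible limit survives
      }
    };

    int main() {
      MallocSpaceModel space;
      space.Clear();
      std::cout << space.GetFootprintLimit() << "\n";  // 65536
    }
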
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 76c4489..accd26b 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -36,14 +36,15 @@
   // Create a DlMallocSpace from an existing mem_map.
   static DlMallocSpace* CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                          size_t starting_size, size_t initial_size,
-                                         size_t growth_limit, size_t capacity);
+                                         size_t growth_limit, size_t capacity,
+                                         bool can_move_objects);
 
   // Create a DlMallocSpace with the requested sizes. The requested
   // base address is not guaranteed to be granted, if it is required,
   // the caller should call Begin on the returned space to confirm the
   // request was granted.
   static DlMallocSpace* Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                               size_t capacity, byte* requested_begin);
+                               size_t capacity, byte* requested_begin, bool can_move_objects);
 
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -107,13 +108,13 @@
   void SetFootprintLimit(size_t limit) OVERRIDE;
 
   MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                              byte* begin, byte* end, byte* limit, size_t growth_limit);
+                              byte* begin, byte* end, byte* limit, size_t growth_limit,
+                              bool can_move_objects);
 
   uint64_t GetBytesAllocated() OVERRIDE;
   uint64_t GetObjectsAllocated() OVERRIDE;
 
-  void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+  virtual void Clear() OVERRIDE;
 
   bool IsDlMallocSpace() const OVERRIDE {
     return true;
@@ -125,7 +126,8 @@
 
  protected:
   DlMallocSpace(const std::string& name, MemMap* mem_map, void* mspace, byte* begin, byte* end,
-                byte* limit, size_t growth_limit);
+                byte* limit, size_t growth_limit, bool can_move_objects, size_t starting_size,
+                size_t initial_size);
 
  private:
   mirror::Object* AllocWithoutGrowthLocked(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -142,11 +144,7 @@
   static const size_t kChunkOverhead = kWordSize;
 
   // Underlying malloc space.
-  void* const mspace_;
-
-  // An mspace pointer used for allocation. Equals  mspace_ or nullptr after InvalidateAllocator()
-  // is called.
-  void* mspace_for_alloc_;
+  void* mspace_;
 
   friend class collector::MarkSweep;
 
diff --git a/runtime/gc/space/dlmalloc_space_base_test.cc b/runtime/gc/space/dlmalloc_space_base_test.cc
index 508d869..129eace 100644
--- a/runtime/gc/space/dlmalloc_space_base_test.cc
+++ b/runtime/gc/space/dlmalloc_space_base_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_BASE(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/dlmalloc_space_random_test.cc b/runtime/gc/space/dlmalloc_space_random_test.cc
index 43a1bf0..c4f8bae 100644
--- a/runtime/gc/space/dlmalloc_space_random_test.cc
+++ b/runtime/gc/space/dlmalloc_space_random_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_RANDOM(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/dlmalloc_space_static_test.cc b/runtime/gc/space/dlmalloc_space_static_test.cc
index 4fbc81e..edaa198 100644
--- a/runtime/gc/space/dlmalloc_space_static_test.cc
+++ b/runtime/gc/space/dlmalloc_space_static_test.cc
@@ -23,7 +23,7 @@
 
 MallocSpace* CreateDlMallocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
-  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin);
+  return DlMallocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin, false);
 }
 
 TEST_SPACE_CREATE_FN_STATIC(DlMallocSpace, CreateDlMallocSpace)
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index 116c498..6b63d10 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -75,6 +75,10 @@
   void Sweep(bool /* swap_bitmaps */, size_t* /* freed_objects */, size_t* /* freed_bytes */) {
   }
 
+  bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
+
  private:
   // Tries to initialize an ImageSpace from the given image path,
   // returning NULL on error.
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index eb01325..18e518f 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -75,6 +75,10 @@
 
   void Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes);
 
+  virtual bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
+
  protected:
   explicit LargeObjectSpace(const std::string& name);
 
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index dac043e..c3ca096 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -37,10 +37,12 @@
 
 MallocSpace::MallocSpace(const std::string& name, MemMap* mem_map,
                          byte* begin, byte* end, byte* limit, size_t growth_limit,
-                         bool create_bitmaps)
+                         bool create_bitmaps, bool can_move_objects, size_t starting_size,
+                         size_t initial_size)
     : ContinuousMemMapAllocSpace(name, mem_map, begin, end, limit, kGcRetentionPolicyAlwaysCollect),
       recent_free_pos_(0), lock_("allocation space lock", kAllocSpaceLock),
-      growth_limit_(growth_limit) {
+      growth_limit_(growth_limit), can_move_objects_(can_move_objects),
+      starting_size_(starting_size), initial_size_(initial_size) {
   if (create_bitmaps) {
     size_t bitmap_index = bitmap_index_++;
     static const uintptr_t kGcCardSize = static_cast<uintptr_t>(accounting::CardTable::kCardSize);
@@ -201,7 +203,7 @@
     CHECK_MEMORY_CALL(mprotect, (end, capacity - initial_size, PROT_NONE), alloc_space_name);
   }
   *out_malloc_space = CreateInstance(alloc_space_name, mem_map.release(), allocator, end_, end,
-                                     limit_, growth_limit);
+                                     limit_, growth_limit, CanMoveObjects());
   SetLimit(End());
   live_bitmap_->SetHeapLimit(reinterpret_cast<uintptr_t>(End()));
   CHECK_EQ(live_bitmap_->HeapLimit(), reinterpret_cast<uintptr_t>(End()));
diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h
index fbcee5f..dd4e5d4 100644
--- a/runtime/gc/space/malloc_space.h
+++ b/runtime/gc/space/malloc_space.h
@@ -114,7 +114,8 @@
   void SetGrowthLimit(size_t growth_limit);
 
   virtual MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                                      byte* begin, byte* end, byte* limit, size_t growth_limit) = 0;
+                                      byte* begin, byte* end, byte* limit, size_t growth_limit,
+                                      bool can_move_objects) = 0;
 
   // Splits ourself into a zygote space and new malloc space which has our unused memory. When true,
   // the low memory mode argument specifies that the heap wishes the created space to be more
@@ -127,9 +128,14 @@
   // Returns the class of a recently freed object.
   mirror::Class* FindRecentFreedObject(const mirror::Object* obj);
 
+  bool CanMoveObjects() const OVERRIDE {
+    return can_move_objects_;
+  }
+
  protected:
   MallocSpace(const std::string& name, MemMap* mem_map, byte* begin, byte* end,
-              byte* limit, size_t growth_limit, bool create_bitmaps = true);
+              byte* limit, size_t growth_limit, bool create_bitmaps, bool can_move_objects,
+              size_t starting_size, size_t initial_size);
 
   static MemMap* CreateMemMap(const std::string& name, size_t starting_size, size_t* initial_size,
                               size_t* growth_limit, size_t* capacity, byte* requested_begin);
@@ -167,6 +173,13 @@
   // one time by a call to ClearGrowthLimit.
   size_t growth_limit_;
 
+  // True if objects in the space are movable.
+  const bool can_move_objects_;
+
+  // Starting and initial sizes, used when the space is reset.
+  const size_t starting_size_;
+  const size_t initial_size_;
+
  private:
   static void SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index 5c5e7f8..afac2a2 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -34,19 +34,23 @@
 
 static constexpr bool kPrefetchDuringRosAllocFreeList = true;
 
-template class ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
+// TODO: Fix
+// template class ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>;
 
 RosAllocSpace::RosAllocSpace(const std::string& name, MemMap* mem_map,
                              art::gc::allocator::RosAlloc* rosalloc, byte* begin, byte* end,
-                             byte* limit, size_t growth_limit)
-    : MallocSpace(name, mem_map, begin, end, limit, growth_limit), rosalloc_(rosalloc) {
-  CHECK(rosalloc != NULL);
+                             byte* limit, size_t growth_limit, bool can_move_objects,
+                             size_t starting_size, size_t initial_size, bool low_memory_mode)
+    : MallocSpace(name, mem_map, begin, end, limit, growth_limit, true, can_move_objects,
+                  starting_size, initial_size),
+      rosalloc_(rosalloc), low_memory_mode_(low_memory_mode) {
+  CHECK(rosalloc != nullptr);
 }
 
 RosAllocSpace* RosAllocSpace::CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                                size_t starting_size, size_t initial_size,
                                                size_t growth_limit, size_t capacity,
-                                               bool low_memory_mode) {
+                                               bool low_memory_mode, bool can_move_objects) {
   DCHECK(mem_map != nullptr);
   allocator::RosAlloc* rosalloc = CreateRosAlloc(mem_map->Begin(), starting_size, initial_size,
                                                  capacity, low_memory_mode);
@@ -66,10 +70,10 @@
   // TODO: Fix RosAllocSpace to support valgrind. There is currently some issues with
   // AllocationSize caused by redzones. b/12944686
   if (false && Runtime::Current()->GetHeap()->RunningOnValgrind()) {
-    return new ValgrindMallocSpace<RosAllocSpace, allocator::RosAlloc*>(
-        name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit, initial_size);
+    LOG(FATAL) << "Unimplemented";
   } else {
-    return new RosAllocSpace(name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit);
+    return new RosAllocSpace(name, mem_map, rosalloc, begin, end, begin + capacity, growth_limit,
+                             can_move_objects, starting_size, initial_size, low_memory_mode);
   }
 }
 
@@ -79,7 +83,7 @@
 
 RosAllocSpace* RosAllocSpace::Create(const std::string& name, size_t initial_size,
                                      size_t growth_limit, size_t capacity, byte* requested_begin,
-                                     bool low_memory_mode) {
+                                     bool low_memory_mode, bool can_move_objects) {
   uint64_t start_time = 0;
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     start_time = NanoTime();
@@ -104,7 +108,8 @@
   }
 
   RosAllocSpace* space = CreateFromMemMap(mem_map, name, starting_size, initial_size,
-                                          growth_limit, capacity, low_memory_mode);
+                                          growth_limit, capacity, low_memory_mode,
+                                          can_move_objects);
   // We start out with only the initial size possibly containing objects.
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "RosAllocSpace::Create exiting (" << PrettyDuration(NanoTime() - start_time)
@@ -113,7 +118,8 @@
   return space;
 }
 
-allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start, size_t initial_size,
+allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start,
+                                                   size_t initial_size,
                                                    size_t maximum_size, bool low_memory_mode) {
   // clear errno to allow PLOG on error
   errno = 0;
@@ -154,9 +160,11 @@
 }
 
 MallocSpace* RosAllocSpace::CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                                           byte* begin, byte* end, byte* limit, size_t growth_limit) {
+                                           byte* begin, byte* end, byte* limit, size_t growth_limit,
+                                           bool can_move_objects) {
   return new RosAllocSpace(name, mem_map, reinterpret_cast<allocator::RosAlloc*>(allocator),
-                           begin, end, limit, growth_limit);
+                           begin, end, limit, growth_limit, can_move_objects, starting_size_,
+                           initial_size_, low_memory_mode_);
 }
 
 size_t RosAllocSpace::Free(Thread* self, mirror::Object* ptr) {
@@ -333,13 +341,15 @@
 }
 
 void RosAllocSpace::Clear() {
+  size_t footprint_limit = GetFootprintLimit();
   madvise(GetMemMap()->Begin(), GetMemMap()->Size(), MADV_DONTNEED);
-  GetLiveBitmap()->Clear();
-  GetMarkBitmap()->Clear();
-}
-
-void RosAllocSpace::Reset() {
-  // TODO: Delete and create new mspace here.
+  live_bitmap_->Clear();
+  mark_bitmap_->Clear();
+  end_ = begin_ + starting_size_;
+  delete rosalloc_;
+  rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_, Capacity(),
+                             low_memory_mode_);
+  SetFootprintLimit(footprint_limit);
 }
 
 }  // namespace space
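
The rewritten Clear() above collapses the old Clear()/Reset() pair into a single operation, and the ordering it uses matters. A standalone sketch of that save/rebuild/restore sequence (Alloc and the helpers are illustrative stand-ins, not ART APIs):

#include <cstddef>

// Hypothetical allocator with a growable footprint limit.
struct Alloc {
  size_t footprint_limit;
};

Alloc* CreateAlloc(size_t initial_limit) { return new Alloc{initial_limit}; }

void ClearSpace(Alloc** alloc, size_t initial_limit) {
  // 1. Save the limit first: callers may have raised it since creation,
  //    and a reset must not silently shrink it back.
  size_t saved_limit = (*alloc)->footprint_limit;
  // 2. Tear down and rebuild from the recorded creation parameters, as
  //    Clear() above does with madvise() plus a fresh CreateRosAlloc().
  delete *alloc;
  *alloc = CreateAlloc(initial_limit);
  // 3. Restore the saved limit on the fresh allocator.
  (*alloc)->footprint_limit = saved_limit;
}
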
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index 900e7a9..a156738 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -39,11 +39,12 @@
   // the caller should call Begin on the returned space to confirm the
   // request was granted.
   static RosAllocSpace* Create(const std::string& name, size_t initial_size, size_t growth_limit,
-                               size_t capacity, byte* requested_begin, bool low_memory_mode);
+                               size_t capacity, byte* requested_begin, bool low_memory_mode,
+                               bool can_move_objects);
   static RosAllocSpace* CreateFromMemMap(MemMap* mem_map, const std::string& name,
                                          size_t starting_size, size_t initial_size,
                                          size_t growth_limit, size_t capacity,
-                                         bool low_memory_mode);
+                                         bool low_memory_mode, bool can_move_objects);
 
   mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                                   size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_);
@@ -80,9 +81,10 @@
   void SetFootprintLimit(size_t limit) OVERRIDE;
 
   void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+
   MallocSpace* CreateInstance(const std::string& name, MemMap* mem_map, void* allocator,
-                              byte* begin, byte* end, byte* limit, size_t growth_limit);
+                              byte* begin, byte* end, byte* limit, size_t growth_limit,
+                              bool can_move_objects) OVERRIDE;
 
   uint64_t GetBytesAllocated() OVERRIDE;
   uint64_t GetObjectsAllocated() OVERRIDE;
@@ -110,7 +112,8 @@
 
  protected:
   RosAllocSpace(const std::string& name, MemMap* mem_map, allocator::RosAlloc* rosalloc,
-                byte* begin, byte* end, byte* limit, size_t growth_limit);
+                byte* begin, byte* end, byte* limit, size_t growth_limit, bool can_move_objects,
+                size_t starting_size, size_t initial_size, bool low_memory_mode);
 
  private:
   mirror::Object* AllocCommon(Thread* self, size_t num_bytes, size_t* bytes_allocated,
@@ -132,7 +135,9 @@
       LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_, Locks::thread_list_lock_);
 
   // Underlying rosalloc.
-  allocator::RosAlloc* const rosalloc_;
+  allocator::RosAlloc* rosalloc_;
+
+  const bool low_memory_mode_;
 
   friend class collector::MarkSweep;
 
diff --git a/runtime/gc/space/rosalloc_space_base_test.cc b/runtime/gc/space/rosalloc_space_base_test.cc
index df42076..c3157fa 100644
--- a/runtime/gc/space/rosalloc_space_base_test.cc
+++ b/runtime/gc/space/rosalloc_space_base_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_BASE(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/rosalloc_space_random_test.cc b/runtime/gc/space/rosalloc_space_random_test.cc
index 4d37c9e..864bbc9 100644
--- a/runtime/gc/space/rosalloc_space_random_test.cc
+++ b/runtime/gc/space/rosalloc_space_random_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_RANDOM(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/rosalloc_space_static_test.cc b/runtime/gc/space/rosalloc_space_static_test.cc
index 9f11fd0..c0e2ac8 100644
--- a/runtime/gc/space/rosalloc_space_static_test.cc
+++ b/runtime/gc/space/rosalloc_space_static_test.cc
@@ -23,7 +23,7 @@
 MallocSpace* CreateRosAllocSpace(const std::string& name, size_t initial_size, size_t growth_limit,
                                  size_t capacity, byte* requested_begin) {
   return RosAllocSpace::Create(name, initial_size, growth_limit, capacity, requested_begin,
-                               Runtime::Current()->GetHeap()->IsLowMemoryMode());
+                               Runtime::Current()->GetHeap()->IsLowMemoryMode(), false);
 }
 
 TEST_SPACE_CREATE_FN_STATIC(RosAllocSpace, CreateRosAllocSpace)
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 37d7c80..c9022f1 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -160,6 +160,9 @@
   }
   virtual ContinuousMemMapAllocSpace* AsContinuousMemMapAllocSpace();
 
+  // Returns true if objects in the space are movable.
+  virtual bool CanMoveObjects() const = 0;
+
   virtual ~Space() {}
 
  protected:
@@ -396,12 +399,9 @@
   // Swap the live and mark bitmaps of this space. This is used by the GC for concurrent sweeping.
   void SwapBitmaps();
 
-  // Free all memory associated with this space.
+  // Reset the space back to an empty space and release memory.
   virtual void Clear() = 0;
 
-  // Reset the space back to an empty space.
-  virtual void Reset() = 0;
-
   accounting::SpaceBitmap* GetLiveBitmap() const {
     return live_bitmap_.get();
   }
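
CanMoveObjects() gives each space a fixed answer to whether a compacting collector may relocate the objects it contains. A reduced sketch of the shape of that query (the hierarchy here is cut down to the one method; a real collector consults far more state):

#include <cassert>

// Reduced sketch of the virtual query added above.
struct SpaceSketch {
  virtual ~SpaceSketch() {}
  virtual bool CanMoveObjects() const = 0;
};

// Spaces that hand out pinned objects, like the zygote space below,
// simply answer false, and a moving collector leaves them alone.
struct PinnedSpaceSketch : SpaceSketch {
  bool CanMoveObjects() const override { return false; }
};

int main() {
  PinnedSpaceSketch space;
  assert(!space.CanMoveObjects());
  return 0;
}
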
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/valgrind_malloc_space-inl.h
index ed97e60..966c276 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/valgrind_malloc_space-inl.h
@@ -95,8 +95,10 @@
 ValgrindMallocSpace<S, A>::ValgrindMallocSpace(const std::string& name, MemMap* mem_map,
                                                A allocator, byte* begin,
                                                byte* end, byte* limit, size_t growth_limit,
-                                               size_t initial_size) :
-    S(name, mem_map, allocator, begin, end, limit, growth_limit) {
+                                               size_t initial_size,
+                                               bool can_move_objects, size_t starting_size) :
+    S(name, mem_map, allocator, begin, end, limit, growth_limit, can_move_objects, starting_size,
+      initial_size) {
   VALGRIND_MAKE_MEM_UNDEFINED(mem_map->Begin() + initial_size, mem_map->Size() - initial_size);
 }
 
diff --git a/runtime/gc/space/valgrind_malloc_space.h b/runtime/gc/space/valgrind_malloc_space.h
index 6b755c4..200ad83 100644
--- a/runtime/gc/space/valgrind_malloc_space.h
+++ b/runtime/gc/space/valgrind_malloc_space.h
@@ -48,7 +48,7 @@
 
   ValgrindMallocSpace(const std::string& name, MemMap* mem_map, AllocatorType allocator,
                       byte* begin, byte* end, byte* limit, size_t growth_limit,
-                      size_t initial_size);
+                      size_t initial_size, bool can_move_objects, size_t starting_size);
   virtual ~ValgrindMallocSpace() {}
 
  private:
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index d1c3d03..a60ab38 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -61,10 +61,6 @@
   LOG(FATAL) << "Unimplemented";
 }
 
-void ZygoteSpace::Reset() {
-  LOG(FATAL) << "Unimplemented";
-}
-
 ZygoteSpace::ZygoteSpace(const std::string& name, MemMap* mem_map, size_t objects_allocated)
     : ContinuousMemMapAllocSpace(name, mem_map, mem_map->Begin(), mem_map->End(), mem_map->End(),
                                  kGcRetentionPolicyFullCollect),
diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h
index 8880548..30370aa 100644
--- a/runtime/gc/space/zygote_space.h
+++ b/runtime/gc/space/zygote_space.h
@@ -72,7 +72,10 @@
   }
 
   void Clear() OVERRIDE;
-  void Reset() OVERRIDE;
+
+  bool CanMoveObjects() const OVERRIDE {
+    return false;
+  }
 
  protected:
   virtual accounting::SpaceBitmap::SweepCallback* GetSweepCallback() {
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index f4eecfc..a08becf 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -81,6 +81,10 @@
     return mask_ != peer.mask_;
   }
 
+  bool operator<=(const InstructionSetFeatures &peer) const {
+    return (mask_ & peer.mask_) == mask_;
+  }
+
  private:
   uint32_t mask_;
 };
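
The new operator<= implements subset semantics over the feature bitmask: a <= b holds exactly when every feature bit set in a is also set in b, so a binary built for fewer features is accepted on hardware that supports more. A standalone check of that identity (the feature bit names are made up for illustration):

#include <cassert>
#include <cstdint>

static bool IsSubsetOf(uint32_t mask, uint32_t peer_mask) {
  // Same test as operator<= above: masking by the peer must not drop
  // any of our own bits.
  return (mask & peer_mask) == mask;
}

int main() {
  const uint32_t kHasDiv = 1u << 0;   // hypothetical feature bits
  const uint32_t kHasLpae = 1u << 1;
  assert(IsSubsetOf(kHasDiv, kHasDiv | kHasLpae));   // fewer features: ok
  assert(!IsSubsetOf(kHasDiv | kHasLpae, kHasDiv));  // peer lacks a bit
  assert(IsSubsetOf(0u, kHasDiv));                   // empty set is a subset
  return 0;
}
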
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index d9155f5..be7e9f2 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -39,6 +39,32 @@
 namespace art {
 namespace mirror {
 
+class CopyReferenceFieldsWithReadBarrierVisitor {
+ public:
+  explicit CopyReferenceFieldsWithReadBarrierVisitor(Object* dest_obj)
+      : dest_obj_(dest_obj) {}
+
+  void operator()(Object* obj, MemberOffset offset, bool /* is_static */) const
+      ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // GetFieldObject() contains a RB.
+    Object* ref = obj->GetFieldObject<Object>(offset, false);
+    // No WB here as a large object space does not have card table
+    // coverage. Instead, cards will be marked separately.
+    dest_obj_->SetFieldObjectWithoutWriteBarrier<false, false>(offset, ref, false);
+  }
+
+  void operator()(mirror::Class* klass, mirror::Reference* ref) const
+      ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    // Copy java.lang.ref.Reference.referent which isn't visited in
+    // Object::VisitReferences().
+    DCHECK(klass->IsReferenceClass());
+    this->operator()(ref, mirror::Reference::ReferentOffset(), false);
+  }
+
+ private:
+  Object* const dest_obj_;
+};
+
 static Object* CopyObject(Thread* self, mirror::Object* dest, mirror::Object* src, size_t num_bytes)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Copy instance data.  We assume memcpy copies by words.
@@ -47,6 +73,13 @@
   byte* dst_bytes = reinterpret_cast<byte*>(dest);
   size_t offset = sizeof(Object);
   memcpy(dst_bytes + offset, src_bytes + offset, num_bytes - offset);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // We need a RB here. After the memcpy that covers the whole
+    // object above, copy references fields one by one again with a
+    // RB. TODO: Optimize this later?
+    CopyReferenceFieldsWithReadBarrierVisitor visitor(dest);
+    src->VisitReferences<true>(visitor, visitor);
+  }
   gc::Heap* heap = Runtime::Current()->GetHeap();
   // Perform write barriers on copied object references.
   Class* c = src->GetClass();
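
The visitor above exists because a plain memcpy can smuggle stale from-space pointers into the new object under a Baker/Brooks collector: every reference read must go through a read barrier. A reduced sketch of the re-copy pass (ReadBarrier() here is a stand-in for the real barrier, not an ART function):

#include <cstddef>

struct ObjSketch {
  ObjSketch* fields[4];  // pretend these are the reference fields
};

static ObjSketch* ReadBarrier(ObjSketch* ref) {
  // In a Baker/Brooks scheme this would return the forwarded (to-space)
  // copy of |ref|; identity stands in for that here.
  return ref;
}

static void CopyRefsWithReadBarrier(ObjSketch* dest, const ObjSketch* src) {
  // After the raw byte copy of the whole object, revisit each reference
  // field and store the barrier-corrected value, with no write barrier,
  // just as the visitor above does field by field.
  for (size_t i = 0; i < 4; ++i) {
    dest->fields[i] = ReadBarrier(src->fields[i]);
  }
}
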
diff --git a/runtime/mirror/object_array-inl.h b/runtime/mirror/object_array-inl.h
index 8032cc3..e0c14c3 100644
--- a/runtime/mirror/object_array-inl.h
+++ b/runtime/mirror/object_array-inl.h
@@ -128,7 +128,27 @@
   CHECK_EQ(sizeof(HeapReference<T>), sizeof(uint32_t));
   IntArray* dstAsIntArray = reinterpret_cast<IntArray*>(this);
   IntArray* srcAsIntArray = reinterpret_cast<IntArray*>(src);
-  dstAsIntArray->Memmove(dst_pos, srcAsIntArray, src_pos, count);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // TODO: Optimize this later?
+    const bool copy_forward = (src != this) || (dst_pos < src_pos) || (dst_pos - src_pos >= count);
+    if (copy_forward) {
+      // Forward copy.
+      for (int i = 0; i < count; ++i) {
+        // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+        Object* obj = src->GetWithoutChecks(src_pos + i);
+        SetWithoutChecks<false>(dst_pos + i, obj);
+      }
+    } else {
+      // Backward copy.
+      for (int i = count - 1; i >= 0; --i) {
+        // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+        Object* obj = src->GetWithoutChecks(src_pos + i);
+        SetWithoutChecks<false>(dst_pos + i, obj);
+      }
+    }
+  } else {
+    dstAsIntArray->Memmove(dst_pos, srcAsIntArray, src_pos, count);
+  }
   Runtime::Current()->GetHeap()->WriteBarrierArray(this, dst_pos, count);
   if (kIsDebugBuild) {
     for (int i = 0; i < count; ++i) {
@@ -151,7 +171,16 @@
   CHECK_EQ(sizeof(HeapReference<T>), sizeof(uint32_t));
   IntArray* dstAsIntArray = reinterpret_cast<IntArray*>(this);
   IntArray* srcAsIntArray = reinterpret_cast<IntArray*>(src);
-  dstAsIntArray->Memcpy(dst_pos, srcAsIntArray, src_pos, count);
+  if (kUseBakerOrBrooksReadBarrier) {
+    // TODO: Optimize this later?
+    for (int i = 0; i < count; ++i) {
+      // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
+      T* obj = src->GetWithoutChecks(src_pos + i);
+      SetWithoutChecks<false>(dst_pos + i, obj);
+    }
+  } else {
+    dstAsIntArray->Memcpy(dst_pos, srcAsIntArray, src_pos, count);
+  }
   Runtime::Current()->GetHeap()->WriteBarrierArray(this, dst_pos, count);
   if (kIsDebugBuild) {
     for (int i = 0; i < count; ++i) {
@@ -176,6 +205,7 @@
   int i = 0;
   for (; i < count; ++i) {
     // The following get operations force the objects to be verified.
+    // We need a RB here. ObjectArray::GetWithoutChecks() contains a RB.
     o = src->GetWithoutChecks(src_pos + i);
     if (o == nullptr) {
       // Null is always assignable.
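
The copy_forward test above is the memmove overlap rule spelled out: element-by-element forward copying is only unsafe when source and destination are the same array and the destination range starts inside the source range. Extracted and exercised standalone:

#include <cassert>

static bool CopyForwardIsSafe(bool same_array, int dst_pos, int src_pos, int count) {
  // Mirrors the copy_forward expression above: distinct arrays, or a
  // destination below the source, or ranges that do not overlap.
  return !same_array || dst_pos < src_pos || dst_pos - src_pos >= count;
}

int main() {
  assert(CopyForwardIsSafe(false, 10, 0, 5));  // distinct arrays: always safe
  assert(CopyForwardIsSafe(true, 0, 4, 8));    // dst starts below src: safe
  assert(!CopyForwardIsSafe(true, 4, 0, 8));   // dst inside src range: copy backward
  assert(CopyForwardIsSafe(true, 8, 0, 8));    // ranges merely adjacent: safe
  return 0;
}
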
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 77e7316..7b117f4 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -588,7 +588,7 @@
     return false;
   }
   if (st.st_size == 0) {
-    return true;  // empty profiles are ok.
+    return false;  // Empty profiles are invalid.
   }
   std::ifstream in(fileName.c_str());
   if (!in) {
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a19fa53..5c31d35 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -188,7 +188,7 @@
 }
 
 struct AbortState {
-  void Dump(std::ostream& os) {
+  void Dump(std::ostream& os) NO_THREAD_SAFETY_ANALYSIS {
     if (gAborting > 1) {
       os << "Runtime aborting --- recursively, so no thread-specific detail!\n";
       return;
@@ -200,26 +200,33 @@
       return;
     }
     Thread* self = Thread::Current();
-    if (self == NULL) {
+    if (self == nullptr) {
       os << "(Aborting thread was not attached to runtime!)\n";
     } else {
-      // TODO: we're aborting and the ScopedObjectAccess may attempt to acquire the mutator_lock_
-      //       which may block indefinitely if there's a misbehaving thread holding it exclusively.
-      //       The code below should be made robust to this.
-      ScopedObjectAccess soa(self);
       os << "Aborting thread:\n";
-      self->Dump(os);
-      if (self->IsExceptionPending()) {
-        ThrowLocation throw_location;
-        mirror::Throwable* exception = self->GetException(&throw_location);
-        os << "Pending exception " << PrettyTypeOf(exception)
-            << " thrown by '" << throw_location.Dump() << "'\n"
-            << exception->Dump();
+      if (Locks::mutator_lock_->IsExclusiveHeld(self) || Locks::mutator_lock_->IsSharedHeld(self)) {
+        DumpThread(os, self);
+      } else {
+        if (Locks::mutator_lock_->SharedTryLock(self)) {
+          DumpThread(os, self);
+          Locks::mutator_lock_->SharedUnlock(self);
+        }
       }
     }
     DumpAllThreads(os, self);
   }
 
+  void DumpThread(std::ostream& os, Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    self->Dump(os);
+    if (self->IsExceptionPending()) {
+      ThrowLocation throw_location;
+      mirror::Throwable* exception = self->GetException(&throw_location);
+      os << "Pending exception " << PrettyTypeOf(exception)
+          << " thrown by '" << throw_location.Dump() << "'\n"
+          << exception->Dump();
+    }
+  }
+
   void DumpAllThreads(std::ostream& os, Thread* self) NO_THREAD_SAFETY_ANALYSIS {
     bool tll_already_held = Locks::thread_list_lock_->IsExclusiveHeld(self);
     bool ml_already_held = Locks::mutator_lock_->IsSharedHeld(self);
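
The restructured Dump() above never blocks on the mutator lock while aborting: it dumps thread detail if this thread already holds the lock, and otherwise only if a shared try-lock succeeds immediately. The same pattern in std::shared_mutex terms, as an illustrative sketch rather than the ART locking API:

#include <iostream>
#include <shared_mutex>

std::shared_mutex mutator_lock_sketch;  // stand-in for Locks::mutator_lock_

void DumpOnAbort() {
  // Analogous to SharedTryLock(): if another (possibly wedged) thread
  // holds the lock exclusively, skip the detail instead of deadlocking.
  if (mutator_lock_sketch.try_lock_shared()) {
    std::cout << "thread detail\n";
    mutator_lock_sketch.unlock_shared();
  } else {
    std::cout << "(lock unavailable; skipping thread detail)\n";
  }
}
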
diff --git a/runtime/stack_indirect_reference_table.h b/runtime/stack_indirect_reference_table.h
index 6049e06..b113129 100644
--- a/runtime/stack_indirect_reference_table.h
+++ b/runtime/stack_indirect_reference_table.h
@@ -44,6 +44,10 @@
     return number_of_references_;
   }
 
+  // We have versions of the following functions with and without an explicit pointer size. The
+  // first two are used at runtime, where OFFSETOF_MEMBER computes the right offsets. The last
+  // one takes the pointer size explicitly so that we can cross-compile correctly at compile time.
+
   // Returns the size of a StackIndirectReferenceTable containing num_references entries.
   static size_t SizeOf(uint32_t num_references) {
     size_t header_size = OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
@@ -60,7 +64,7 @@
   // Get the size of the SIRT for the number of entries, with padding added for potential alignment.
   static size_t GetAlignedSirtSizeTarget(size_t pointer_size, uint32_t num_references) {
     // Assume that the layout is packed.
-    size_t header_size = pointer_size + sizeof(uint32_t);
+    size_t header_size = pointer_size + sizeof(number_of_references_);
     // This assumes there is no layout change between 32-bit and 64-bit.
     size_t data_size = sizeof(StackReference<mirror::Object>) * num_references;
     size_t sirt_size = header_size + data_size;
@@ -109,18 +113,18 @@
   }
 
   // Offset of link within SIRT, used by generated code
-  static size_t LinkOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, link_);
+  static size_t LinkOffset(size_t pointer_size) {
+    return 0;
   }
 
   // Offset of length within SIRT, used by generated code
-  static uint32_t NumberOfReferencesOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, number_of_references_);
+  static size_t NumberOfReferencesOffset(size_t pointer_size) {
+    return pointer_size;
   }
 
   // Offset of the references array within SIRT, used by generated code
-  static size_t ReferencesOffset() {
-    return OFFSETOF_MEMBER(StackIndirectReferenceTable, references_);
+  static size_t ReferencesOffset(size_t pointer_size) {
+    return pointer_size + sizeof(number_of_references_);
   }
 
  private:
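
The explicit-pointer-size accessors above make the packed layout arithmetic reproducible here: link pointer first, then the uint32_t count, then the reference slots. A standalone check of the offsets for both pointer sizes (the 8-byte final rounding is an assumption of this sketch):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Packed layout assumed above: link pointer, then uint32_t count, then the
// reference slots (each the size of a uint32_t, per the test below).
static size_t NumberOfReferencesOffset(size_t pointer_size) { return pointer_size; }
static size_t ReferencesOffset(size_t pointer_size) {
  return pointer_size + sizeof(uint32_t);
}
static size_t AlignedSirtSize(size_t pointer_size, uint32_t num_references) {
  size_t sirt_size = ReferencesOffset(pointer_size) +
                     sizeof(uint32_t) * num_references;
  return (sirt_size + 7) & ~static_cast<size_t>(7);  // assumed 8-byte rounding
}

int main() {
  assert(NumberOfReferencesOffset(4) == 4 && ReferencesOffset(4) == 8);
  assert(NumberOfReferencesOffset(8) == 8 && ReferencesOffset(8) == 12);
  assert(AlignedSirtSize(8, 3) == 24);  // 12-byte header + 12 bytes of refs
  return 0;
}
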
diff --git a/runtime/stack_indirect_reference_table_test.cc b/runtime/stack_indirect_reference_table_test.cc
new file mode 100644
index 0000000..72ef6b6
--- /dev/null
+++ b/runtime/stack_indirect_reference_table_test.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stack_indirect_reference_table.h"
+#include "gtest/gtest.h"
+
+namespace art {
+
+// Test the offsets computed for members of StackIndirectReferenceTable. Because of cross-compiling
+// it is impossible to use OFFSETOF_MEMBER, so we do some reasonable computations ourselves. This
+// test checks whether we do the right thing.
+TEST(StackIndirectReferenceTableTest, Offsets) {
+  // As the members of StackIndirectReferenceTable are private, we cannot use OFFSETOF_MEMBER
+  // here. So do the inverse: set some data, and access it through pointers created from the offsets.
+
+  StackIndirectReferenceTable test_table(reinterpret_cast<mirror::Object*>(0x1234));
+  test_table.SetLink(reinterpret_cast<StackIndirectReferenceTable*>(0x5678));
+  test_table.SetNumberOfReferences(0x9ABC);
+
+  byte* table_base_ptr = reinterpret_cast<byte*>(&test_table);
+
+  {
+    uintptr_t* link_ptr = reinterpret_cast<uintptr_t*>(table_base_ptr +
+        StackIndirectReferenceTable::LinkOffset(kPointerSize));
+    EXPECT_EQ(*link_ptr, static_cast<size_t>(0x5678));
+  }
+
+  {
+    uint32_t* num_ptr = reinterpret_cast<uint32_t*>(table_base_ptr +
+        StackIndirectReferenceTable::NumberOfReferencesOffset(kPointerSize));
+    EXPECT_EQ(*num_ptr, static_cast<size_t>(0x9ABC));
+  }
+
+  {
+    // Assume sizeof(StackReference<mirror::Object>) == sizeof(uint32_t)
+    // TODO: How can we make this assumption-less but still access directly and fully?
+    EXPECT_EQ(sizeof(StackReference<mirror::Object>), sizeof(uint32_t));
+
+    uint32_t* ref_ptr = reinterpret_cast<uint32_t*>(table_base_ptr +
+        StackIndirectReferenceTable::ReferencesOffset(kPointerSize));
+    EXPECT_EQ(*ref_ptr, static_cast<uint32_t>(0x1234));
+  }
+}
+
+}  // namespace art
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 5a2410a..131e2b6 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -711,7 +711,9 @@
   bool is_daemon = false;
   Thread* self = Thread::Current();
 
-  if (self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
+  // Don't do this if we are aborting since the GC may have all the threads suspended. This will
+  // cause ScopedObjectAccessUnchecked to deadlock.
+  if (gAborting == 0 && self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
     ScopedObjectAccessUnchecked soa(self);
     priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)
         ->GetInt(thread->tlsPtr_.opeer);