Merge "Fix occasional long ThreadSuspendSleep"
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 43ebf55..c4adb09 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -2469,11 +2469,17 @@
       return m2l_->TargetReg(fpArgMappingToPhysicalReg[cur_fp_reg_++],
                              arg.IsWide() ? kWide : kNotWide);
     }
-  } else {
-    if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
-                               arg.IsRef() ? kRef : kNotWide);
-      if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+  } else if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+    result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
+                             arg.IsRef() ? kRef : kNotWide);
+    if (arg.IsWide()) {
+      // This must be a long, as double is handled above.
+      // Ensure that we don't split a long across the last register and the stack.
+      if (cur_core_reg_ == coreArgMappingToPhysicalRegSize) {
+        // Leave the last core register unused and force the whole long to the stack.
+        cur_core_reg_++;
+        result = RegStorage::InvalidReg();
+      } else if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
         result = RegStorage::MakeRegPair(
             result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
       }
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 6db0c3b..0c64a36 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -141,7 +141,7 @@
     if (IsStatic()) {
       param++;  // 0th argument must skip return value at start of the shorty
     } else if (param == 0) {
-      return true;  // this argument
+      return false;  // this argument
     }
     return shorty_[param] == 'J';
   }
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index fc72e88..8a45f0c 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -85,9 +85,19 @@
   ManagedRegister res = ManagedRegister::NoRegister();
   if (!IsCurrentParamAFloatOrDouble()) {
     switch (gpr_arg_count_) {
-      case 0: res = X86ManagedRegister::FromCpuRegister(ECX); break;
-      case 1: res = X86ManagedRegister::FromCpuRegister(EDX); break;
-      case 2: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+      case 0:
+        res = X86ManagedRegister::FromCpuRegister(ECX);
+        break;
+      case 1:
+        res = X86ManagedRegister::FromCpuRegister(EDX);
+        break;
+      case 2:
+        // Don't split a long between the last register and the stack.
+        if (IsCurrentParamALong()) {
+          return ManagedRegister::NoRegister();
+        }
+        res = X86ManagedRegister::FromCpuRegister(EBX);
+        break;
     }
   } else if (itr_float_and_doubles_ < 4) {
     // First four float parameters are passed via XMM0..XMM3
@@ -120,27 +130,34 @@
     ResetIterator(FrameOffset(0));
     while (HasNext()) {
       ManagedRegister in_reg = CurrentParamRegister();
+      bool is_long = IsCurrentParamALong();
       if (!in_reg.IsNoRegister()) {
         int32_t size = IsParamADouble(itr_args_) ? 8 : 4;
         int32_t spill_offset = CurrentParamStackOffset().Uint32Value();
         ManagedRegisterSpill spill(in_reg, size, spill_offset);
         entry_spills_.push_back(spill);
-        if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
-          // special case, as we may need a second register here.
+        if (is_long) {
+          // Special case: a long needs a second register here.
           in_reg = CurrentParamHighLongRegister();
-          if (!in_reg.IsNoRegister()) {
-            // We have to spill the second half of the long.
-            ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
-            entry_spills_.push_back(spill2);
-            // Long was allocated in 2 registers.
-            gpr_arg_count_++;
-          }
+          DCHECK(!in_reg.IsNoRegister());
+          // We have to spill the second half of the long.
+          ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
+          entry_spills_.push_back(spill2);
         }
 
         // Keep track of the number of GPRs allocated.
         if (!IsCurrentParamAFloatOrDouble()) {
-          gpr_arg_count_++;
+          if (is_long) {
+            // Long was allocated in 2 registers.
+            gpr_arg_count_ += 2;
+          } else {
+            gpr_arg_count_++;
+          }
         }
+      } else if (is_long) {
+        // We need to skip the unused last register, which is empty.
+        // If we are already out of registers, this is harmless.
+        gpr_arg_count_ += 2;
       }
       Next();
     }
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index c509606..20a1b03 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -259,13 +259,14 @@
   return false;
 }
 
-HGraph* HGraphBuilder::BuildGraph(const DexFile::CodeItem& code_item, int start_instruction_id) {
+bool HGraphBuilder::BuildGraph(const DexFile::CodeItem& code_item) {
+  DCHECK(graph_->GetBlocks().IsEmpty());
+
   const uint16_t* code_ptr = code_item.insns_;
   const uint16_t* code_end = code_item.insns_ + code_item.insns_size_in_code_units_;
   code_start_ = code_ptr;
 
   // Setup the graph with the entry block and exit block.
-  graph_ = new (arena_) HGraph(arena_, start_instruction_id);
   entry_block_ = new (arena_) HBasicBlock(graph_, 0);
   graph_->AddBlock(entry_block_);
   exit_block_ = new (arena_) HBasicBlock(graph_, kNoDexPc);
@@ -289,7 +290,7 @@
   // Note that the compiler driver is null when unit testing.
   if ((compiler_driver_ != nullptr)
       && SkipCompilation(number_of_dex_instructions, number_of_blocks, number_of_branches)) {
-    return nullptr;
+    return false;
   }
 
   // Also create blocks for catch handlers.
@@ -319,7 +320,7 @@
     MaybeUpdateCurrentBlock(dex_pc);
     const Instruction& instruction = *Instruction::At(code_ptr);
     if (!AnalyzeDexInstruction(instruction, dex_pc)) {
-      return nullptr;
+      return false;
     }
     dex_pc += instruction.SizeInCodeUnits();
     code_ptr += instruction.SizeInCodeUnits();
@@ -331,7 +332,8 @@
   // Add the suspend check to the entry block.
   entry_block_->AddInstruction(new (arena_) HSuspendCheck(0));
   entry_block_->AddInstruction(new (arena_) HGoto());
-  return graph_;
+
+  return true;
 }
 
 void HGraphBuilder::MaybeUpdateCurrentBlock(size_t index) {
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 8ee27a1..c510136 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -34,19 +34,19 @@
 
 class HGraphBuilder : public ValueObject {
  public:
-  HGraphBuilder(ArenaAllocator* arena,
+  HGraphBuilder(HGraph* graph,
                 DexCompilationUnit* dex_compilation_unit,
                 const DexCompilationUnit* const outer_compilation_unit,
                 const DexFile* dex_file,
                 CompilerDriver* driver,
                 OptimizingCompilerStats* compiler_stats)
-      : arena_(arena),
-        branch_targets_(arena, 0),
-        locals_(arena, 0),
+      : arena_(graph->GetArena()),
+        branch_targets_(graph->GetArena(), 0),
+        locals_(graph->GetArena(), 0),
         entry_block_(nullptr),
         exit_block_(nullptr),
         current_block_(nullptr),
-        graph_(nullptr),
+        graph_(graph),
         constant0_(nullptr),
         constant1_(nullptr),
         dex_file_(dex_file),
@@ -59,14 +59,14 @@
         compilation_stats_(compiler_stats) {}
 
   // Only for unit testing.
-  HGraphBuilder(ArenaAllocator* arena, Primitive::Type return_type = Primitive::kPrimInt)
-      : arena_(arena),
-        branch_targets_(arena, 0),
-        locals_(arena, 0),
+  HGraphBuilder(HGraph* graph, Primitive::Type return_type = Primitive::kPrimInt)
+      : arena_(graph->GetArena()),
+        branch_targets_(graph->GetArena(), 0),
+        locals_(graph->GetArena(), 0),
         entry_block_(nullptr),
         exit_block_(nullptr),
         current_block_(nullptr),
-        graph_(nullptr),
+        graph_(graph),
         constant0_(nullptr),
         constant1_(nullptr),
         dex_file_(nullptr),
@@ -78,7 +78,7 @@
         latest_result_(nullptr),
         compilation_stats_(nullptr) {}
 
-  HGraph* BuildGraph(const DexFile::CodeItem& code, int start_instruction_id = 0);
+  bool BuildGraph(const DexFile::CodeItem& code);
 
  private:
   // Analyzes the dex instruction and adds HInstruction to the graph
@@ -249,7 +249,7 @@
   HBasicBlock* entry_block_;
   HBasicBlock* exit_block_;
   HBasicBlock* current_block_;
-  HGraph* graph_;
+  HGraph* const graph_;
 
   HIntConstant* constant0_;
   HIntConstant* constant1_;
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index dc2446d..fd4e391 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -290,7 +290,7 @@
         result_location = locations->InAt(0);
         break;
     }
-    locations->SetOut(result_location);
+    locations->UpdateOut(result_location);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index b0cd7ba..78fd181 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1296,13 +1296,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(neg, LocationSummary::kNoCall);
   switch (neg->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      Location::OutputOverlap output_overlaps = (neg->GetResultType() == Primitive::kPrimLong)
-          ? Location::kOutputOverlap
-          : Location::kNoOutputOverlap;
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
 
@@ -1837,7 +1838,7 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
 
@@ -1914,7 +1915,7 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -2297,7 +2298,7 @@
     case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(op->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
     case Primitive::kPrimLong: {
@@ -2492,7 +2493,8 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      // Output overlaps because it is written before doing the low comparison.
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -2765,12 +2767,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 
-  bool generate_volatile = field_info.IsVolatile()
+  bool volatile_for_double = field_info.IsVolatile()
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
       && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
-  if (generate_volatile) {
+  bool overlap = field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong);
+  locations->SetOut(Location::RequiresRegister(),
+                    (overlap ? Location::kOutputOverlap : Location::kNoOutputOverlap));
+  if (volatile_for_double) {
     // Arm encoding have some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
     // - the first register should be even but not R14.
@@ -3614,7 +3618,8 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister());
+  // The out register is used as a temporary, so it overlaps with the inputs.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -3710,10 +3715,7 @@
          || instruction->GetResultType() == Primitive::kPrimLong);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  Location::OutputOverlap output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong)
-      ? Location::kOutputOverlap
-      : Location::kNoOutputOverlap;
-  locations->SetOut(Location::RequiresRegister(), output_overlaps);
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM::VisitAnd(HAnd* instruction) {
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c840793..98f93a4 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -535,9 +535,6 @@
         X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
-      } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        // stack_index_ is the right offset for the memory.
-        return Location::QuickParameter(index, stack_index_ - 2);
       } else {
         return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
       }
@@ -629,16 +626,6 @@
           Location::RegisterLocation(destination.AsRegisterPairLow<Register>()));
     } else if (source.IsFpuRegister()) {
       LOG(FATAL) << "Unimplemented";
-    } else if (source.IsQuickParameter()) {
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      InvokeDexCallingConvention calling_convention;
-      EmitParallelMoves(
-          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
-          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()),
-          Location::StackSlot(
-              calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()),
-          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()));
     } else {
       // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
@@ -646,23 +633,6 @@
       __ movl(destination.AsRegisterPairHigh<Register>(),
               Address(ESP, source.GetHighStackIndex(kX86WordSize)));
     }
-  } else if (destination.IsQuickParameter()) {
-    InvokeDexCallingConvention calling_convention;
-    uint16_t register_index = destination.GetQuickParameterRegisterIndex();
-    uint16_t stack_index = destination.GetQuickParameterStackIndex();
-    if (source.IsRegisterPair()) {
-      LOG(FATAL) << "Unimplemented";
-    } else if (source.IsFpuRegister()) {
-      LOG(FATAL) << "Unimplemented";
-    } else {
-      DCHECK(source.IsDoubleStackSlot());
-      EmitParallelMoves(
-          Location::StackSlot(source.GetStackIndex()),
-          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index)),
-          Location::StackSlot(source.GetHighStackIndex(kX86WordSize)),
-          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
-      __ movl(calling_convention.GetRegisterAt(register_index), Address(ESP, source.GetStackIndex()));
-    }
   } else if (destination.IsFpuRegister()) {
     if (source.IsFpuRegister()) {
       __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
@@ -678,18 +648,6 @@
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
       __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
               source.AsRegisterPairHigh<Register>());
-    } else if (source.IsQuickParameter()) {
-      // No conflict possible, so just do the move.
-      InvokeDexCallingConvention calling_convention;
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      // Just move the low part. The only time a source is a quick parameter is
-      // when moving the parameter to its stack locations. And the (Java) caller
-      // of this method has already done that.
-      __ movl(Address(ESP, destination.GetStackIndex()),
-              calling_convention.GetRegisterAt(register_index));
-      DCHECK_EQ(calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize(),
-                static_cast<size_t>(destination.GetHighStackIndex(kX86WordSize)));
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else {
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index dfa4748..e0e0b4c 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -180,10 +180,11 @@
 static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) {
   ArenaPool pool;
   ArenaAllocator arena(&pool);
-  HGraphBuilder builder(&arena);
+  HGraph* graph = new (&arena) HGraph(&arena);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
   // Remove suspend checks, they cannot be executed in this context.
   RemoveSuspendChecks(graph);
   RunCodeBaseline(graph, has_result, expected);
@@ -192,10 +193,11 @@
 static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) {
   ArenaPool pool;
   ArenaAllocator arena(&pool);
-  HGraphBuilder builder(&arena, Primitive::kPrimLong);
+  HGraph* graph = new (&arena) HGraph(&arena);
+  HGraphBuilder builder(graph, Primitive::kPrimLong);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
   // Remove suspend checks, they cannot be executed in this context.
   RemoveSuspendChecks(graph);
   RunCodeBaseline(graph, has_result, expected);
diff --git a/compiler/optimizing/dominator_test.cc b/compiler/optimizing/dominator_test.cc
index 3062e37..b246c6f 100644
--- a/compiler/optimizing/dominator_test.cc
+++ b/compiler/optimizing/dominator_test.cc
@@ -27,10 +27,11 @@
 static void TestCode(const uint16_t* data, const int* blocks, size_t blocks_length) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
   graph->BuildDominatorTree();
   ASSERT_EQ(graph->GetBlocks().Size(), blocks_length);
   for (size_t i = 0, e = blocks_length; i < e; ++i) {
diff --git a/compiler/optimizing/find_loops_test.cc b/compiler/optimizing/find_loops_test.cc
index 82fe03c..e05d9b3 100644
--- a/compiler/optimizing/find_loops_test.cc
+++ b/compiler/optimizing/find_loops_test.cc
@@ -28,9 +28,10 @@
 namespace art {
 
 static HGraph* TestCode(const uint16_t* data, ArenaAllocator* allocator) {
-  HGraphBuilder builder(allocator);
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
+  builder.BuildGraph(*item);
   graph->BuildDominatorTree();
   graph->AnalyzeNaturalLoops();
   return graph;
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 22a3d12..835bca6 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -194,7 +194,22 @@
       }
       output_ << "]";
     }
-    if (pass_name_ == kLivenessPassName && instruction->GetLifetimePosition() != kNoLifetime) {
+    if (instruction->HasEnvironment()) {
+      HEnvironment* env = instruction->GetEnvironment();
+      output_ << " (env: [ ";
+      for (size_t i = 0, e = env->Size(); i < e; ++i) {
+        HInstruction* insn = env->GetInstructionAt(i);
+        if (insn != nullptr) {
+          output_ << GetTypeId(insn->GetType()) << insn->GetId() << " ";
+        } else {
+          output_ << " _ ";
+        }
+      }
+      output_ << "])";
+    }
+    if (pass_name_ == kLivenessPassName
+        && is_after_pass_
+        && instruction->GetLifetimePosition() != kNoLifetime) {
       output_ << " (liveness: " << instruction->GetLifetimePosition();
       if (instruction->HasLiveInterval()) {
         output_ << " ";
@@ -202,7 +217,7 @@
         interval.Dump(output_);
       }
       output_ << ")";
-    } else if (pass_name_ == kRegisterAllocatorPassName) {
+    } else if (pass_name_ == kRegisterAllocatorPassName && is_after_pass_) {
       LocationSummary* locations = instruction->GetLocations();
       if (locations != nullptr) {
         output_ << " ( ";
@@ -310,18 +325,13 @@
 
 HGraphVisualizer::HGraphVisualizer(std::ostream* output,
                                    HGraph* graph,
-                                   const char* string_filter,
                                    const CodeGenerator& codegen,
                                    const char* method_name)
-  : output_(output), graph_(graph), codegen_(codegen), is_enabled_(false) {
+  : output_(output), graph_(graph), codegen_(codegen) {
   if (output == nullptr) {
     return;
   }
-  if (strstr(method_name, string_filter) == nullptr) {
-    return;
-  }
 
-  is_enabled_ = true;
   HGraphVisualizerPrinter printer(graph_, *output_, "", true, codegen_);
   printer.StartTag("compilation");
   printer.PrintProperty("name", method_name);
@@ -331,7 +341,8 @@
 }
 
 void HGraphVisualizer::DumpGraph(const char* pass_name, bool is_after_pass) const {
-  if (is_enabled_) {
+  DCHECK(output_ != nullptr);
+  if (!graph_->GetBlocks().IsEmpty()) {
     HGraphVisualizerPrinter printer(graph_, *output_, pass_name, is_after_pass, codegen_);
     printer.Run();
   }
diff --git a/compiler/optimizing/graph_visualizer.h b/compiler/optimizing/graph_visualizer.h
index 8d6fe04..bc553ae 100644
--- a/compiler/optimizing/graph_visualizer.h
+++ b/compiler/optimizing/graph_visualizer.h
@@ -35,7 +35,6 @@
  public:
   HGraphVisualizer(std::ostream* output,
                    HGraph* graph,
-                   const char* string_filter,
                    const CodeGenerator& codegen,
                    const char* method_name);
 
@@ -46,10 +45,6 @@
   HGraph* const graph_;
   const CodeGenerator& codegen_;
 
-  // Is true when `output_` is not null, and the compiled method's name
-  // contains the string_filter given in the constructor.
-  bool is_enabled_;
-
   DISALLOW_COPY_AND_ASSIGN(HGraphVisualizer);
 };
 
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 41e5164..32f6972 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -124,16 +124,18 @@
     resolved_method->GetAccessFlags(),
     nullptr);
 
+  HGraph* callee_graph =
+      new (graph_->GetArena()) HGraph(graph_->GetArena(), graph_->GetCurrentInstructionId());
+
   OptimizingCompilerStats inline_stats;
-  HGraphBuilder builder(graph_->GetArena(),
+  HGraphBuilder builder(callee_graph,
                         &dex_compilation_unit,
                         &outer_compilation_unit_,
                         &outer_dex_file,
                         compiler_driver_,
                         &inline_stats);
-  HGraph* callee_graph = builder.BuildGraph(*code_item, graph_->GetCurrentInstructionId());
 
-  if (callee_graph == nullptr) {
+  if (!builder.BuildGraph(*code_item)) {
     VLOG(compiler) << "Method " << PrettyMethod(method_index, outer_dex_file)
                    << " could not be built, so cannot be inlined";
     return false;
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index ba26afe..7a3d7d8 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -930,7 +930,10 @@
                                                             kIntrinsified);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  // In case we need to go into the slow path, we can't have the output be the same
+  // as the input: the current liveness analysis considers the input to be live
+  // at the point of the call.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringCharAt(HInvoke* invoke) {
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index 2ab9b57..eb27965 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -38,10 +38,11 @@
 static void TestCode(const uint16_t* data, const int* expected_order, size_t number_of_blocks) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
 
   graph->TryBuildingSsa();
 
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index 92742f9..0558b85 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -31,9 +31,10 @@
 namespace art {
 
 static HGraph* BuildGraph(const uint16_t* data, ArenaAllocator* allocator) {
-  HGraphBuilder builder(allocator);
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
+  builder.BuildGraph(*item);
   // Suspend checks implementation may change in the future, and this test relies
   // on how instructions are ordered.
   RemoveSuspendChecks(graph);
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index f2d49ac..c9be570 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -45,10 +45,11 @@
 static void TestCode(const uint16_t* data, const char* expected) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
   graph->TryBuildingSsa();
   // `Inline` conditions into ifs.
   PrepareForRegisterAllocation(graph).Run();
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 8b06d60..bf27c5c 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -62,17 +62,11 @@
     // We do not use the value 9 because it conflicts with kLocationConstantMask.
     kDoNotUse9 = 9,
 
-    // On 32bits architectures, quick can pass a long where the
-    // low bits are in the last parameter register, and the high
-    // bits are in a stack slot. The kQuickParameter kind is for
-    // handling this special case.
-    kQuickParameter = 10,
-
     // Unallocated location represents a location that is not fixed and can be
     // allocated by a register allocator.  Each unallocated location has
     // a policy that specifies what kind of location is suitable. Payload
     // contains register allocation policy.
-    kUnallocated = 11,
+    kUnallocated = 10,
   };
 
   Location() : value_(kInvalid) {
@@ -82,7 +76,6 @@
     static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError");
-    static_assert((kQuickParameter & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kFpuRegisterPair & kLocationConstantMask) != kConstant, "TagError");
@@ -267,24 +260,6 @@
     return GetPayload() - kStackIndexBias + word_size;
   }
 
-  static Location QuickParameter(uint16_t register_index, uint16_t stack_index) {
-    return Location(kQuickParameter, register_index << 16 | stack_index);
-  }
-
-  uint32_t GetQuickParameterRegisterIndex() const {
-    DCHECK(IsQuickParameter());
-    return GetPayload() >> 16;
-  }
-
-  uint32_t GetQuickParameterStackIndex() const {
-    DCHECK(IsQuickParameter());
-    return GetPayload() & 0xFFFF;
-  }
-
-  bool IsQuickParameter() const {
-    return GetKind() == kQuickParameter;
-  }
-
   Kind GetKind() const {
     return IsConstant() ? kConstant : KindField::Decode(value_);
   }
@@ -299,7 +274,6 @@
       case kRegister: return "R";
       case kStackSlot: return "S";
       case kDoubleStackSlot: return "DS";
-      case kQuickParameter: return "Q";
       case kUnallocated: return "U";
       case kConstant: return "C";
       case kFpuRegister: return "F";
@@ -482,16 +456,17 @@
   }
 
   void SetOut(Location location, Location::OutputOverlap overlaps = Location::kOutputOverlap) {
-    DCHECK(output_.IsUnallocated() || output_.IsInvalid());
+    DCHECK(output_.IsInvalid());
     output_overlaps_ = overlaps;
     output_ = location;
   }
 
   void UpdateOut(Location location) {
-    // The only reason for updating an output is for parameters where
-    // we only know the exact stack slot after doing full register
-    // allocation.
-    DCHECK(output_.IsStackSlot() || output_.IsDoubleStackSlot());
+    // There are two reasons for updating an output:
+    // 1) Parameters, where we only know the exact stack slot after
+    //    doing full register allocation.
+    // 2) Unallocated location.
+    DCHECK(output_.IsStackSlot() || output_.IsDoubleStackSlot() || output_.IsUnallocated());
     output_ = location;
   }
 
@@ -563,28 +538,22 @@
     return live_registers_.GetNumberOfRegisters();
   }
 
-  bool InputOverlapsWithOutputOrTemp(uint32_t input_index, bool is_environment) const {
-    if (is_environment) return true;
-    if ((input_index == 0)
+  bool OutputUsesSameAs(uint32_t input_index) const {
+    return (input_index == 0)
         && output_.IsUnallocated()
-        && (output_.GetPolicy() == Location::kSameAsFirstInput)) {
-      return false;
-    }
+        && (output_.GetPolicy() == Location::kSameAsFirstInput);
+  }
+
+  bool IsFixedInput(uint32_t input_index) const {
     Location input = inputs_.Get(input_index);
-    if (input.IsRegister()
+    return input.IsRegister()
         || input.IsFpuRegister()
         || input.IsPair()
         || input.IsStackSlot()
-        || input.IsDoubleStackSlot()) {
-      // For fixed locations, the register allocator requires to have inputs die before
-      // the instruction, so that input moves use the location of the input just
-      // before that instruction (and not potential moves due to splitting).
-      return false;
-    }
-    return true;
+        || input.IsDoubleStackSlot();
   }
 
-  bool OutputOverlapsWithInputs() const {
+  bool OutputCanOverlapWithInputs() const {
     return output_overlaps_ == Location::kOutputOverlap;
   }
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index f1868cb..cd36598 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -867,8 +867,11 @@
   }
 
   if (GetBlocks().Size() == 3) {
-    // Simple case: Put the first block's instruction into `invoke`'s block.
+    // Simple case of an entry block, a body block, and an exit block.
+    // Put the body block's instruction into `invoke`'s block.
     HBasicBlock* body = GetBlocks().Get(1);
+    DCHECK(GetBlocks().Get(0)->IsEntryBlock());
+    DCHECK(GetBlocks().Get(2)->IsExitBlock());
     DCHECK(!body->IsExitBlock());
     HInstruction* last = body->GetLastInstruction();
 
@@ -886,7 +889,7 @@
   } else {
     // Need to inline multiple blocks. We split `invoke`'s block
     // into two blocks, merge the first block of the inlined graph into
-    // the first half, and replace the exit block if the inlined graph
+    // the first half, and replace the exit block of the inlined graph
     // with the second half.
     ArenaAllocator* allocator = outer_graph->GetArena();
     HBasicBlock* at = invoke->GetBlock();
@@ -908,10 +911,9 @@
       if (!last->IsReturnVoid()) {
         if (return_value != nullptr) {
           if (!return_value->IsPhi()) {
-            HPhi* phi = new (allocator) HPhi(
-                allocator, kNoRegNumber, to->GetPredecessors().Size(), invoke->GetType());
-            return_value->AsPhi()->AddInput(return_value);
+            HPhi* phi = new (allocator) HPhi(allocator, kNoRegNumber, 0, invoke->GetType());
             to->AddPhi(phi);
+            phi->AddInput(return_value);
             return_value = phi;
           }
           return_value->AsPhi()->AddInput(last->InputAt(0));
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index 9315d89..d9e082a 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -21,6 +21,8 @@
 
 namespace art {
 
+static const char* kBuilderPassName = "builder";
+static const char* kSsaBuilderPassName = "ssa_builder";
 static const char* kLivenessPassName = "liveness";
 static const char* kRegisterAllocatorPassName = "register";
 static const char* kLoopInvariantCodeMotionPassName = "licm";
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 50d7924..38f7daa 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -19,6 +19,8 @@
 #include <fstream>
 #include <stdint.h>
 
+#include "base/dumpable.h"
+#include "base/timing_logger.h"
 #include "bounds_check_elimination.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -78,6 +80,70 @@
  */
 static const char* kStringFilter = "";
 
+class PassInfoPrinter : public ValueObject {
+ public:
+  PassInfoPrinter(HGraph* graph,
+                  const char* method_name,
+                  const CodeGenerator& codegen,
+                  std::ostream* visualizer_output,
+                  bool timing_logger_enabled,
+                  bool visualizer_enabled)
+      : method_name_(method_name),
+        timing_logger_enabled_(timing_logger_enabled),
+        timing_logger_running_(false),
+        timing_logger_(method_name, true, true),
+        visualizer_enabled_(visualizer_enabled),
+        visualizer_(visualizer_output, graph, codegen, method_name_) {
+    if (strstr(method_name, kStringFilter) == nullptr) {
+      timing_logger_enabled_ = visualizer_enabled_ = false;
+    }
+  }
+
+  void BeforePass(const char* pass_name) {
+    // Dump graph first, then start timer.
+    if (visualizer_enabled_) {
+      visualizer_.DumpGraph(pass_name, /* is_after_pass */ false);
+    }
+    if (timing_logger_enabled_) {
+      DCHECK(!timing_logger_running_);
+      timing_logger_running_ = true;
+      timing_logger_.StartTiming(pass_name);
+    }
+  }
+
+  void AfterPass(const char* pass_name) {
+    // Pause timer first, then dump graph.
+    if (timing_logger_enabled_) {
+      DCHECK(timing_logger_running_);
+      timing_logger_.EndTiming();
+      timing_logger_running_ = false;
+    }
+    if (visualizer_enabled_) {
+      visualizer_.DumpGraph(pass_name, /* is_after_pass */ true);
+    }
+  }
+
+  ~PassInfoPrinter() {
+    if (timing_logger_enabled_) {
+      DCHECK(!timing_logger_running_);
+      LOG(INFO) << "TIMINGS " << method_name_;
+      LOG(INFO) << Dumpable<TimingLogger>(timing_logger_);
+    }
+  }
+
+ private:
+  const char* method_name_;
+
+  bool timing_logger_enabled_;
+  bool timing_logger_running_;
+  TimingLogger timing_logger_;
+
+  bool visualizer_enabled_;
+  HGraphVisualizer visualizer_;
+
+  DISALLOW_COPY_AND_ASSIGN(PassInfoPrinter);
+};
+
 class OptimizingCompiler FINAL : public Compiler {
  public:
   explicit OptimizingCompiler(CompilerDriver* driver);
@@ -123,7 +189,7 @@
                                    CodeGenerator* codegen,
                                    CompilerDriver* driver,
                                    const DexCompilationUnit& dex_compilation_unit,
-                                   const HGraphVisualizer& visualizer) const;
+                                   PassInfoPrinter* pass_info) const;
 
   // Just compile without doing optimizations.
   CompiledMethod* CompileBaseline(CodeGenerator* codegen,
@@ -200,12 +266,12 @@
 
 static void RunOptimizations(HOptimization* optimizations[],
                              size_t length,
-                             const HGraphVisualizer& visualizer) {
+                             PassInfoPrinter* pass_info) {
   for (size_t i = 0; i < length; ++i) {
     HOptimization* optimization = optimizations[i];
-    visualizer.DumpGraph(optimization->GetPassName(), /*is_after=*/false);
+    pass_info->BeforePass(optimization->GetPassName());
     optimization->Run();
-    visualizer.DumpGraph(optimization->GetPassName(), /*is_after=*/true);
+    pass_info->AfterPass(optimization->GetPassName());
     optimization->Check();
   }
 }
@@ -214,7 +280,7 @@
                              CompilerDriver* driver,
                              OptimizingCompilerStats* stats,
                              const DexCompilationUnit& dex_compilation_unit,
-                             const HGraphVisualizer& visualizer) {
+                             PassInfoPrinter* pass_info) {
   SsaRedundantPhiElimination redundant_phi(graph);
   SsaDeadPhiElimination dead_phi(graph);
   HDeadCodeElimination dce(graph);
@@ -250,7 +316,7 @@
     &simplify2
   };
 
-  RunOptimizations(optimizations, arraysize(optimizations), visualizer);
+  RunOptimizations(optimizations, arraysize(optimizations), pass_info);
 }
 
 // The stack map we generate must be 4-byte aligned on ARM. Since existing
@@ -269,18 +335,20 @@
                                                      CodeGenerator* codegen,
                                                      CompilerDriver* compiler_driver,
                                                      const DexCompilationUnit& dex_compilation_unit,
-                                                     const HGraphVisualizer& visualizer) const {
+                                                     PassInfoPrinter* pass_info) const {
   RunOptimizations(
-      graph, compiler_driver, &compilation_stats_, dex_compilation_unit, visualizer);
+      graph, compiler_driver, &compilation_stats_, dex_compilation_unit, pass_info);
 
+  pass_info->BeforePass(kLivenessPassName);
   PrepareForRegisterAllocation(graph).Run();
   SsaLivenessAnalysis liveness(*graph, codegen);
   liveness.Analyze();
-  visualizer.DumpGraph(kLivenessPassName);
+  pass_info->AfterPass(kLivenessPassName);
 
+  pass_info->BeforePass(kRegisterAllocatorPassName);
   RegisterAllocator register_allocator(graph->GetArena(), codegen, liveness);
   register_allocator.AllocateRegisters();
-  visualizer.DumpGraph(kRegisterAllocatorPassName);
+  pass_info->AfterPass(kRegisterAllocatorPassName);
 
   CodeVectorAllocator allocator;
   codegen->CompileOptimized(&allocator);
@@ -339,6 +407,7 @@
                                             jobject class_loader,
                                             const DexFile& dex_file) const {
   UNUSED(invoke_type);
+  std::string method_name = PrettyMethod(method_idx, dex_file);
   compilation_stats_.RecordStat(MethodCompilationStat::kAttemptCompilation);
   CompilerDriver* compiler_driver = GetCompilerDriver();
   InstructionSet instruction_set = compiler_driver->GetInstructionSet();
@@ -364,29 +433,15 @@
     class_def_idx, method_idx, access_flags,
     compiler_driver->GetVerifiedMethod(&dex_file, method_idx));
 
-  std::string method_name = PrettyMethod(method_idx, dex_file);
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+  HGraph* graph = new (&arena) HGraph(&arena);
 
   // For testing purposes, we put a special marker on method names that should be compiled
   // with this compiler. This makes sure we're not regressing.
   bool shouldCompile = method_name.find("$opt$") != std::string::npos;
   bool shouldOptimize = method_name.find("$opt$reg$") != std::string::npos;
 
-  ArenaPool pool;
-  ArenaAllocator arena(&pool);
-  HGraphBuilder builder(&arena,
-                        &dex_compilation_unit,
-                        &dex_compilation_unit,
-                        &dex_file,
-                        compiler_driver,
-                        &compilation_stats_);
-
-  VLOG(compiler) << "Building " << PrettyMethod(method_idx, dex_file);
-  HGraph* graph = builder.BuildGraph(*code_item);
-  if (graph == nullptr) {
-    CHECK(!shouldCompile) << "Could not build graph in optimizing compiler";
-    return nullptr;
-  }
-
   std::unique_ptr<CodeGenerator> codegen(
       CodeGenerator::Create(graph,
                             instruction_set,
@@ -398,29 +453,53 @@
     return nullptr;
   }
 
-  HGraphVisualizer visualizer(
-      visualizer_output_.get(), graph, kStringFilter, *codegen.get(), method_name.c_str());
-  visualizer.DumpGraph("builder");
+  PassInfoPrinter pass_info(graph,
+                            method_name.c_str(),
+                            *codegen.get(),
+                            visualizer_output_.get(),
+                            GetCompilerDriver()->GetDumpPasses(),
+                            !GetCompilerDriver()->GetDumpCfgFileName().empty());
+
+  HGraphBuilder builder(graph,
+                        &dex_compilation_unit,
+                        &dex_compilation_unit,
+                        &dex_file,
+                        compiler_driver,
+                        &compilation_stats_);
+
+  VLOG(compiler) << "Building " << method_name;
+
+  pass_info.BeforePass(kBuilderPassName);
+  if (!builder.BuildGraph(*code_item)) {
+    CHECK(!shouldCompile) << "Could not build graph in optimizing compiler";
+    return nullptr;
+  }
+  pass_info.AfterPass(kBuilderPassName);
 
   bool can_optimize = CanOptimize(*code_item);
   bool can_allocate_registers = RegisterAllocator::CanAllocateRegistersFor(*graph, instruction_set);
-  CompiledMethod* result = nullptr;
   if (run_optimizations_ && can_optimize && can_allocate_registers) {
-    VLOG(compiler) << "Optimizing " << PrettyMethod(method_idx, dex_file);
+    VLOG(compiler) << "Optimizing " << method_name;
+
+    pass_info.BeforePass(kSsaBuilderPassName);
     if (!graph->TryBuildingSsa()) {
-      LOG(INFO) << "Skipping compilation of "
-                << PrettyMethod(method_idx, dex_file)
-                << ": it contains a non natural loop";
       // We could not transform the graph to SSA, bailout.
+      LOG(INFO) << "Skipping compilation of " << method_name << ": it contains a non natural loop";
       compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledCannotBuildSSA);
-    } else {
-      result = CompileOptimized(graph, codegen.get(), compiler_driver, dex_compilation_unit, visualizer);
+      return nullptr;
     }
+    pass_info.AfterPass(kSsaBuilderPassName);
+
+    return CompileOptimized(graph,
+                            codegen.get(),
+                            compiler_driver,
+                            dex_compilation_unit,
+                            &pass_info);
   } else if (shouldOptimize && RegisterAllocator::Supports(instruction_set)) {
     LOG(FATAL) << "Could not allocate registers in optimizing compiler";
     UNREACHABLE();
   } else {
-    VLOG(compiler) << "Compile baseline " << PrettyMethod(method_idx, dex_file);
+    VLOG(compiler) << "Compile baseline " << method_name;
 
     if (!run_optimizations_) {
       compilation_stats_.RecordStat(MethodCompilationStat::kNotOptimizedDisabled);
@@ -430,9 +509,8 @@
       compilation_stats_.RecordStat(MethodCompilationStat::kNotOptimizedRegisterAllocator);
     }
 
-    result = CompileBaseline(codegen.get(), compiler_driver, dex_compilation_unit);
+    return CompileBaseline(codegen.get(), compiler_driver, dex_compilation_unit);
   }
-  return result;
 }
 
 Compiler* CreateOptimizingCompiler(CompilerDriver* driver) {
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 29d47e1..6b23692 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -76,11 +76,12 @@
 inline HGraph* CreateCFG(ArenaAllocator* allocator,
                          const uint16_t* data,
                          Primitive::Type return_type = Primitive::kPrimInt) {
-  HGraphBuilder builder(allocator, return_type);
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraphBuilder builder(graph, return_type);
   const DexFile::CodeItem* item =
     reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  return graph;
+  bool graph_built = builder.BuildGraph(*item);
+  return graph_built ? graph : nullptr;
 }
 
 // Naive string diff data type.
diff --git a/compiler/optimizing/pretty_printer_test.cc b/compiler/optimizing/pretty_printer_test.cc
index a231a72..9cf8235 100644
--- a/compiler/optimizing/pretty_printer_test.cc
+++ b/compiler/optimizing/pretty_printer_test.cc
@@ -30,10 +30,11 @@
 static void TestCode(const uint16_t* data, const char* expected) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
   StringPrettyPrinter printer(graph);
   printer.VisitInsertionOrder();
   ASSERT_STREQ(expected, printer.str().c_str());
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 0a3f24b..3809720 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -485,6 +485,9 @@
         BitVector* liveness_of_register = liveness_of_values.Get(current->GetRegister());
         for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
           if (liveness_of_register->IsBitSet(j)) {
+            if (current->IsUsingInputRegister() && current->CanUseInputRegister()) {
+              continue;
+            }
             if (log_fatal_on_failure) {
               std::ostringstream message;
               message << "Register conflict at " << j << " ";
@@ -639,6 +642,29 @@
   }
 }
 
+static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) {
+  DCHECK(!interval->IsHighInterval());
+  // Note that the same instruction may occur multiple times in the input list,
+  // so `free_until` may have changed already.
+  if (interval->IsDeadAt(position)) {
+    // Set the register to be free. Note that inactive intervals might later
+    // update this.
+    free_until[interval->GetRegister()] = kMaxLifetimePosition;
+    if (interval->HasHighInterval()) {
+      DCHECK(interval->GetHighInterval()->IsDeadAt(position));
+      free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition;
+    }
+  } else if (!interval->Covers(position)) {
+    // The interval becomes inactive at `position`. We make its register
+    // available only until the next use strictly after `position`.
+    free_until[interval->GetRegister()] = interval->FirstUseAfter(position);
+    if (interval->HasHighInterval()) {
+      DCHECK(!interval->GetHighInterval()->Covers(position));
+      free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()];
+    }
+  }
+}
+
 // Find a free register. If multiple are found, pick the register that
 // is free the longest.
 bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) {
@@ -656,6 +682,32 @@
     free_until[interval->GetRegister()] = 0;
   }
 
+  // An interval that starts an instruction (that is, it is not split), may
+  // re-use the registers used by the inputs of that instruction, based on the
+  // location summary.
+  HInstruction* defined_by = current->GetDefinedBy();
+  if (defined_by != nullptr && !current->IsSplit()) {
+    LocationSummary* locations = defined_by->GetLocations();
+    if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) {
+      for (HInputIterator it(defined_by); !it.Done(); it.Advance()) {
+        // Take the last interval of the input. It is the location of that interval
+        // that will be used at `defined_by`.
+        LiveInterval* interval = it.Current()->GetLiveInterval()->GetLastSibling();
+        // Note that interval may have not been processed yet.
+        // TODO: Handle non-split intervals last in the work list.
+        if (interval->HasRegister() && interval->SameRegisterKind(*current)) {
+          // The input must be live until the end of `defined_by`, to comply with
+          // the linear scan algorithm. So we use `defined_by`'s end lifetime
+          // position to check whether the input is dead or is inactive after
+          // `defined_by`.
+          DCHECK(interval->Covers(defined_by->GetLifetimePosition()));
+          size_t position = defined_by->GetLifetimePosition() + 1;
+          FreeIfNotCoverAt(interval, position, free_until);
+        }
+      }
+    }
+  }
+
   // For each inactive interval, set its register to be free until
   // the next intersection with `current`.
   for (size_t i = 0, e = inactive_.Size(); i < e; ++i) {
@@ -1497,7 +1549,7 @@
           DCHECK(locations->InAt(0).Equals(source));
         }
       }
-      locations->SetOut(source);
+      locations->UpdateOut(source);
     } else {
       DCHECK(source.Equals(location));
     }
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index cb5010a..0cc00c0 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -37,9 +37,10 @@
 static bool Check(const uint16_t* data) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
+  builder.BuildGraph(*item);
   graph->TryBuildingSsa();
   x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
@@ -249,9 +250,10 @@
 }
 
 static HGraph* BuildSSAGraph(const uint16_t* data, ArenaAllocator* allocator) {
-  HGraphBuilder builder(allocator);
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
+  builder.BuildGraph(*item);
   graph->TryBuildingSsa();
   return graph;
 }
@@ -526,7 +528,7 @@
 
     // Set the phi to a specific register, and check that the inputs get allocated
     // the same register.
-    phi->GetLocations()->SetOut(Location::RegisterLocation(2));
+    phi->GetLocations()->UpdateOut(Location::RegisterLocation(2));
     RegisterAllocator register_allocator(&allocator, &codegen, liveness);
     register_allocator.AllocateRegisters();
 
@@ -543,7 +545,7 @@
 
     // Set input1 to a specific register, and check that the phi and other input get allocated
     // the same register.
-    input1->GetLocations()->SetOut(Location::RegisterLocation(2));
+    input1->GetLocations()->UpdateOut(Location::RegisterLocation(2));
     RegisterAllocator register_allocator(&allocator, &codegen, liveness);
     register_allocator.AllocateRegisters();
 
@@ -560,7 +562,7 @@
 
     // Set input2 to a specific register, and check that the phi and other input get allocated
     // the same register.
-    input2->GetLocations()->SetOut(Location::RegisterLocation(2));
+    input2->GetLocations()->UpdateOut(Location::RegisterLocation(2));
     RegisterAllocator register_allocator(&allocator, &codegen, liveness);
     register_allocator.AllocateRegisters();
 
@@ -685,7 +687,7 @@
     liveness.Analyze();
 
     // check that both adds get the same register.
-    // Don't use SetOutput because output is already allocated.
+    // Don't use UpdateOut because output is already allocated.
     first_add->InputAt(0)->GetLocations()->output_ = Location::RegisterLocation(2);
     ASSERT_EQ(first_add->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
     ASSERT_EQ(second_add->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index b0d3853..0e68a61 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_SSA_LIVENESS_ANALYSIS_H_
 
 #include "nodes.h"
+#include <iostream>
 
 namespace art {
 
@@ -181,12 +182,21 @@
 
   void AddUse(HInstruction* instruction, size_t input_index, bool is_environment) {
     // Set the use within the instruction.
-    size_t position = instruction->GetLifetimePosition();
-    if (instruction->GetLocations()->InputOverlapsWithOutputOrTemp(input_index, is_environment)) {
-      // If it overlaps, we need to make sure the user will not try to allocate a temp
-      // or its output to the same register.
-      ++position;
+    size_t position = instruction->GetLifetimePosition() + 1;
+    LocationSummary* locations = instruction->GetLocations();
+    if (!is_environment) {
+      if (locations->IsFixedInput(input_index) || locations->OutputUsesSameAs(input_index)) {
+        // For fixed inputs and output same as input, the register allocator
+        // requires inputs to die at the instruction, so that input moves use the
+        // location of the input just before that instruction (and not potential moves due
+        // to splitting).
+        position = instruction->GetLifetimePosition();
+      }
     }
+
+    DCHECK(position == instruction->GetLifetimePosition()
+           || position == instruction->GetLifetimePosition() + 1);
+
     if ((first_use_ != nullptr)
         && (first_use_->GetUser() == instruction)
         && (first_use_->GetPosition() < position)) {
@@ -301,6 +311,7 @@
   LiveInterval* GetParent() const { return parent_; }
 
   LiveRange* GetFirstRange() const { return first_range_; }
+  LiveRange* GetLastRange() const { return last_range_; }
 
   int GetRegister() const { return register_; }
   void SetRegister(int reg) { register_ = reg; }
@@ -403,6 +414,23 @@
     return FirstRegisterUseAfter(GetStart());
   }
 
+  size_t FirstUseAfter(size_t position) const {
+    if (is_temp_) {
+      return position == GetStart() ? position : kNoLifetime;
+    }
+
+    UsePosition* use = first_use_;
+    size_t end = GetEnd();
+    while (use != nullptr && use->GetPosition() <= end) {
+      size_t use_position = use->GetPosition();
+      if (use_position > position) {
+        return use_position;
+      }
+      use = use->GetNext();
+    }
+    return kNoLifetime;
+  }
+
   UsePosition* GetFirstUse() const {
     return first_use_;
   }
@@ -511,6 +539,13 @@
   }
 
   LiveInterval* GetNextSibling() const { return next_sibling_; }
+  LiveInterval* GetLastSibling() {
+    LiveInterval* result = this;
+    while (result->next_sibling_ != nullptr) {
+      result = result->next_sibling_;
+    }
+    return result;
+  }
 
   // Returns the first register hint that is at least free before
   // the value contained in `free_until`. If none is found, returns
@@ -541,6 +576,9 @@
 
   // Returns whether `other` and `this` share the same kind of register.
   bool SameRegisterKind(Location other) const;
+  bool SameRegisterKind(const LiveInterval& other) const {
+    return IsFloatingPoint() == other.IsFloatingPoint();
+  }
 
   bool HasHighInterval() const {
     return IsLowInterval();
@@ -594,6 +632,60 @@
     }
   }
 
+  // Returns whether an interval, when it is non-split, is using
+  // the same register as one of its inputs.
+  bool IsUsingInputRegister() const {
+    if (defined_by_ != nullptr && !IsSplit()) {
+      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
+        LiveInterval* interval = it.Current()->GetLiveInterval();
+
+        // Find the interval that covers `defined_by_`.
+        while (interval != nullptr && !interval->Covers(defined_by_->GetLifetimePosition())) {
+          interval = interval->GetNextSibling();
+        }
+
+        // Check if both intervals have the same register of the same kind.
+        if (interval != nullptr
+            && interval->SameRegisterKind(*this)
+            && interval->GetRegister() == GetRegister()) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Returns whether an interval, when it is non-split, can safely use
+  // the same register as one of its inputs. Note that this method requires
+  // IsUsingInputRegister() to be true.
+  bool CanUseInputRegister() const {
+    DCHECK(IsUsingInputRegister());
+    if (defined_by_ != nullptr && !IsSplit()) {
+      LocationSummary* locations = defined_by_->GetLocations();
+      if (locations->OutputCanOverlapWithInputs()) {
+        return false;
+      }
+      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
+        LiveInterval* interval = it.Current()->GetLiveInterval();
+
+        // Find the interval that covers `defined_by_`.
+        while (interval != nullptr && !interval->Covers(defined_by_->GetLifetimePosition())) {
+          interval = interval->GetNextSibling();
+        }
+
+        if (interval != nullptr
+            && interval->SameRegisterKind(*this)
+            && interval->GetRegister() == GetRegister()) {
+          // We found the input that has the same register. Check if it is live after
+          // `defined_by_`.
+          return !interval->Covers(defined_by_->GetLifetimePosition() + 1);
+        }
+      }
+    }
+    LOG(FATAL) << "Unreachable";
+    UNREACHABLE();
+  }
+
  private:
   LiveInterval(ArenaAllocator* allocator,
                Primitive::Type type,
diff --git a/compiler/optimizing/ssa_test.cc b/compiler/optimizing/ssa_test.cc
index 6b6bf05..7e90b37 100644
--- a/compiler/optimizing/ssa_test.cc
+++ b/compiler/optimizing/ssa_test.cc
@@ -78,10 +78,11 @@
 static void TestCode(const uint16_t* data, const char* expected) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
 
   graph->BuildDominatorTree();
   // Suspend checks implementation may change in the future, and this test relies
diff --git a/compiler/optimizing/suspend_check_test.cc b/compiler/optimizing/suspend_check_test.cc
index 2e48ee8..a5a0eb2 100644
--- a/compiler/optimizing/suspend_check_test.cc
+++ b/compiler/optimizing/suspend_check_test.cc
@@ -30,10 +30,11 @@
 static void TestCode(const uint16_t* data) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraphBuilder builder(&allocator);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HGraphBuilder builder(graph);
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
-  HGraph* graph = builder.BuildGraph(*item);
-  ASSERT_NE(graph, nullptr);
+  bool graph_built = builder.BuildGraph(*item);
+  ASSERT_TRUE(graph_built);
 
   HBasicBlock* first_block = graph->GetEntryBlock()->GetSuccessors().Get(0);
   HInstruction* first_instruction = first_block->GetFirstInstruction();
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index fd3a1cf..beacd49 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -467,7 +467,8 @@
 
     // Now check ebx
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished
-    // Must be first word of a long, or an integer.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyways, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished
 .LfirstLong:
@@ -569,7 +570,8 @@
 
     // Is there anything for ebx?
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
-    // First word of long or integer.  Load into EBX.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyways, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished2
 .LSecondLong2:
@@ -585,7 +587,8 @@
 
     // Anything for EBX?
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
-    // First word of long or integer.  Load into EBX.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyway, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished2
     // Nothing left to load.
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index a67ebca..98f1684 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -59,6 +59,7 @@
   // | S0         |
   // |            |    4x2 bytes padding
   // | Method*    |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = kArm32QuickCodeUseSoftFloat;
   static constexpr bool kAlignPairRegister = !kArm32QuickCodeUseSoftFloat;
   static constexpr bool kQuickSoftFloatAbi = kArm32QuickCodeUseSoftFloat;
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = !kArm32QuickCodeUseSoftFloat;
@@ -95,6 +96,7 @@
   // | D0         |
   // |            |    padding
   // | Method*    |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -125,6 +127,7 @@
   // | A2         |    arg2
   // | A1         |    arg1
   // | A0/Method* |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = true;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -203,6 +206,7 @@
   // | XMM1        |    float arg 2
   // | XMM0        |    float arg 1
   // | EAX/Method* |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -243,6 +247,7 @@
   // | XMM0            |    float arg 1
   // | Padding         |
   // | RDI/Method*     |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -452,6 +457,11 @@
             }
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
                 ((gpr_index_ + 1) == kNumQuickGprArgs);
+            if (!kSplitPairAcrossRegisterAndStack && is_split_long_or_double_) {
+              // We don't want to split this. Pass over this register.
+              gpr_index_++;
+              is_split_long_or_double_ = false;
+            }
             Visit();
             if (kBytesStackArgLocation == 4) {
               stack_index_+= 2;
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 2ad8e9c..734c935 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -298,7 +298,11 @@
     Thread* self = Thread::Current();
     CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
-    concurrent_copying_->GetBarrier().Pass(self);
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      concurrent_copying_->GetBarrier().Pass(self);
+    }
   }
 
  private:
@@ -431,6 +435,11 @@
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
   gc_barrier_->Init(self, 0);
   size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+  // If there are no threads to wait for, all the checkpoint functions have already finished,
+  // so there is no need to release the mutator lock.
+  if (barrier_count == 0) {
+    return;
+  }
   // Release locks then wait for all mutator threads to pass the barrier.
   Locks::mutator_lock_->SharedUnlock(self);
   {
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index d7a9292..cd63d26 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -989,7 +989,11 @@
       mark_sweep_->GetHeap()->RevokeRosAllocThreadLocalBuffers(thread);
       ATRACE_END();
     }
-    mark_sweep_->GetBarrier().Pass(self);
+    // If thread is a running mutator, then act on behalf of the garbage collector.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      mark_sweep_->GetBarrier().Pass(self);
+    }
   }
 
  private:
@@ -1006,7 +1010,11 @@
   // run through the barrier including self.
   size_t barrier_count = thread_list->RunCheckpoint(&check_point);
   // Release locks then wait for all mutator threads to pass the barrier.
-  // TODO: optimize to not release locks when there are no threads to wait for.
+  // If there are no threads to wait for, all the checkpoint functions have already finished,
+  // so there is no need to release locks.
+  if (barrier_count == 0) {
+    return;
+  }
   Locks::heap_bitmap_lock_->ExclusiveUnlock(self);
   Locks::mutator_lock_->SharedUnlock(self);
   {
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5a60c87..3f3add8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1061,7 +1061,11 @@
     ATRACE_BEGIN("Trimming reference table");
     thread->GetJniEnv()->locals.Trim();
     ATRACE_END();
-    barrier_->Pass(Thread::Current());
+    // If thread is a running mutator, then act on behalf of the trim thread.
+    // See the code in ThreadList::RunCheckpoint.
+    if (thread->GetState() == kRunnable) {
+      barrier_->Pass(Thread::Current());
+    }
   }
 
  private:
@@ -1079,7 +1083,9 @@
   TrimIndirectReferenceTableClosure closure(&barrier);
   ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
   size_t barrier_count = Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
-  barrier.Increment(self, barrier_count);
+  if (barrier_count != 0) {
+    barrier.Increment(self, barrier_count);
+  }
   ATRACE_END();
 }
 
diff --git a/runtime/oat.h b/runtime/oat.h
index 3e28606..7faf33b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '5', '4', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '5', '5', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index a39de63..5b53078 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -179,7 +179,9 @@
       MutexLock mu(self, *Locks::logging_lock_);
       *os_ << local_os.str();
     }
-    barrier_.Pass(self);
+    if (thread->GetState() == kRunnable) {
+      barrier_.Pass(self);
+    }
   }
 
   void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {
@@ -207,7 +209,9 @@
   }
   DumpCheckpoint checkpoint(&os);
   size_t threads_running_checkpoint = RunCheckpoint(&checkpoint);
-  checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
+  if (threads_running_checkpoint != 0) {
+    checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
+  }
 }
 
 void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2) {
@@ -327,8 +331,7 @@
     Thread::resume_cond_->Broadcast(self);
   }
 
-  // Add one for self.
-  return count + suspended_count_modified_threads.size() + 1;
+  return count;
 }
 
 // Request that a checkpoint function be run on all active (non-suspended)
diff --git a/test/448-multiple-returns/expected.txt b/test/448-multiple-returns/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/448-multiple-returns/expected.txt
diff --git a/test/448-multiple-returns/info.txt b/test/448-multiple-returns/info.txt
new file mode 100644
index 0000000..cdd354b
--- /dev/null
+++ b/test/448-multiple-returns/info.txt
@@ -0,0 +1,2 @@
+Tests inlining of a pattern not generated by DX: multiple
+returns in a single method.
diff --git a/test/448-multiple-returns/smali/MultipleReturns.smali b/test/448-multiple-returns/smali/MultipleReturns.smali
new file mode 100644
index 0000000..23815d8
--- /dev/null
+++ b/test/448-multiple-returns/smali/MultipleReturns.smali
@@ -0,0 +1,45 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LMultipleReturns;
+
+.super Ljava/lang/Object;
+
+.method public static caller()I
+   .registers 1
+   invoke-static {},  LMultipleReturns;->$opt$CalleeReturnVoid()V
+   invoke-static {},  LMultipleReturns;->$opt$CalleeReturnInt()I
+   move-result v0
+   return v0
+.end method
+
+.method public static $opt$CalleeReturnVoid()V
+   .registers 2
+   const/4 v0, 0x0
+   const/4 v1, 0x1
+   if-eq v1, v0, :else
+   return-void
+   :else
+   return-void
+.end method
+
+.method public static $opt$CalleeReturnInt()I
+   .registers 2
+   const/4 v0, 0x0
+   const/4 v1, 0x1
+   if-eq v1, v0, :else
+   return v0
+   :else
+   return v1
+.end method
diff --git a/test/448-multiple-returns/src/Main.java b/test/448-multiple-returns/src/Main.java
new file mode 100644
index 0000000..4050ed1
--- /dev/null
+++ b/test/448-multiple-returns/src/Main.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  public static void main(String[] args) throws Exception {
+    Class<?> c = Class.forName("MultipleReturns");
+    Method m = c.getMethod("caller");
+    int result = (Integer)m.invoke(null);
+    if (result != 0) {
+      throw new Error("Expected 0, got " + result);
+    }
+  }
+}