[optimizing] Use callee-save registers for x86

Add ESI, EDI, EBP to available registers for non-baseline mode. Ensure
that they aren't used when byte addressible registers are needed.

Change-Id: Ie7130d4084c2ae9cfcd1e47c26eb3e5dcac1ebd6
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index ba5f7d8..ed3f949 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -216,6 +216,29 @@
   }
 }
 
+void CodeGenerator::BlockIfInRegister(Location location, bool is_out) const {
+  // The DCHECKS below check that a register is not specified twice in
+  // the summary. The out location can overlap with an input, so we need
+  // to special case it.
+  if (location.IsRegister()) {
+    DCHECK(is_out || !blocked_core_registers_[location.reg()]);
+    blocked_core_registers_[location.reg()] = true;
+  } else if (location.IsFpuRegister()) {
+    DCHECK(is_out || !blocked_fpu_registers_[location.reg()]);
+    blocked_fpu_registers_[location.reg()] = true;
+  } else if (location.IsFpuRegisterPair()) {
+    DCHECK(is_out || !blocked_fpu_registers_[location.AsFpuRegisterPairLow<int>()]);
+    blocked_fpu_registers_[location.AsFpuRegisterPairLow<int>()] = true;
+    DCHECK(is_out || !blocked_fpu_registers_[location.AsFpuRegisterPairHigh<int>()]);
+    blocked_fpu_registers_[location.AsFpuRegisterPairHigh<int>()] = true;
+  } else if (location.IsRegisterPair()) {
+    DCHECK(is_out || !blocked_core_registers_[location.AsRegisterPairLow<int>()]);
+    blocked_core_registers_[location.AsRegisterPairLow<int>()] = true;
+    DCHECK(is_out || !blocked_core_registers_[location.AsRegisterPairHigh<int>()]);
+    blocked_core_registers_[location.AsRegisterPairHigh<int>()] = true;
+  }
+}
+
 void CodeGenerator::AllocateRegistersLocally(HInstruction* instruction) const {
   LocationSummary* locations = instruction->GetLocations();
   if (locations == nullptr) return;
@@ -234,46 +257,19 @@
 
   // Mark all fixed input, temp and output registers as used.
   for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
-    Location loc = locations->InAt(i);
-    // The DCHECKS below check that a register is not specified twice in
-    // the summary.
-    if (loc.IsRegister()) {
-      DCHECK(!blocked_core_registers_[loc.reg()]);
-      blocked_core_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegister()) {
-      DCHECK(!blocked_fpu_registers_[loc.reg()]);
-      blocked_fpu_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegisterPair()) {
-      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
-      blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
-      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
-      blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
-    } else if (loc.IsRegisterPair()) {
-      DCHECK(!blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
-      blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
-      DCHECK(!blocked_core_registers_[loc.AsRegisterPairHigh<int>()]);
-      blocked_core_registers_[loc.AsRegisterPairHigh<int>()] = true;
-    }
+    BlockIfInRegister(locations->InAt(i));
   }
 
   for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
     Location loc = locations->GetTemp(i);
-    // The DCHECKS below check that a register is not specified twice in
-    // the summary.
-    if (loc.IsRegister()) {
-      DCHECK(!blocked_core_registers_[loc.reg()]);
-      blocked_core_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegister()) {
-      DCHECK(!blocked_fpu_registers_[loc.reg()]);
-      blocked_fpu_registers_[loc.reg()] = true;
-    } else {
-      DCHECK(loc.GetPolicy() == Location::kRequiresRegister
-             || loc.GetPolicy() == Location::kRequiresFpuRegister);
-    }
+    BlockIfInRegister(loc);
+  }
+  Location result_location = locations->Out();
+  if (locations->OutputCanOverlapWithInputs()) {
+    BlockIfInRegister(result_location, /* is_out */ true);
   }
 
-  static constexpr bool kBaseline = true;
-  SetupBlockedRegisters(kBaseline);
+  SetupBlockedRegisters(/* is_baseline */ true);
 
   // Allocate all unallocated input locations.
   for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
@@ -318,7 +314,6 @@
       locations->SetTempAt(i, loc);
     }
   }
-  Location result_location = locations->Out();
   if (result_location.IsUnallocated()) {
     switch (result_location.GetPolicy()) {
       case Location::kAny:
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index f46a36d..5146afa 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -378,6 +378,7 @@
   void InitLocationsBaseline(HInstruction* instruction);
   size_t GetStackOffsetOfSavedRegister(size_t index);
   void CompileInternal(CodeAllocator* allocator, bool is_baseline);
+  void BlockIfInRegister(Location location, bool is_out = false) const;
 
   HGraph* const graph_;
   const CompilerOptions& compiler_options_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 3c8f62c..87efa6c 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -37,14 +37,13 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX, EBX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
+static constexpr Register kCoreCalleeSaves[] = { EBP, ESI, EDI };
 static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1, XMM2, XMM3 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
 
 static constexpr int kC2ConditionMask = 0x400;
 
-// Marker for places that can be updated once we don't follow the quick ABI.
-static constexpr bool kFollowsQuickABI = true;
 static constexpr int kFakeReturnRegister = Register(8);
 
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, XmmRegister> {
@@ -371,8 +370,15 @@
 }
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
-    : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters,
-                    kNumberOfRegisterPairs, (1 << kFakeReturnRegister), 0, compiler_options),
+    : CodeGenerator(graph,
+                    kNumberOfCpuRegisters,
+                    kNumberOfXmmRegisters,
+                    kNumberOfRegisterPairs,
+                    ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+                                        arraysize(kCoreCalleeSaves))
+                        | (1 << kFakeReturnRegister),
+                        0,
+                        compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -427,18 +433,18 @@
   return Location();
 }
 
-void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
+void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline) const {
   // Don't allocate the dalvik style register pair passing.
   blocked_register_pairs_[ECX_EDX] = true;
 
   // Stack register is always reserved.
   blocked_core_registers_[ESP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  DCHECK(kFollowsQuickABI);
-  blocked_core_registers_[EBP] = true;
-  blocked_core_registers_[ESI] = true;
-  blocked_core_registers_[EDI] = true;
+  if (is_baseline) {
+    blocked_core_registers_[EBP] = true;
+    blocked_core_registers_[ESI] = true;
+    blocked_core_registers_[EDI] = true;
+  }
 
   UpdateBlockedPairRegisters();
 }
@@ -470,15 +476,33 @@
     RecordPcInfo(nullptr, 0);
   }
 
-  if (!HasEmptyFrame()) {
-    __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
-    __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
+  if (HasEmptyFrame()) {
+    return;
   }
+
+  for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
+    Register reg = kCoreCalleeSaves[i];
+    if (allocated_registers_.ContainsCoreRegister(reg)) {
+      __ pushl(reg);
+    }
+  }
+
+  __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
 }
 
 void CodeGeneratorX86::GenerateFrameExit() {
-  if (!HasEmptyFrame()) {
-    __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  if (HasEmptyFrame()) {
+    return;
+  }
+
+  __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+
+  for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+    Register reg = kCoreCalleeSaves[i];
+    if (allocated_registers_.ContainsCoreRegister(reg)) {
+      __ popl(reg);
+    }
   }
 }
 
@@ -907,7 +931,8 @@
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::Any());
   if (comp->NeedsMaterialization()) {
-    locations->SetOut(Location::RequiresRegister());
+    // We need a byte register.
+    locations->SetOut(Location::RegisterLocation(ECX));
   }
 }
 
@@ -1345,8 +1370,10 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
-          locations->SetInAt(0, Location::Any());
-          locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+          locations->SetInAt(0, Location::ByteRegisterOrConstant(ECX, conversion->InputAt(0)));
+          // Make the output overlap to please the register allocator. This greatly simplifies
+          // the validation of the linear scan implementation
+          locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
           break;
 
         default:
@@ -3161,15 +3188,16 @@
 }
 
 void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) {
+  // This location builder might end up asking to up to four registers, which is
+  // not currently possible for baseline. The situation in which we need four
+  // registers cannot be met by baseline though, because it has not run any
+  // optimization.
+
   Primitive::Type value_type = instruction->GetComponentType();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
-  DCHECK(kFollowsQuickABI);
-  bool not_enough_registers = needs_write_barrier
-      && !instruction->GetValue()->IsConstant()
-      && !instruction->GetIndex()->IsConstant();
-  bool needs_runtime_call = instruction->NeedsTypeCheck() || not_enough_registers;
+  bool needs_runtime_call = instruction->NeedsTypeCheck();
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
diff --git a/test/456-baseline-array-set/expected.txt b/test/456-baseline-array-set/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/456-baseline-array-set/expected.txt
diff --git a/test/456-baseline-array-set/info.txt b/test/456-baseline-array-set/info.txt
new file mode 100644
index 0000000..cf4137a
--- /dev/null
+++ b/test/456-baseline-array-set/info.txt
@@ -0,0 +1,3 @@
+Test for optimizing on x86, where we could run out
+of available registers when using the baseline register
+allocator.
diff --git a/test/456-baseline-array-set/src/Main.java b/test/456-baseline-array-set/src/Main.java
new file mode 100644
index 0000000..5475b41
--- /dev/null
+++ b/test/456-baseline-array-set/src/Main.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    doArrayAccess(new Integer(1), 0);
+  }
+
+  public static void doArrayAccess(Integer value, int index) {
+    try {
+      Integer[] array = new Integer[2];
+      // If we were to do optimization on the baseline register
+      // allocator, generating code for the array set would fail on x86.
+      array[index] = array[index + 1];
+      array[index] = value;
+    } catch (ArrayStoreException e) {
+      throw e;
+    }
+  }
+}