Opt Compiler: ARM64: Enable explicit memory barriers over acquire/release
Implement the remaining explicit memory barrier code paths and temporarily
enable explicit memory barriers for testing.
This CL also wires instruction set features into the ARM64 backend.
kUseAcquireRelease has been replaced with PreferAcquireRelease(), which
for now statically returns false (prefer explicit memory barriers).
Note that we still prefer acquire-release for the ARM64 Optimizing
Compiler; explicit memory barriers are enabled here only so that this
code path is exercised as well.
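
For reference, a minimal sketch of the new query, assuming it sits on
Arm64InstructionSetFeatures in arch/arm64/instruction_set_features_arm64.h
(that file is not part of this diff, so the exact shape is an assumption):

    // Sketch only: replaces the compile-time constant kUseAcquireRelease.
    // Statically false for now so the explicit-barrier path gets coverage;
    // acquire-release remains the intended long-term default.
    bool Arm64InstructionSetFeatures::PreferAcquireRelease() const {
      return false;
    }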
Change-Id: I84e047ecd43b6fbefc5b82cf532e3f5c59076458
Signed-off-by: Serban Constantinescu <serban.constantinescu@arm.com>
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 2a57fdc..ba5f7d8 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -386,7 +386,9 @@
compiler_options);
}
case kArm64: {
- return new arm64::CodeGeneratorARM64(graph, compiler_options);
+ return new arm64::CodeGeneratorARM64(graph,
+ *isa_features.AsArm64InstructionSetFeatures(),
+ compiler_options);
}
case kMips:
return nullptr;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 729bab7..c21084a 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -16,6 +16,7 @@
#include "code_generator_arm64.h"
+#include "arch/arm64/instruction_set_features_arm64.h"
#include "common_arm64.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "entrypoints/quick/quick_entrypoints_enum.h"
@@ -397,7 +398,9 @@
return next_location;
}
-CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, const CompilerOptions& compiler_options)
+CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
+ const Arm64InstructionSetFeatures& isa_features,
+ const CompilerOptions& compiler_options)
: CodeGenerator(graph,
kNumberOfAllocatableRegisters,
kNumberOfAllocatableFPRegisters,
@@ -408,7 +411,8 @@
block_labels_(nullptr),
location_builder_(graph, this),
instruction_visitor_(graph, this),
- move_resolver_(graph->GetArena(), this) {
+ move_resolver_(graph->GetArena(), this),
+ isa_features_(isa_features) {
// Save the link register (containing the return address) to mimic Quick.
AddAllocatedRegister(LocationFrom(lr));
}
@@ -998,9 +1002,10 @@
UseScratchRegisterScope temps(GetVIXLAssembler());
Register temp = temps.AcquireW();
size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+ bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
// Even if the initialized flag is set, we need to ensure consistent memory ordering.
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
// TODO(vixl): Let the MacroAssembler handle MemOperand.
__ Add(temp, class_reg, status_offset);
__ Ldar(temp, HeapOperand(temp));
@@ -1689,9 +1694,10 @@
void InstructionCodeGeneratorARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
+ bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
if (instruction->IsVolatile()) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
// NB: LoadAcquire will record the pc info if needed.
codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
} else {
@@ -1718,9 +1724,10 @@
CPURegister value = InputCPURegisterAt(instruction, 1);
Offset offset = instruction->GetFieldOffset();
Primitive::Type field_type = instruction->GetFieldType();
+ bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
if (instruction->IsVolatile()) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
codegen_->StoreRelease(field_type, value, HeapOperand(obj, offset));
codegen_->MaybeRecordImplicitNullCheck(instruction);
} else {
@@ -2437,9 +2444,10 @@
void InstructionCodeGeneratorARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
+ bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
if (instruction->IsVolatile()) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
// NB: LoadAcquire will record the pc info if needed.
codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
} else {
@@ -2464,9 +2472,10 @@
CPURegister value = InputCPURegisterAt(instruction, 1);
Offset offset = instruction->GetFieldOffset();
Primitive::Type field_type = instruction->GetFieldType();
+ bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
if (instruction->IsVolatile()) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
codegen_->StoreRelease(field_type, value, HeapOperand(cls, offset));
} else {
GenerateMemoryBarrier(MemBarrierKind::kAnyStore);
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index afb7fc3..48961d6 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -32,10 +32,6 @@
class CodeGeneratorARM64;
-// TODO: Tune the use of Load-Acquire, Store-Release vs Data Memory Barriers.
-// For now we prefer the use of load-acquire, store-release over explicit memory barriers.
-static constexpr bool kUseAcquireRelease = true;
-
// Use a local definition to prevent copying mistakes.
static constexpr size_t kArm64WordSize = kArm64PointerSize;
@@ -195,7 +191,9 @@
class CodeGeneratorARM64 : public CodeGenerator {
public:
- CodeGeneratorARM64(HGraph* graph, const CompilerOptions& compiler_options);
+ CodeGeneratorARM64(HGraph* graph,
+ const Arm64InstructionSetFeatures& isa_features,
+ const CompilerOptions& compiler_options);
virtual ~CodeGeneratorARM64() {}
void GenerateFrameEntry() OVERRIDE;
@@ -273,6 +271,10 @@
return InstructionSet::kArm64;
}
+ const Arm64InstructionSetFeatures& GetInstructionSetFeatures() const {
+ return isa_features_;
+ }
+
void Initialize() OVERRIDE {
HGraph* graph = GetGraph();
int length = graph->GetBlocks().Size();
@@ -317,6 +319,7 @@
InstructionCodeGeneratorARM64 instruction_visitor_;
ParallelMoveResolverARM64 move_resolver_;
Arm64Assembler assembler_;
+ const Arm64InstructionSetFeatures& isa_features_;
DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM64);
};
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index e0e0b4c..868fc5b 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -18,6 +18,7 @@
#include "arch/instruction_set.h"
#include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm64/instruction_set_features_arm64.h"
#include "base/macros.h"
#include "builder.h"
#include "code_generator_arm.h"
@@ -115,9 +116,9 @@
Run(allocator, codegenX86, has_result, expected);
}
- std::unique_ptr<const ArmInstructionSetFeatures> features(
+ std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
ArmInstructionSetFeatures::FromCppDefines());
- TestCodeGeneratorARM codegenARM(graph, *features.get(), compiler_options);
+ TestCodeGeneratorARM codegenARM(graph, *features_arm.get(), compiler_options);
codegenARM.CompileBaseline(&allocator, true);
if (kRuntimeISA == kArm || kRuntimeISA == kThumb2) {
Run(allocator, codegenARM, has_result, expected);
@@ -129,7 +130,9 @@
Run(allocator, codegenX86_64, has_result, expected);
}
- arm64::CodeGeneratorARM64 codegenARM64(graph, compiler_options);
+ std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
+ Arm64InstructionSetFeatures::FromCppDefines());
+ arm64::CodeGeneratorARM64 codegenARM64(graph, *features_arm64.get(), compiler_options);
codegenARM64.CompileBaseline(&allocator, true);
if (kRuntimeISA == kArm64) {
Run(allocator, codegenARM64, has_result, expected);
@@ -166,7 +169,9 @@
compiler_options);
RunCodeOptimized(&codegenARM, graph, hook_before_codegen, has_result, expected);
} else if (kRuntimeISA == kArm64) {
- arm64::CodeGeneratorARM64 codegenARM64(graph, compiler_options);
+ arm64::CodeGeneratorARM64 codegenARM64(graph,
+ *Arm64InstructionSetFeatures::FromCppDefines(),
+ compiler_options);
RunCodeOptimized(&codegenARM64, graph, hook_before_codegen, has_result, expected);
} else if (kRuntimeISA == kX86) {
x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 8874edc..1ddff8a 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -16,6 +16,7 @@
#include "intrinsics_arm64.h"
+#include "arch/arm64/instruction_set_features_arm64.h"
#include "code_generator_arm64.h"
#include "common_arm64.h"
#include "entrypoints/quick/quick_entrypoints.h"
@@ -682,10 +683,11 @@
Register base = WRegisterFrom(locations->InAt(1)); // Object pointer.
Register offset = XRegisterFrom(locations->InAt(2)); // Long offset.
Register trg = RegisterFrom(locations->Out(), type);
+ bool use_acquire_release = codegen->GetInstructionSetFeatures().PreferAcquireRelease();
MemOperand mem_op(base.X(), offset);
if (is_volatile) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
codegen->LoadAcquire(invoke, trg, mem_op);
} else {
codegen->Load(type, trg, mem_op);
@@ -792,11 +794,12 @@
Register base = WRegisterFrom(locations->InAt(1)); // Object pointer.
Register offset = XRegisterFrom(locations->InAt(2)); // Long offset.
Register value = RegisterFrom(locations->InAt(3), type);
+ bool use_acquire_release = codegen->GetInstructionSetFeatures().PreferAcquireRelease();
MemOperand mem_op(base.X(), offset);
if (is_volatile || is_ordered) {
- if (kUseAcquireRelease) {
+ if (use_acquire_release) {
codegen->StoreRelease(type, value, mem_op);
} else {
__ Dmb(InnerShareable, BarrierAll);
@@ -856,10 +859,7 @@
}
static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorARM64* codegen) {
- // TODO: Currently we use acquire-release load-stores in the CAS loop. One could reasonably write
- // a version relying on simple exclusive load-stores and barriers instead.
- static_assert(kUseAcquireRelease, "Non-acquire-release inlined CAS not implemented, yet.");
-
+ bool use_acquire_release = codegen->GetInstructionSetFeatures().PreferAcquireRelease();
vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
Register out = WRegisterFrom(locations->Out()); // Boolean result.
@@ -889,15 +889,23 @@
// result = tmp_value != 0;
vixl::Label loop_head, exit_loop;
- __ Bind(&loop_head);
-
- __ Ldaxr(tmp_value, MemOperand(tmp_ptr));
- __ Cmp(tmp_value, expected);
- __ B(&exit_loop, ne);
-
- __ Stlxr(tmp_32, value, MemOperand(tmp_ptr));
- __ Cbnz(tmp_32, &loop_head);
-
+ if (use_acquire_release) {
+ __ Bind(&loop_head);
+ __ Ldaxr(tmp_value, MemOperand(tmp_ptr));
+ __ Cmp(tmp_value, expected);
+ __ B(&exit_loop, ne);
+ __ Stlxr(tmp_32, value, MemOperand(tmp_ptr));
+ __ Cbnz(tmp_32, &loop_head);
+ } else {
+ __ Dmb(InnerShareable, BarrierWrites);
+ __ Bind(&loop_head);
+ __ Ldxr(tmp_value, MemOperand(tmp_ptr));
+ __ Cmp(tmp_value, expected);
+ __ B(&exit_loop, ne);
+ __ Stxr(tmp_32, value, MemOperand(tmp_ptr));
+ __ Cbnz(tmp_32, &loop_head);
+ __ Dmb(InnerShareable, BarrierAll);
+ }
__ Bind(&exit_loop);
__ Cset(out, eq);
}
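
For context, the tradeoff that PreferAcquireRelease() selects between can be
sketched with standard C++11 atomics (illustrative only, not ART code): a
release store lowers to a single stlr on ARMv8, while the explicit-barrier
shape is a plain store bracketed by dmb fences, mirroring the kAnyStore
barrier before the store and the kAnyAny barrier after it.

    #include <atomic>

    std::atomic<int> flag{0};

    // Acquire-release shape: compiles to stlr on ARM64.
    void StoreReleaseStyle(int v) {
      flag.store(v, std::memory_order_release);
    }

    // Explicit-barrier shape: dmb + plain str + dmb.
    void ExplicitBarrierStyle(int v) {
      std::atomic_thread_fence(std::memory_order_release);  // kAnyStore: dmb ish
      flag.store(v, std::memory_order_relaxed);              // plain str
      std::atomic_thread_fence(std::memory_order_seq_cst);   // kAnyAny: dmb ish
    }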