ARM64: Combine LSR+ADD into ADD_shift for Int32 HDiv/HRem
HDiv/HRem with a constant divisor are optimized by multiplying
the dividend by a sort of reciprocal of the divisor.
In the case of Int32 the multiplication is done into a 64-bit register,
of which only the high 32 bits are used.
The multiplication result might need some ADD/SUB corrections.
Currently this is done by extracting the high 32 bits with LSR and then
applying ADD/SUB. However, we can apply the correcting ADD/SUB to the
high 32 bits directly and extract those bits with the final right shift,
which eliminates the extracting LSR instruction.
This CL implements this optimization.
Test: test.py --host --optimizing --jit
Test: test.py --target --optimizing --jit
Change-Id: I5ba557aa283291fd76d61ac0eb733cf6ea975116
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 1e098d7..6cd51cc 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3051,56 +3051,32 @@
return divisor < 0 && magic_number > 0;
}
-// Return true if the result of multiplication of the dividend by a sort of reciprocal
-// of the divisor (magic_number) needs to be corrected. This means additional operations will
-// be generated.
-static inline bool NeedToCorrectMulResult(int64_t magic_number, int64_t divisor) {
- return NeedToAddDividend(magic_number, divisor) || NeedToSubDividend(magic_number, divisor);
+// Generate code which increments the value in register 'in' by 1 if the value is negative.
+// It is done with 'add out, in, in, lsr #31 or #63'.
+// If the value is a result of an operation setting the N flag, CINC MI can be used
+// instead of ADD. 'use_cond_inc' controls this.
+void InstructionCodeGeneratorARM64::GenerateIncrementNegativeByOne(
+ Register out,
+ Register in,
+ bool use_cond_inc) {
+ if (use_cond_inc) {
+ __ Cinc(out, in, mi);
+ } else {
+ __ Add(out, in, Operand(in, LSR, in.GetSizeInBits() - 1));
+ }
}
-void InstructionCodeGeneratorARM64::GenerateResultDivRemWithAnyConstant(
- bool is_rem,
- int final_right_shift,
- int64_t magic_number,
- int64_t divisor,
- Register dividend,
- Register temp_result,
+// Helper to generate code producing the result of HRem with a constant divisor.
+void InstructionCodeGeneratorARM64::GenerateResultRemWithAnyConstant(
Register out,
+ Register dividend,
+ Register quotient,
+ int64_t divisor,
UseScratchRegisterScope* temps_scope) {
- // The multiplication result might need some corrections to be finalized.
- // The last correction is to increment by 1, if the result is negative.
- // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
- // Such ADD usually has latency 2, e.g. on Cortex-A55.
- // However if one of the corrections is ADD or SUB, the sign can be detected
- // with ADDS/SUBS. They set the N flag if the result is negative.
- // This allows to use CINC MI which has latency 1.
- bool use_cond_inc = false;
-
- // As magic_number can be modified to fit into 32 bits, check whether the correction is needed.
- if (NeedToAddDividend(magic_number, divisor)) {
- __ Adds(temp_result, temp_result, dividend);
- use_cond_inc = true;
- } else if (NeedToSubDividend(magic_number, divisor)) {
- __ Subs(temp_result, temp_result, dividend);
- use_cond_inc = true;
- }
-
- if (final_right_shift != 0) {
- __ Asr(temp_result, temp_result, final_right_shift);
- }
-
- Register& result = (is_rem) ? temp_result : out;
- if (use_cond_inc) {
- __ Cinc(result, temp_result, mi);
- } else {
- __ Add(result, temp_result, Operand(temp_result, LSR, temp_result.GetSizeInBits() - 1));
- }
- if (is_rem) {
- // TODO: Strength reduction for msub.
- Register temp_imm = temps_scope->AcquireSameSizeAs(out);
- __ Mov(temp_imm, divisor);
- __ Msub(out, temp_result, temp_imm, dividend);
- }
+ // TODO: Strength reduction for msub.
+ Register temp_imm = temps_scope->AcquireSameSizeAs(out);
+ __ Mov(temp_imm, divisor);
+ __ Msub(out, quotient, temp_imm, dividend);
}
void InstructionCodeGeneratorARM64::GenerateInt64DivRemWithAnyConstant(
@@ -3127,14 +3103,34 @@
__ Mov(temp, magic);
__ Smulh(temp, dividend, temp);
- GenerateResultDivRemWithAnyConstant(/* is_rem= */ instruction->IsRem(),
- /* final_right_shift= */ shift,
- magic,
- imm,
- dividend,
- temp,
- out,
- &temps);
+ // The multiplication result might need some corrections to be finalized.
+ // The last correction is to increment by 1, if the result is negative.
+ // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
+ // Such ADD usually has latency 2, e.g. on Cortex-A55.
+ // However if one of the corrections is ADD or SUB, the sign can be detected
+ // with ADDS/SUBS. They set the N flag if the result is negative.
+ // This allows to use CINC MI which has latency 1.
+ bool use_cond_inc = false;
+
+ // As magic_number can be modified to fit into 32 bits, check whether the correction is needed.
+ if (NeedToAddDividend(magic, imm)) {
+ __ Adds(temp, temp, dividend);
+ use_cond_inc = true;
+ } else if (NeedToSubDividend(magic, imm)) {
+ __ Subs(temp, temp, dividend);
+ use_cond_inc = true;
+ }
+
+ if (shift != 0) {
+ __ Asr(temp, temp, shift);
+ }
+
+ if (instruction->IsRem()) {
+ GenerateIncrementNegativeByOne(temp, temp, use_cond_inc);
+ GenerateResultRemWithAnyConstant(out, dividend, temp, imm, &temps);
+ } else {
+ GenerateIncrementNegativeByOne(out, temp, use_cond_inc);
+ }
}
void InstructionCodeGeneratorARM64::GenerateInt32DivRemWithAnyConstant(
@@ -3160,25 +3156,35 @@
__ Mov(temp, magic);
__ Smull(temp.X(), dividend, temp);
- if (NeedToCorrectMulResult(magic, imm)) {
- __ Lsr(temp.X(), temp.X(), 32);
- } else {
- // As between 'lsr temp.X(), temp.X(), #32' and 'asr temp, temp, #shift' there are
- // no other instructions modifying 'temp', they can be combined into one
- // 'asr temp.X(), temp.X(), #32 + shift'.
- DCHECK_LT(shift, 32);
- __ Asr(temp.X(), temp.X(), 32 + shift);
- shift = 0;
+ // The multiplication result might need some corrections to be finalized.
+ // The last correction is to increment by 1, if the result is negative.
+ // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
+ // Such ADD usually has latency 2, e.g. on Cortex-A55.
+ // However if one of the corrections is ADD or SUB, the sign can be detected
+ // with ADDS/SUBS. They set the N flag if the result is negative.
+ // This allows to use CINC MI which has latency 1.
+ bool use_cond_inc = false;
+
+ // The ADD/SUB correction is performed in the high 32 bits because only
+ // the high 32 bits of the result are used, as the type is kInt32.
+ if (NeedToAddDividend(magic, imm)) {
+ __ Adds(temp.X(), temp.X(), Operand(dividend.X(), LSL, 32));
+ use_cond_inc = true;
+ } else if (NeedToSubDividend(magic, imm)) {
+ __ Subs(temp.X(), temp.X(), Operand(dividend.X(), LSL, 32));
+ use_cond_inc = true;
}
- GenerateResultDivRemWithAnyConstant(/* is_rem= */ instruction->IsRem(),
- /* final_right_shift= */ shift,
- magic,
- imm,
- dividend,
- temp,
- out,
- &temps);
+ // Extract the result from the high 32 bits and apply the final right shift.
+ DCHECK_LT(shift, 32);
+ __ Asr(temp.X(), temp.X(), 32 + shift);
+
+ if (instruction->IsRem()) {
+ GenerateIncrementNegativeByOne(temp, temp, use_cond_inc);
+ GenerateResultRemWithAnyConstant(out, dividend, temp, imm, &temps);
+ } else {
+ GenerateIncrementNegativeByOne(out, temp, use_cond_inc);
+ }
}
void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 1e1c2a9..8349732 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -342,22 +342,14 @@
vixl::aarch64::Label* false_target);
void DivRemOneOrMinusOne(HBinaryOperation* instruction);
void DivRemByPowerOfTwo(HBinaryOperation* instruction);
-
- // Helper to generate code producing the final result of HDiv/HRem with a constant divisor.
- // 'temp_result' holds the result of multiplication of the dividend by a sort of reciprocal
- // of the divisor (magic_number). Based on magic_number and divisor, temp_result might need
- // to be corrected before applying final_right_shift.
- // If the code is generated for HRem the final temp_result is used for producing the
- // remainder.
- void GenerateResultDivRemWithAnyConstant(bool is_rem,
- int final_right_shift,
- int64_t magic_number,
- int64_t divisor,
- vixl::aarch64::Register dividend,
- vixl::aarch64::Register temp_result,
- vixl::aarch64::Register out,
- // This function may acquire a scratch register.
- vixl::aarch64::UseScratchRegisterScope* temps_scope);
+ void GenerateIncrementNegativeByOne(vixl::aarch64::Register out,
+ vixl::aarch64::Register in, bool use_cond_inc);
+ void GenerateResultRemWithAnyConstant(vixl::aarch64::Register out,
+ vixl::aarch64::Register dividend,
+ vixl::aarch64::Register quotient,
+ int64_t divisor,
+ // This function may acquire a scratch register.
+ vixl::aarch64::UseScratchRegisterScope* temps_scope);
void GenerateInt64DivRemWithAnyConstant(HBinaryOperation* instruction);
void GenerateInt32DivRemWithAnyConstant(HBinaryOperation* instruction);
void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
diff --git a/test/411-checker-hdiv-hrem-const/src/DivTest.java b/test/411-checker-hdiv-hrem-const/src/DivTest.java
index 9a1cd47..ed4eb62 100644
--- a/test/411-checker-hdiv-hrem-const/src/DivTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/DivTest.java
@@ -110,34 +110,30 @@
return r;
}
- // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+ // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
// For divisor 7 seen in the core library the result of get_high(dividend * magic)
- // must be corrected by the 'add' instruction which is between 'lsr' and 'asr'
- // instructions. In such a case they cannot be combined into one 'asr'.
+ // must be corrected by the 'add' instruction.
//
// The test case also checks 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
//
/// CHECK-START-ARM64: int DivTest.$noinline$IntDivBy7(int) disassembly (after)
- /// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: adds w{{\d+}}, w{{\d+}}, w{{\d+}}
- /// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
+ /// CHECK: adds x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #34
/// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
private static int $noinline$IntDivBy7(int v) {
int r = v / 7;
return r;
}
- // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+ // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
// Divisor -7 has the same property as divisor 7: the result of get_high(dividend * magic)
- // must be corrected. In this case it is a 'sub' instruction which is between 'lsr' and 'asr'
- // instructions. So they cannot be combined into one 'asr'.
+ // must be corrected. In this case it is a 'sub' instruction.
//
// The test case also checks 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
//
/// CHECK-START-ARM64: int DivTest.$noinline$IntDivByMinus7(int) disassembly (after)
- /// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: subs w{{\d+}}, w{{\d+}}, w{{\d+}}
- /// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
+ /// CHECK: subs x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #34
/// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
private static int $noinline$IntDivByMinus7(int v) {
int r = v / -7;
diff --git a/test/411-checker-hdiv-hrem-const/src/RemTest.java b/test/411-checker-hdiv-hrem-const/src/RemTest.java
index 11889c4..2fae275 100644
--- a/test/411-checker-hdiv-hrem-const/src/RemTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/RemTest.java
@@ -114,15 +114,15 @@
return r;
}
- // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+ // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
// For divisor 7 seen in the core library the result of get_high(dividend * magic)
- // must be corrected by the 'add' instruction which is between 'lsr' and 'asr'
- // instructions. In such a case they cannot be combined into one 'asr'.
+ // must be corrected by the 'add' instruction.
+ //
+ // The test case also checks 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
//
/// CHECK-START-ARM64: int RemTest.$noinline$IntRemBy7(int) disassembly (after)
- /// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: adds w{{\d+}}, w{{\d+}}, w{{\d+}}
- /// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
+ /// CHECK: adds x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #34
/// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
/// CHECK-NEXT: mov w{{\d+}}, #0x7
/// CHECK-NEXT: msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
@@ -131,15 +131,15 @@
return r;
}
- // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+ // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
// Divisor -7 has the same property as divisor 7: the result of get_high(dividend * magic)
- // must be corrected. In this case it is a 'sub' instruction which is between 'lsr' and 'asr'
- // instructions. So they cannot be combined into one 'asr'.
+ // must be corrected. In this case it is a 'sub' instruction.
+ //
+ // The test case also checks 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
//
/// CHECK-START-ARM64: int RemTest.$noinline$IntRemByMinus7(int) disassembly (after)
- /// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: subs w{{\d+}}, w{{\d+}}, w{{\d+}}
- /// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
+ /// CHECK: subs x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #34
/// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
/// CHECK-NEXT: mov w{{\d+}}, #0xfffffff9
/// CHECK-NEXT: msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}