ARM64: Combine LSR+ADD into ADD_shift for Int32 HDiv/HRem

HDiv/HRem with a constant divisor are optimized by multiplying the
dividend by a sort of reciprocal of the divisor (the magic number).
For Int32 the multiplication is done in a 64-bit register, of which
only the high 32 bits are used.
The multiplication result might need some ADD/SUB corrections.
Currently this is done by extracting the high 32 bits with LSR and
applying the ADD/SUB to them. However, the correcting ADD/SUB can be
applied to the high 32 bits directly, and those bits can then be
extracted by the final right shift. This eliminates the extracting
LSR instruction.

This CL implements this optimization.
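
For illustration, here is a minimal C++ sketch of the new sequence for
'v / 7' (a sketch only, not the generated code; 0x92492493 and 2 are
the well-known magic number and shift for divisor 7, and arithmetic
right shift of signed values is assumed, as on all supported targets):

  #include <cstdint>

  int32_t DivBy7(int32_t v) {
    // The magic number for divisor 7 is negative, so the product
    // needs the ADD-dividend correction.
    int32_t magic = static_cast<int32_t>(0x92492493);
    int64_t product = static_cast<int64_t>(v) * magic;        // smull
    // Apply the correction in the high 32 bits instead of first
    // extracting them with LSR.
    product += static_cast<int64_t>(v) * (int64_t{1} << 32);  // adds x, x, x, lsl #32
    // One ASR both extracts the high 32 bits and applies the final
    // shift (2 for divisor 7).
    int32_t q = static_cast<int32_t>(product >> (32 + 2));    // asr x, x, #34
    if (q < 0) {  // emitted branch-free as 'cinc ..., mi'
      ++q;
    }
    return q;
  }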

Test: test.py --host --optimizing --jit
Test: test.py --target --optimizing --jit
Change-Id: I5ba557aa283291fd76d61ac0eb733cf6ea975116
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 1e098d7..6cd51cc 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3051,56 +3051,32 @@
   return divisor < 0 && magic_number > 0;
 }
 
-// Return true if the result of multiplication of the dividend by a sort of reciprocal
-// of the divisor (magic_number) needs to be corrected. This means additional operations will
-// be generated.
-static inline bool NeedToCorrectMulResult(int64_t magic_number, int64_t divisor) {
-  return NeedToAddDividend(magic_number, divisor) || NeedToSubDividend(magic_number, divisor);
+// Generate code which increments the value in register 'in' by 1 if the value is negative.
+// It is done with 'add out, in, in, lsr #31 or #63'.
+// If the value is the result of an operation setting the N flag, CINC MI can be used
+// instead of ADD. 'use_cond_inc' controls this.
+void InstructionCodeGeneratorARM64::GenerateIncrementNegativeByOne(
+    Register out,
+    Register in,
+    bool use_cond_inc) {
+  if (use_cond_inc) {
+    __ Cinc(out, in, mi);
+  } else {
+    __ Add(out, in, Operand(in, LSR, in.GetSizeInBits() - 1));
+  }
 }
 
-void InstructionCodeGeneratorARM64::GenerateResultDivRemWithAnyConstant(
-    bool is_rem,
-    int final_right_shift,
-    int64_t magic_number,
-    int64_t divisor,
-    Register dividend,
-    Register temp_result,
+// Helper to generate code producing the result of HRem with a constant divisor.
+void InstructionCodeGeneratorARM64::GenerateResultRemWithAnyConstant(
     Register out,
+    Register dividend,
+    Register quotient,
+    int64_t divisor,
     UseScratchRegisterScope* temps_scope) {
-  // The multiplication result might need some corrections to be finalized.
-  // The last correction is to increment by 1, if the result is negative.
-  // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
-  // Such ADD usually has latency 2, e.g. on Cortex-A55.
-  // However if one of the corrections is ADD or SUB, the sign can be detected
-  // with ADDS/SUBS. They set the N flag if the result is negative.
-  // This allows to use CINC MI which has latency 1.
-  bool use_cond_inc = false;
-
-  // As magic_number can be modified to fit into 32 bits, check whether the correction is needed.
-  if (NeedToAddDividend(magic_number, divisor)) {
-    __ Adds(temp_result, temp_result, dividend);
-    use_cond_inc = true;
-  } else if (NeedToSubDividend(magic_number, divisor)) {
-    __ Subs(temp_result, temp_result, dividend);
-    use_cond_inc = true;
-  }
-
-  if (final_right_shift != 0) {
-    __ Asr(temp_result, temp_result, final_right_shift);
-  }
-
-  Register& result = (is_rem) ? temp_result : out;
-  if (use_cond_inc) {
-    __ Cinc(result, temp_result, mi);
-  } else {
-    __ Add(result, temp_result, Operand(temp_result, LSR, temp_result.GetSizeInBits() - 1));
-  }
-  if (is_rem) {
-    // TODO: Strength reduction for msub.
-    Register temp_imm = temps_scope->AcquireSameSizeAs(out);
-    __ Mov(temp_imm, divisor);
-    __ Msub(out, temp_result, temp_imm, dividend);
-  }
+  // TODO: Strength reduction for msub.
+  Register temp_imm = temps_scope->AcquireSameSizeAs(out);
+  __ Mov(temp_imm, divisor);
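+  // out = dividend - quotient * divisor.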
+  __ Msub(out, quotient, temp_imm, dividend);
 }
 
 void InstructionCodeGeneratorARM64::GenerateInt64DivRemWithAnyConstant(
@@ -3127,14 +3103,34 @@
   __ Mov(temp, magic);
   __ Smulh(temp, dividend, temp);
 
-  GenerateResultDivRemWithAnyConstant(/* is_rem= */ instruction->IsRem(),
-                                      /* final_right_shift= */ shift,
-                                      magic,
-                                      imm,
-                                      dividend,
-                                      temp,
-                                      out,
-                                      &temps);
+  // The multiplication result might need some corrections to be finalized.
+  // The last correction is to increment by 1 if the result is negative.
+  // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
+  // Such an ADD usually has latency 2, e.g. on Cortex-A55.
+  // However, if one of the corrections is ADD or SUB, the sign can be detected
+  // with ADDS/SUBS, which set the N flag if the result is negative.
+  // This allows using CINC MI, which has latency 1.
+  bool use_cond_inc = false;
+
+  // As magic_number can be modified to fit into 32 bits, check whether the correction is needed.
+  if (NeedToAddDividend(magic, imm)) {
+    __ Adds(temp, temp, dividend);
+    use_cond_inc = true;
+  } else if (NeedToSubDividend(magic, imm)) {
+    __ Subs(temp, temp, dividend);
+    use_cond_inc = true;
+  }
+
+  if (shift != 0) {
+    __ Asr(temp, temp, shift);
+  }
+
+  if (instruction->IsRem()) {
+    GenerateIncrementNegativeByOne(temp, temp, use_cond_inc);
+    GenerateResultRemWithAnyConstant(out, dividend, temp, imm, &temps);
+  } else {
+    GenerateIncrementNegativeByOne(out, temp, use_cond_inc);
+  }
 }
 
 void InstructionCodeGeneratorARM64::GenerateInt32DivRemWithAnyConstant(
@@ -3160,25 +3156,35 @@
   __ Mov(temp, magic);
   __ Smull(temp.X(), dividend, temp);
 
-  if (NeedToCorrectMulResult(magic, imm)) {
-    __ Lsr(temp.X(), temp.X(), 32);
-  } else {
-    // As between 'lsr temp.X(), temp.X(), #32' and 'asr temp, temp, #shift' there are
-    // no other instructions modifying 'temp', they can be combined into one
-    // 'asr temp.X(), temp.X(), #32 + shift'.
-    DCHECK_LT(shift, 32);
-    __ Asr(temp.X(), temp.X(), 32 + shift);
-    shift = 0;
+  // The multiplication result might need some corrections to be finalized.
+  // The last correction is to increment by 1 if the result is negative.
+  // Currently it is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
+  // Such an ADD usually has latency 2, e.g. on Cortex-A55.
+  // However, if one of the corrections is ADD or SUB, the sign can be detected
+  // with ADDS/SUBS, which set the N flag if the result is negative.
+  // This allows using CINC MI, which has latency 1.
+  bool use_cond_inc = false;
+
+  // The ADD/SUB correction is performed in the high 32 bits; the low 32 bits
+  // can be ignored because only the high half of the product is used for Int32.
+  if (NeedToAddDividend(magic, imm)) {
+    __ Adds(temp.X(), temp.X(), Operand(dividend.X(), LSL, 32));
+    use_cond_inc = true;
+  } else if (NeedToSubDividend(magic, imm)) {
+    __ Subs(temp.X(), temp.X(), Operand(dividend.X(), LSL, 32));
+    use_cond_inc = true;
   }
 
-  GenerateResultDivRemWithAnyConstant(/* is_rem= */ instruction->IsRem(),
-                                      /* final_right_shift= */ shift,
-                                      magic,
-                                      imm,
-                                      dividend,
-                                      temp,
-                                      out,
-                                      &temps);
+  // Extract the result from the high 32 bits and apply the final right shift.
+  DCHECK_LT(shift, 32);
+  __ Asr(temp.X(), temp.X(), 32 + shift);
+
+  if (instruction->IsRem()) {
+    GenerateIncrementNegativeByOne(temp, temp, use_cond_inc);
+    GenerateResultRemWithAnyConstant(out, dividend, temp, imm, &temps);
+  } else {
+    GenerateIncrementNegativeByOne(out, temp, use_cond_inc);
+  }
 }
 
 void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 1e1c2a9..8349732 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -342,22 +342,14 @@
                              vixl::aarch64::Label* false_target);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
-
-  // Helper to generate code producing the final result of HDiv/HRem with a constant divisor.
-  // 'temp_result' holds the result of multiplication of the dividend by a sort of reciprocal
-  // of the divisor (magic_number). Based on magic_number and divisor, temp_result might need
-  // to be corrected before applying final_right_shift.
-  // If the code is generated for HRem the final temp_result is used for producing the
-  // remainder.
-  void GenerateResultDivRemWithAnyConstant(bool is_rem,
-                                           int final_right_shift,
-                                           int64_t magic_number,
-                                           int64_t divisor,
-                                           vixl::aarch64::Register dividend,
-                                           vixl::aarch64::Register temp_result,
-                                           vixl::aarch64::Register out,
-                                           // This function may acquire a scratch register.
-                                           vixl::aarch64::UseScratchRegisterScope* temps_scope);
+  // Helper to increment the value in register 'in' by 1 if it is negative, using either
+  // CINC MI or 'add out, in, in, lsr #31 or #63' depending on 'use_cond_inc'.
+  void GenerateIncrementNegativeByOne(vixl::aarch64::Register out,
+                                      vixl::aarch64::Register in,
+                                      bool use_cond_inc);
+  // Helper to generate code producing the result of HRem with a constant divisor.
+  void GenerateResultRemWithAnyConstant(vixl::aarch64::Register out,
+                                        vixl::aarch64::Register dividend,
+                                        vixl::aarch64::Register quotient,
+                                        int64_t divisor,
+                                        // This function may acquire a scratch register.
+                                        vixl::aarch64::UseScratchRegisterScope* temps_scope);
   void GenerateInt64DivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateInt32DivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
diff --git a/test/411-checker-hdiv-hrem-const/src/DivTest.java b/test/411-checker-hdiv-hrem-const/src/DivTest.java
index 9a1cd47..ed4eb62 100644
--- a/test/411-checker-hdiv-hrem-const/src/DivTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/DivTest.java
@@ -110,34 +110,30 @@
     return r;
   }
 
-  // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+  // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
   // For divisor 7 seen in the core library the result of get_high(dividend * magic)
-  // must be corrected by the 'add' instruction which is between 'lsr' and 'asr'
-  // instructions. In such a case they cannot be combined into one 'asr'.
+  // must be corrected by the 'add' instruction.
   //
   // The test case also checks 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
   //
   /// CHECK-START-ARM64: int DivTest.$noinline$IntDivBy7(int) disassembly (after)
-  /// CHECK:                 lsr x{{\d+}}, x{{\d+}}, #32
-  /// CHECK-NEXT:            adds w{{\d+}}, w{{\d+}}, w{{\d+}}
-  /// CHECK-NEXT:            asr w{{\d+}}, w{{\d+}}, #2
+  /// CHECK:                 adds x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+  /// CHECK-NEXT:            asr  x{{\d+}}, x{{\d+}}, #34
   /// CHECK-NEXT:            cinc w{{\d+}}, w{{\d+}}, mi
   private static int $noinline$IntDivBy7(int v) {
     int r = v / 7;
     return r;
   }
 
-  // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+  // A test case to check that 'lsr' and 'sub' are combined into one 'subs'.
   // Divisor -7 has the same property as divisor 7: the result of get_high(dividend * magic)
-  // must be corrected. In this case it is a 'sub' instruction which is between 'lsr' and 'asr'
-  // instructions. So they cannot be combined into one 'asr'.
+  // must be corrected. In this case it is a 'sub' instruction.
   //
   // The test case also checks 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
   //
   /// CHECK-START-ARM64: int DivTest.$noinline$IntDivByMinus7(int) disassembly (after)
-  /// CHECK:                 lsr x{{\d+}}, x{{\d+}}, #32
-  /// CHECK-NEXT:            subs w{{\d+}}, w{{\d+}}, w{{\d+}}
-  /// CHECK-NEXT:            asr w{{\d+}}, w{{\d+}}, #2
+  /// CHECK:                 subs x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+  /// CHECK-NEXT:            asr  x{{\d+}}, x{{\d+}}, #34
   /// CHECK-NEXT:            cinc w{{\d+}}, w{{\d+}}, mi
   private static int $noinline$IntDivByMinus7(int v) {
     int r = v / -7;
diff --git a/test/411-checker-hdiv-hrem-const/src/RemTest.java b/test/411-checker-hdiv-hrem-const/src/RemTest.java
index 11889c4..2fae275 100644
--- a/test/411-checker-hdiv-hrem-const/src/RemTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/RemTest.java
@@ -114,15 +114,15 @@
     return r;
   }
 
-  // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+  // A test case to check that 'lsr' and 'add' are combined into one 'adds'.
   // For divisor 7 seen in the core library the result of get_high(dividend * magic)
-  // must be corrected by the 'add' instruction which is between 'lsr' and 'asr'
-  // instructions. In such a case they cannot be combined into one 'asr'.
+  // must be corrected by the 'add' instruction.
+  //
+  // The test case also checks 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
   //
   /// CHECK-START-ARM64: int RemTest.$noinline$IntRemBy7(int) disassembly (after)
-  /// CHECK:                 lsr x{{\d+}}, x{{\d+}}, #32
-  /// CHECK-NEXT:            adds w{{\d+}}, w{{\d+}}, w{{\d+}}
-  /// CHECK-NEXT:            asr w{{\d+}}, w{{\d+}}, #2
+  /// CHECK:                 adds x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+  /// CHECK-NEXT:            asr  x{{\d+}}, x{{\d+}}, #34
   /// CHECK-NEXT:            cinc w{{\d+}}, w{{\d+}}, mi
   /// CHECK-NEXT:            mov w{{\d+}}, #0x7
   /// CHECK-NEXT:            msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
@@ -131,15 +131,15 @@
     return r;
   }
 
-  // A test case to check that 'lsr' and 'asr' are not combined into one 'asr'.
+  // A test case to check that 'lsr' and 'sub' are combined into one 'subs'.
   // Divisor -7 has the same property as divisor 7: the result of get_high(dividend * magic)
-  // must be corrected. In this case it is a 'sub' instruction which is between 'lsr' and 'asr'
-  // instructions. So they cannot be combined into one 'asr'.
+  // must be corrected. In this case it is a 'sub' instruction.
+  //
+  // The test case also checks 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
   //
   /// CHECK-START-ARM64: int RemTest.$noinline$IntRemByMinus7(int) disassembly (after)
-  /// CHECK:                 lsr x{{\d+}}, x{{\d+}}, #32
-  /// CHECK-NEXT:            subs w{{\d+}}, w{{\d+}}, w{{\d+}}
-  /// CHECK-NEXT:            asr w{{\d+}}, w{{\d+}}, #2
+  /// CHECK:                 subs x{{\d+}}, x{{\d+}}, x{{\d+}}, lsl #32
+  /// CHECK-NEXT:            asr  x{{\d+}}, x{{\d+}}, #34
   /// CHECK-NEXT:            cinc w{{\d+}}, w{{\d+}}, mi
   /// CHECK-NEXT:            mov w{{\d+}}, #0xfffffff9
   /// CHECK-NEXT:            msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}