MIPS32: Pass more arguments in registers.

Specifically, use A0-A3,T0-T1 for non-floats and F8-F19 for floats.

Test: booted MIPS32R2 in QEMU
Test: test-art-target-run-test-optimizing (MIPS32R2) on CI20
Test: test-art-target-gtest (MIPS32R2) on CI20
Test: booted MIPS64 (with 2nd arch MIPS32R6) in QEMU
Test: test-art-target-run-test-optimizing (MIPS32R6) in QEMU
Test: test-art-target-gtest (MIPS32R6) in QEMU
Test: test-art-host-gtest

Change-Id: Ib8b0310a109d9f3d70119c1e605e54b013e60728
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index fe82878..bf1d4ea 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -134,13 +134,23 @@
   // | Method*    | ---
   // | RA         |
   // | ...        |    callee saves
+  // | T1         |    arg5
+  // | T0         |    arg4
   // | A3         |    arg3
   // | A2         |    arg2
   // | A1         |    arg1
+  // | F19        |
+  // | F18        |    f_arg5
+  // | F17        |
+  // | F16        |    f_arg4
   // | F15        |
-  // | F14        |    f_arg1
+  // | F14        |    f_arg3
   // | F13        |
-  // | F12        |    f_arg0
+  // | F12        |    f_arg2
+  // | F11        |
+  // | F10        |    f_arg1
+  // | F9         |
+  // | F8         |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
@@ -148,14 +158,14 @@
   static constexpr bool kQuickSoftFloatAbi = false;
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr bool kQuickSkipOddFpRegisters = true;
-  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 4;  // 2 arguments passed in FPRs. Floats can be passed
-                                                 // only in even numbered registers and each double
-                                                 // occupies two registers.
+  static constexpr size_t kNumQuickGprArgs = 5;   // 5 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 12;  // 6 arguments passed in FPRs. Floats can be
+                                                  // passed only in even numbered registers and each
+                                                  // double occupies two registers.
   static constexpr bool kGprFprLockstep = false;
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 32;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 76;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 8;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 56;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 108;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
@@ -187,7 +197,7 @@
   // | F12        |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
-  // NOTE: for Mip64, when A0 is skipped, F0 is also skipped.
+  // NOTE: for Mip64, when A0 is skipped, F12 is also skipped.
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;
@@ -197,7 +207,7 @@
   static constexpr size_t kNumQuickFprArgs = 7;  // 7 arguments passed in FPRs.
   static constexpr bool kGprFprLockstep = true;
 
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F1).
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F13).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg (A1).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 200;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
@@ -501,10 +511,16 @@
         case Primitive::kPrimDouble:
         case Primitive::kPrimLong:
           if (kQuickSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
-            if (cur_type_ == Primitive::kPrimLong && kAlignPairRegister && gpr_index_ == 0) {
-              // Currently, this is only for ARM and MIPS, where the first available parameter
-              // register is R1 (on ARM) or A1 (on MIPS). So we skip it, and use R2 (on ARM) or
-              // A2 (on MIPS) instead.
+            if (cur_type_ == Primitive::kPrimLong &&
+#if defined(__mips__) && !defined(__LP64__)
+                (gpr_index_ == 0 || gpr_index_ == 2) &&
+#else
+                gpr_index_ == 0 &&
+#endif
+                kAlignPairRegister) {
+              // Currently, this is only for ARM and MIPS, where we align long parameters with
+              // even-numbered registers by skipping R1 (on ARM) or A1(A3) (on MIPS) and using
+              // R2 (on ARM) or A2(T0) (on MIPS) instead.
               IncGprIndex();
             }
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
@@ -2086,6 +2102,41 @@
     // Note that the native code pointer will be automatically set by artFindNativeMethod().
   }
 
+#if defined(__mips__) && !defined(__LP64__)
+  // On MIPS32 if the first two arguments are floating-point, we need to know their types
+  // so that art_quick_generic_jni_trampoline can correctly extract them from the stack
+  // and load into floating-point registers.
+  // Possible arrangements of first two floating-point arguments on the stack (32-bit FPU
+  // view):
+  // (1)
+  //  |     DOUBLE    |     DOUBLE    | other args, if any
+  //  |  F12  |  F13  |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (2)
+  //  |     DOUBLE    | FLOAT | (PAD) | other args, if any
+  //  |  F12  |  F13  |  F14  |       |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (3)
+  //  | FLOAT | (PAD) |     DOUBLE    | other args, if any
+  //  |  F12  |       |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (4)
+  //  | FLOAT | FLOAT | other args, if any
+  //  |  F12  |  F14  |
+  //  |  SP+0 |  SP+4 | SP+8
+  // As you can see, only the last case (4) is special. In all others we can just
+  // load F12/F13 and F14/F15 in the same manner.
+  // Set bit 0 of the native code address to 1 in this case (valid code addresses
+  // are always a multiple of 4 on MIPS32, so we have 2 spare bits available).
+  if (nativeCode != nullptr &&
+      shorty != nullptr &&
+      shorty_len >= 3 &&
+      shorty[1] == 'F' &&
+      shorty[2] == 'F') {
+    nativeCode = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(nativeCode) | 1);
+  }
+#endif
+
   // Return native code addr(lo) and bottom of alloca address(hi).
   return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()),
                                 reinterpret_cast<uintptr_t>(nativeCode));