MIPS32: Improve stack alignment; use sdc1/ldc1 where possible.

- Ensure that SP is a multiple of 16 at all times, and
- Use ldc1/sdc1 to load/store FPU registers from/to 8-byte-aligned
  locations wherever possible.

Use `export ART_MIPS32_CHECK_ALIGNMENT=true` when building Android
to enable the new runtime alignment checks.

Test: Boot & run tests on 32-bit version of QEMU, and CI-20.
Test: test/testrunner/testrunner.py --target --optimizing --32
Test: test-art-host-gtest
Test: test-art-target-gtest

Change-Id: Ia667004573f419fd006098fcfadf5834239cb485
diff --git a/build/art.go b/build/art.go
index 1bcaf51..452b348 100644
--- a/build/art.go
+++ b/build/art.go
@@ -97,6 +97,11 @@
 		asflags = append(asflags, "-DART_ENABLE_ADDRESS_SANITIZER=1")
 	}
 
+	if envTrue(ctx, "ART_MIPS32_CHECK_ALIGNMENT") {
+		// Enable the use of MIPS32 CHECK_ALIGNMENT macro for debugging purposes
+		asflags = append(asflags, "-DART_MIPS32_CHECK_ALIGNMENT")
+	}
+
 	return cflags, asflags
 }
 
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 3ba107a..2f65e8c 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1300,7 +1300,7 @@
   // automatically unspilled when the scratch scope object is destroyed).
   ScratchRegisterScope ensure_scratch(this, TMP, V0, codegen_->GetNumberOfCoreRegisters());
   // If V0 spills onto the stack, SP-relative offsets need to be adjusted.
-  int stack_offset = ensure_scratch.IsSpilled() ? kMipsWordSize : 0;
+  int stack_offset = ensure_scratch.IsSpilled() ? kStackAlignment : 0;
   for (int i = 0; i <= (double_slot ? 1 : 0); i++, stack_offset += kMipsWordSize) {
     __ LoadFromOffset(kLoadWord,
                       Register(ensure_scratch.GetRegister()),
diff --git a/compiler/optimizing/emit_swap_mips_test.cc b/compiler/optimizing/emit_swap_mips_test.cc
index 36e932c..b63914f 100644
--- a/compiler/optimizing/emit_swap_mips_test.cc
+++ b/compiler/optimizing/emit_swap_mips_test.cc
@@ -238,14 +238,14 @@
       DataType::Type::kInt32,
       nullptr);
   const char* expected =
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $v0, 0($sp)\n"
-      "lw $v0, 56($sp)\n"
-      "lw $t8, 52($sp)\n"
-      "sw $v0, 52($sp)\n"
-      "sw $t8, 56($sp)\n"
+      "lw $v0, 68($sp)\n"
+      "lw $t8, 64($sp)\n"
+      "sw $v0, 64($sp)\n"
+      "sw $t8, 68($sp)\n"
       "lw $v0, 0($sp)\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   DriverWrapper(moves_, expected, "TwoStackSlots");
 }
 
@@ -261,18 +261,18 @@
       DataType::Type::kInt64,
       nullptr);
   const char* expected =
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $v0, 0($sp)\n"
-      "lw $v0, 60($sp)\n"
-      "lw $t8, 52($sp)\n"
-      "sw $v0, 52($sp)\n"
-      "sw $t8, 60($sp)\n"
-      "lw $v0, 64($sp)\n"
-      "lw $t8, 56($sp)\n"
-      "sw $v0, 56($sp)\n"
-      "sw $t8, 64($sp)\n"
+      "lw $v0, 72($sp)\n"
+      "lw $t8, 64($sp)\n"
+      "sw $v0, 64($sp)\n"
+      "sw $t8, 72($sp)\n"
+      "lw $v0, 76($sp)\n"
+      "lw $t8, 68($sp)\n"
+      "sw $v0, 68($sp)\n"
+      "sw $t8, 76($sp)\n"
       "lw $v0, 0($sp)\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   DriverWrapper(moves_, expected, "TwoDoubleStackSlots");
 }
 
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index fde55cb..1e82c4b 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -330,10 +330,10 @@
 static constexpr uint8_t expected_asm_kMips_adjust_head[] = {
     0xC0, 0xFF, 0xBD, 0x27, 0x3C, 0x00, 0xBF, 0xAF, 0x38, 0x00, 0xB1, 0xAF,
     0x34, 0x00, 0xB0, 0xAF, 0x28, 0x00, 0xB6, 0xF7, 0x20, 0x00, 0xB4, 0xF7,
-    0x08, 0x00, 0x80, 0x14, 0xFC, 0xFF, 0xBD, 0x27,
+    0x08, 0x00, 0x80, 0x14, 0xF0, 0xFF, 0xBD, 0x27,
     0x00, 0x00, 0xBF, 0xAF, 0x00, 0x00, 0x10, 0x04, 0x02, 0x00, 0x01, 0x3C,
     0x18, 0x00, 0x21, 0x34, 0x21, 0x08, 0x3F, 0x00, 0x00, 0x00, 0xBF, 0x8F,
-    0x09, 0x00, 0x20, 0x00, 0x04, 0x00, 0xBD, 0x27,
+    0x09, 0x00, 0x20, 0x00, 0x10, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_asm_kMips_adjust_tail[] = {
     0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F, 0x34, 0x00, 0xB0, 0x8F,
@@ -342,7 +342,7 @@
 };
 static constexpr uint8_t expected_cfi_kMips_adjust[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
-    0x50, 0x0E, 0x44, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
+    0x50, 0x0E, 0x50, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
     0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: addiu sp, sp, -64
@@ -356,8 +356,8 @@
 // 0x00000010: sdc1 f22, +40(sp)
 // 0x00000014: sdc1 f20, +32(sp)
 // 0x00000018: bnez a0, 0x0000003c ; +36
-// 0x0000001c: addiu sp, sp, -4
-// 0x00000020: .cfi_def_cfa_offset: 68
+// 0x0000001c: addiu sp, sp, -16
+// 0x00000020: .cfi_def_cfa_offset: 80
 // 0x00000020: sw ra, +0(sp)
 // 0x00000024: nal
 // 0x00000028: lui at, 2
@@ -365,7 +365,7 @@
 // 0x00000030: addu at, at, ra
 // 0x00000034: lw ra, +0(sp)
 // 0x00000038: jr at
-// 0x0000003c: addiu sp, sp, 4
+// 0x0000003c: addiu sp, sp, 16
 // 0x00000040: .cfi_def_cfa_offset: 64
 // 0x00000040: nop
 //             ...
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index cbb2c0e..9545ca6 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -1863,20 +1863,20 @@
 }
 
 void MipsAssembler::Push(Register rs) {
-  IncreaseFrameSize(kMipsWordSize);
+  IncreaseFrameSize(kStackAlignment);
   Sw(rs, SP, 0);
 }
 
 void MipsAssembler::Pop(Register rd) {
   Lw(rd, SP, 0);
-  DecreaseFrameSize(kMipsWordSize);
+  DecreaseFrameSize(kStackAlignment);
 }
 
 void MipsAssembler::PopAndReturn(Register rd, Register rt) {
   bool reordering = SetReorder(false);
   Lw(rd, SP, 0);
   Jr(rt);
-  DecreaseFrameSize(kMipsWordSize);  // Single instruction in delay slot.
+  DecreaseFrameSize(kStackAlignment);  // Single instruction in delay slot.
   SetReorder(reordering);
 }
 
@@ -4588,7 +4588,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCondBranch:
       // The comment on case 'Branch::kLongUncondBranch' applies here as well.
@@ -4608,7 +4608,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCall:
       DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
diff --git a/compiler/utils/mips/assembler_mips_test.cc b/compiler/utils/mips/assembler_mips_test.cc
index 9397be4..b027d3a 100644
--- a/compiler/utils/mips/assembler_mips_test.cc
+++ b/compiler/utils/mips/assembler_mips_test.cc
@@ -2803,7 +2803,7 @@
   oss <<
       ".set noreorder\n"
       "addiu $t0, $t1, 0x5678\n"
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $ra, 0($sp)\n"
       "bltzal $zero, .+4\n"
       "lui $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
@@ -2811,11 +2811,11 @@
       "addu $at, $at, $ra\n"
       "lw $ra, 0($sp)\n"
       "jalr $zero, $at\n"
-      "addiu $sp, $sp, 4\n" <<
+      "addiu $sp, $sp, 16\n" <<
       RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
       RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
       "addiu $t0, $t1, 0x5678\n"
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $ra, 0($sp)\n"
       "bltzal $zero, .+4\n"
       "lui $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
@@ -2823,7 +2823,7 @@
       "addu $at, $at, $ra\n"
       "lw $ra, 0($sp)\n"
       "jalr $zero, $at\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   std::string expected = oss.str();
   DriverStr(expected, "LongBranchReorder");
   EXPECT_EQ(__ GetLabelLocation(&patcher_label1), 0 * 4u);
diff --git a/runtime/arch/mips/asm_support_mips.S b/runtime/arch/mips/asm_support_mips.S
index 50095ae..fa51059 100644
--- a/runtime/arch/mips/asm_support_mips.S
+++ b/runtime/arch/mips/asm_support_mips.S
@@ -173,4 +173,30 @@
   .set pop
 .endm
 
+// This utility macro is used to check whether the address contained in
+// a register is suitably aligned. Default usage is to confirm that the
+// address stored in $sp is a multiple of 16. It can be used for other
+// alignments, and for other base address registers, if needed.
+//
+// Enable this macro by running the shell command:
+//
+//    export ART_MIPS32_CHECK_ALIGNMENT=true
+//
+// NOTE: The value of alignment must be a power of 2, and must fit in an
+// unsigned 15-bit integer. The macro won't behave as expected if these
+// conditions aren't met.
+//
+.macro CHECK_ALIGNMENT ba=$sp, tmp=$at, alignment=16
+#ifdef ART_MIPS32_CHECK_ALIGNMENT
+    .set push
+    .set noat
+    .set noreorder
+    andi  \tmp, \ba, \alignment-1
+    beqz  \tmp, .+12    # Skip break instruction if base address register (ba) is aligned
+    nop
+    break
+    .set pop
+#endif
+.endm
+
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_S_
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 2edd63f..bec5238 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -19,7 +19,7 @@
 
 #include "asm_support.h"
 
-#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVES 96
+#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVES 112
 #define FRAME_SIZE_SAVE_REFS_ONLY 48
 #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 256
diff --git a/runtime/arch/mips/context_mips.cc b/runtime/arch/mips/context_mips.cc
index ca1de0a..3f362de 100644
--- a/runtime/arch/mips/context_mips.cc
+++ b/runtime/arch/mips/context_mips.cc
@@ -42,7 +42,16 @@
 
   // Core registers come first, from the highest down to the lowest.
   for (uint32_t core_reg : HighToLowBits(frame_info.CoreSpillMask())) {
-    gprs_[core_reg] = CalleeSaveAddress(frame, spill_pos, frame_info.FrameSizeInBytes());
+    // If the $ZERO register shows up in the list of registers to
+    // be saved, it is there only to align the floating point
+    // register save locations to addresses which are multiples
+    // of 8. We only store the address of a register in gprs_ if
+    // the register is not the $ZERO register. The $ZERO register
+    // is read-only, so there's never a reason to save it on the
+    // stack.
+    if (core_reg != 0u) {
+      gprs_[core_reg] = CalleeSaveAddress(frame, spill_pos, frame_info.FrameSizeInBytes());
+    }
     ++spill_pos;
   }
   DCHECK_EQ(spill_pos, POPCOUNT(frame_info.CoreSpillMask()));
@@ -97,7 +106,9 @@
 
 void MipsContext::DoLongJump() {
   uintptr_t gprs[kNumberOfCoreRegisters];
-  uint32_t fprs[kNumberOfFRegisters];
+  // Align fprs[] so that art_quick_do_long_jump() can load FPU
+  // registers from it using the ldc1 instruction.
+  uint32_t fprs[kNumberOfFRegisters] __attribute__((aligned(8)));
   for (size_t i = 0; i < kNumberOfCoreRegisters; ++i) {
     gprs[i] = gprs_[i] != nullptr ? *gprs_[i] : MipsContext::kBadGprBase + i;
   }
diff --git a/runtime/arch/mips/jni_entrypoints_mips.S b/runtime/arch/mips/jni_entrypoints_mips.S
index 5c95071..2c0e750 100644
--- a/runtime/arch/mips/jni_entrypoints_mips.S
+++ b/runtime/arch/mips/jni_entrypoints_mips.S
@@ -28,8 +28,9 @@
     .cfi_adjust_cfa_offset 48
     sw    $ra, 32($sp)
     .cfi_rel_offset 31, 32
-    SDu   $f14, $f15, 24, $sp, $t0
-    SDu   $f12, $f13, 16, $sp, $t0
+    CHECK_ALIGNMENT $sp, $t0
+    sdc1  $f14, 24($sp)
+    sdc1  $f12, 16($sp)
     sw    $a3, 12($sp)
     .cfi_rel_offset 7, 12
     sw    $a2, 8($sp)
@@ -45,8 +46,9 @@
     lw    $a1, 4($sp)
     lw    $a2, 8($sp)
     lw    $a3, 12($sp)
-    LDu   $f12, $f13, 16, $sp, $t0
-    LDu   $f14, $f15, 24, $sp, $t0
+    CHECK_ALIGNMENT $sp, $t0
+    ldc1  $f12, 16($sp)
+    ldc1  $f14, 24($sp)
     lw    $ra, 32($sp)
     beq   $v0, $zero, .Lno_native_code_found
     addiu $sp, $sp, 48          # restore the stack
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index f6204bd..ee3f17d 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -37,45 +37,49 @@
      * Reserves FRAME_SIZE_SAVE_ALL_CALLEE_SAVES + ARG_SLOT_SIZE bytes on the stack
      */
 .macro SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
-    addiu  $sp, $sp, -96
-    .cfi_adjust_cfa_offset 96
+    addiu  $sp, $sp, -112
+    .cfi_adjust_cfa_offset 112
 
      // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 96)
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 112)
 #error "FRAME_SIZE_SAVE_ALL_CALLEE_SAVES(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 92($sp)
-    .cfi_rel_offset 31, 92
-    sw     $s8, 88($sp)
-    .cfi_rel_offset 30, 88
-    sw     $gp, 84($sp)
-    .cfi_rel_offset 28, 84
-    sw     $s7, 80($sp)
-    .cfi_rel_offset 23, 80
-    sw     $s6, 76($sp)
-    .cfi_rel_offset 22, 76
-    sw     $s5, 72($sp)
-    .cfi_rel_offset 21, 72
-    sw     $s4, 68($sp)
-    .cfi_rel_offset 20, 68
-    sw     $s3, 64($sp)
-    .cfi_rel_offset 19, 64
-    sw     $s2, 60($sp)
-    .cfi_rel_offset 18, 60
-    sw     $s1, 56($sp)
-    .cfi_rel_offset 17, 56
-    sw     $s0, 52($sp)
-    .cfi_rel_offset 16, 52
+    sw     $ra, 108($sp)
+    .cfi_rel_offset 31, 108
+    sw     $s8, 104($sp)
+    .cfi_rel_offset 30, 104
+    sw     $gp, 100($sp)
+    .cfi_rel_offset 28, 100
+    sw     $s7, 96($sp)
+    .cfi_rel_offset 23, 96
+    sw     $s6, 92($sp)
+    .cfi_rel_offset 22, 92
+    sw     $s5, 88($sp)
+    .cfi_rel_offset 21, 88
+    sw     $s4, 84($sp)
+    .cfi_rel_offset 20, 84
+    sw     $s3, 80($sp)
+    .cfi_rel_offset 19, 80
+    sw     $s2, 76($sp)
+    .cfi_rel_offset 18, 76
+    sw     $s1, 72($sp)
+    .cfi_rel_offset 17, 72
+    sw     $s0, 68($sp)
+    .cfi_rel_offset 16, 68
+    // 4-byte placeholder for register $zero, serving for alignment
+    // of the following double precision floating point registers.
 
-    SDu $f30, $f31, 44, $sp, $t1
-    SDu $f28, $f29, 36, $sp, $t1
-    SDu $f26, $f27, 28, $sp, $t1
-    SDu $f24, $f25, 20, $sp, $t1
-    SDu $f22, $f23, 12, $sp, $t1
-    SDu $f20, $f21, 4,  $sp, $t1
+    CHECK_ALIGNMENT $sp, $t1
+    sdc1   $f30, 56($sp)
+    sdc1   $f28, 48($sp)
+    sdc1   $f26, 40($sp)
+    sdc1   $f24, 32($sp)
+    sdc1   $f22, 24($sp)
+    sdc1   $f20, 16($sp)
 
-    # 1 word for holding Method*
+    # 1 word for holding Method* plus 12 bytes padding to keep contents of SP
+    # a multiple of 16.
 
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
@@ -216,12 +220,13 @@
     .cfi_rel_offset 6, 60
     sw      $a1, 56($sp)
     .cfi_rel_offset 5, 56
-    SDu     $f18, $f19, 48, $sp, $t8
-    SDu     $f16, $f17, 40, $sp, $t8
-    SDu     $f14, $f15, 32, $sp, $t8
-    SDu     $f12, $f13, 24, $sp, $t8
-    SDu     $f10, $f11, 16, $sp, $t8
-    SDu     $f8, $f9, 8, $sp, $t8
+    CHECK_ALIGNMENT $sp, $t8
+    sdc1    $f18, 48($sp)
+    sdc1    $f16, 40($sp)
+    sdc1    $f14, 32($sp)
+    sdc1    $f12, 24($sp)
+    sdc1    $f10, 16($sp)
+    sdc1    $f8,   8($sp)
     # bottom will hold Method*
 .endm
 
@@ -320,12 +325,13 @@
     lw      $a2, 60($sp)
     .cfi_restore 6
     RESTORE_SAVE_REFS_AND_ARGS_FRAME_A1
-    LDu     $f18, $f19, 48, $sp, $t8
-    LDu     $f16, $f17, 40, $sp, $t8
-    LDu     $f14, $f15, 32, $sp, $t8
-    LDu     $f12, $f13, 24, $sp, $t8
-    LDu     $f10, $f11, 16, $sp, $t8
-    LDu     $f8, $f9, 8, $sp, $t8
+    CHECK_ALIGNMENT $sp, $t8
+    ldc1    $f18, 48($sp)
+    ldc1    $f16, 40($sp)
+    ldc1    $f14, 32($sp)
+    ldc1    $f12, 24($sp)
+    ldc1    $f10, 16($sp)
+    ldc1    $f8,   8($sp)
     addiu   $sp, $sp, 112                           # Pop frame.
     .cfi_adjust_cfa_offset -112
 .endm
@@ -412,22 +418,23 @@
 1:
     .cpload $ra
 
-    SDu $f30, $f31, 136, $sp, $t1
-    SDu $f28, $f29, 128, $sp, $t1
-    SDu $f26, $f27, 120, $sp, $t1
-    SDu $f24, $f25, 112, $sp, $t1
-    SDu $f22, $f23, 104, $sp, $t1
-    SDu $f20, $f21, 96,  $sp, $t1
-    SDu $f18, $f19, 88,  $sp, $t1
-    SDu $f16, $f17, 80,  $sp, $t1
-    SDu $f14, $f15, 72,  $sp, $t1
-    SDu $f12, $f13, 64,  $sp, $t1
-    SDu $f10, $f11, 56,  $sp, $t1
-    SDu $f8, $f9, 48,  $sp, $t1
-    SDu $f6, $f7, 40,  $sp, $t1
-    SDu $f4, $f5, 32,  $sp, $t1
-    SDu $f2, $f3, 24,  $sp, $t1
-    SDu $f0, $f1, 16,  $sp, $t1
+    CHECK_ALIGNMENT $sp, $t1
+    sdc1   $f30, 136($sp)
+    sdc1   $f28, 128($sp)
+    sdc1   $f26, 120($sp)
+    sdc1   $f24, 112($sp)
+    sdc1   $f22, 104($sp)
+    sdc1   $f20,  96($sp)
+    sdc1   $f18,  88($sp)
+    sdc1   $f16,  80($sp)
+    sdc1   $f14,  72($sp)
+    sdc1   $f12,  64($sp)
+    sdc1   $f10,  56($sp)
+    sdc1   $f8,   48($sp)
+    sdc1   $f6,   40($sp)
+    sdc1   $f4,   32($sp)
+    sdc1   $f2,   24($sp)
+    sdc1   $f0,   16($sp)
 
     # 3 words padding and 1 word for holding Method*
 
@@ -460,22 +467,23 @@
     addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
     .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
 
-    LDu $f30, $f31, 136, $sp, $t1
-    LDu $f28, $f29, 128, $sp, $t1
-    LDu $f26, $f27, 120, $sp, $t1
-    LDu $f24, $f25, 112, $sp, $t1
-    LDu $f22, $f23, 104, $sp, $t1
-    LDu $f20, $f21, 96,  $sp, $t1
-    LDu $f18, $f19, 88,  $sp, $t1
-    LDu $f16, $f17, 80,  $sp, $t1
-    LDu $f14, $f15, 72,  $sp, $t1
-    LDu $f12, $f13, 64,  $sp, $t1
-    LDu $f10, $f11, 56,  $sp, $t1
-    LDu $f8, $f9, 48,  $sp, $t1
-    LDu $f6, $f7, 40,  $sp, $t1
-    LDu $f4, $f5, 32,  $sp, $t1
-    LDu $f2, $f3, 24,  $sp, $t1
-    LDu $f0, $f1, 16,  $sp, $t1
+    CHECK_ALIGNMENT $sp, $t1
+    ldc1   $f30, 136($sp)
+    ldc1   $f28, 128($sp)
+    ldc1   $f26, 120($sp)
+    ldc1   $f24, 112($sp)
+    ldc1   $f22, 104($sp)
+    ldc1   $f20,  96($sp)
+    ldc1   $f18,  88($sp)
+    ldc1   $f16,  80($sp)
+    ldc1   $f14,  72($sp)
+    ldc1   $f12,  64($sp)
+    ldc1   $f10,  56($sp)
+    ldc1   $f8,   48($sp)
+    ldc1   $f6,   40($sp)
+    ldc1   $f4,   32($sp)
+    ldc1   $f2,   24($sp)
+    ldc1   $f0,   16($sp)
 
     lw     $ra, 252($sp)
     .cfi_restore 31
@@ -665,7 +673,8 @@
     b      .Losr_exit
     sw     $v1, 4($a2)                     # store v0/v1 into result
 .Losr_fp_result:
-    SDu    $f0, $f1, 0, $a2, $t0           # store f0/f1 into result
+    CHECK_ALIGNMENT $a2, $t0, 8
+    sdc1   $f0, 0($a2)                     # store f0/f1 into result
 .Losr_exit:
     lw     $ra, 44($sp)
     .cfi_restore 31
@@ -701,26 +710,28 @@
 END art_quick_osr_stub
 
     /*
-     * On entry $a0 is uint32_t* gprs_ and $a1 is uint32_t* fprs_
+     * On entry $a0 is uint32_t* gprs_ and $a1 is uint32_t* fprs_.
+     * Note that fprs_ is expected to be an address that is a multiple of 8.
      * FIXME: just guessing about the shape of the jmpbuf.  Where will pc be?
      */
 ENTRY art_quick_do_long_jump
-    LDu  $f0,  $f1,   0*8, $a1, $t1
-    LDu  $f2,  $f3,   1*8, $a1, $t1
-    LDu  $f4,  $f5,   2*8, $a1, $t1
-    LDu  $f6,  $f7,   3*8, $a1, $t1
-    LDu  $f8,  $f9,   4*8, $a1, $t1
-    LDu  $f10, $f11,  5*8, $a1, $t1
-    LDu  $f12, $f13,  6*8, $a1, $t1
-    LDu  $f14, $f15,  7*8, $a1, $t1
-    LDu  $f16, $f17,  8*8, $a1, $t1
-    LDu  $f18, $f19,  9*8, $a1, $t1
-    LDu  $f20, $f21, 10*8, $a1, $t1
-    LDu  $f22, $f23, 11*8, $a1, $t1
-    LDu  $f24, $f25, 12*8, $a1, $t1
-    LDu  $f26, $f27, 13*8, $a1, $t1
-    LDu  $f28, $f29, 14*8, $a1, $t1
-    LDu  $f30, $f31, 15*8, $a1, $t1
+    CHECK_ALIGNMENT $a1, $t1, 8
+    ldc1    $f0,   0*8($a1)
+    ldc1    $f2,   1*8($a1)
+    ldc1    $f4,   2*8($a1)
+    ldc1    $f6,   3*8($a1)
+    ldc1    $f8,   4*8($a1)
+    ldc1    $f10,  5*8($a1)
+    ldc1    $f12,  6*8($a1)
+    ldc1    $f14,  7*8($a1)
+    ldc1    $f16,  8*8($a1)
+    ldc1    $f18,  9*8($a1)
+    ldc1    $f20, 10*8($a1)
+    ldc1    $f22, 11*8($a1)
+    ldc1    $f24, 12*8($a1)
+    ldc1    $f26, 13*8($a1)
+    ldc1    $f28, 14*8($a1)
+    ldc1    $f30, 15*8($a1)
 
     .set push
     .set nomacro
@@ -1067,7 +1078,8 @@
     jalr  $zero, $ra
     sw    $v1, 4($t0)           # store the other half of the result
 5:
-    SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
+    CHECK_ALIGNMENT $t0, $t1, 8
+    sdc1  $f0, 0($t0)           # store floating point result
     jalr  $zero, $ra
     nop
 
@@ -1225,7 +1237,8 @@
     jalr  $zero, $ra
     sw    $v1, 4($t0)           # store the other half of the result
 6:
-    SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
+    CHECK_ALIGNMENT $t0, $t1, 8
+    sdc1  $f0, 0($t0)           # store floating point result
     jalr  $zero, $ra
     nop
 
@@ -2252,7 +2265,7 @@
     move    $a0, rSELF             # pass Thread::Current
     move    $a2, $v0               # pass result
     move    $a3, $v1
-    addiu   $sp, $sp, -24          # reserve arg slots
+    addiu   $sp, $sp, -32          # reserve arg slots
     la      $t9, artQuickGenericJniEndTrampoline
     jalr    $t9
     s.d     $f0, 16($sp)           # pass result_f
@@ -3243,7 +3256,8 @@
     lhu   $v0, 16($sp)                        # Move char from JValue result to return value register.
 .Lstore_double_result:
 .Lstore_float_result:
-    LDu   $f0, $f1, 16, $sp, $t0              # Move double/float from JValue result to return value register.
+    CHECK_ALIGNMENT $sp, $t0
+    ldc1  $f0, 16($sp)                        # Move double/float from JValue result to return value register.
     b .Lcleanup_and_return
     nop
 .Lstore_long_result:
diff --git a/runtime/arch/mips/quick_method_frame_info_mips.h b/runtime/arch/mips/quick_method_frame_info_mips.h
index 45a21ab..8c86252 100644
--- a/runtime/arch/mips/quick_method_frame_info_mips.h
+++ b/runtime/arch/mips/quick_method_frame_info_mips.h
@@ -35,8 +35,24 @@
 static constexpr uint32_t kMipsCalleeSaveArgSpills =
     (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3) | (1 << art::mips::T0) |
     (1 << art::mips::T1);
+// We want to save all floating point register pairs at addresses
+// which are multiples of 8 so that we can eliminate use of the
+// SDu/LDu macros by using sdc1/ldc1 to store/load floating point
+// register values using a single instruction. Because integer
+// registers are stored at the top of the frame, the number of
+// integer registers saved must be even for the floating point
+// register pairs to be aligned on multiples of 8. Previously, the
+// only case in which we saved floating point registers beneath an
+// odd number of integer registers was when "type" is
+// CalleeSaveType::kSaveAllCalleeSaves. (There are other cases in
+// which an odd number of integer registers are saved, but those
+// cases don't save any floating point registers. If no floating
+// point registers are saved we don't care if the number of integer
+// registers saved is odd or even.) To save an even number of
+// integer registers in this particular case we add the ZERO
+// register to the list of registers which get saved.
 static constexpr uint32_t kMipsCalleeSaveAllSpills =
-    (1 << art::mips::S0) | (1 << art::mips::S1);
+    (1 << art::mips::ZERO) | (1 << art::mips::S0) | (1 << art::mips::S1);
 static constexpr uint32_t kMipsCalleeSaveEverythingSpills =
     (1 << art::mips::AT) | (1 << art::mips::V0) | (1 << art::mips::V1) |
     (1 << art::mips::A0) | (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3) |