ARM: 7784/1: mm: ensure SMP alternates assemble to exactly 4 bytes with Thumb-2

Commit ae8a8b9553bd ("ARM: 7691/1: mm: kill unused TLB_CAN_READ_FROM_L1_CACHE
and use ALT_SMP instead") added early function returns for page table
cache flushing operations on ARMv7 SMP CPUs.

Unfortunately, when targeting Thumb-2, these `mov pc, lr' sequences
assemble to 2 bytes which can lead to corruption of the instruction
stream after code patching.

This patch fixes the alternates to use wide (32-bit) instructions for
Thumb-2, therefore ensuring that the patching code works correctly.

Cc: <stable@vger.kernel.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 5c6d5a3..73398bc 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -75,13 +75,14 @@
 ENDPROC(cpu_v7_do_idle)
 
 ENTRY(cpu_v7_dcache_clean_area)
-	ALT_SMP(mov	pc, lr)			@ MP extensions imply L1 PTW
-	ALT_UP(W(nop))
-	dcache_line_size r2, r3
-1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	ALT_SMP(W(nop))			@ MP extensions imply L1 PTW
+	ALT_UP_B(1f)			@ UP alternate: presumably a branch to 1f (macro not shown here)
+	mov	pc, lr			@ SMP: fall through the wide nop and return early
+1:	dcache_line_size r2, r3		@ r2 is the step applied to r0 and r1 in the loop below
+2:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
 	add	r0, r0, r2
 	subs	r1, r1, r2
-	bhi	1b
+	bhi	2b			@ loop while r1 was above r2 (unsigned compare)
 	dsb
 	mov	pc, lr
 ENDPROC(cpu_v7_dcache_clean_area)