Blame - arch/arm/crypto/sha512-armv4.pl - SHIFTPHONES/mainline/linux

blob: 2fc3516912fa59e948c99f2bd7e6c84672d6506e [file] [log] [blame]

Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	1	#!/usr/bin/env perl
Adam Langley	c2e415f	2018-05-22 12:35:11 -0700	[diff] [blame]	2	# SPDX-License-Identifier: GPL-2.0
				3
				4	# This code is taken from the OpenSSL project but the author (Andy Polyakov)
				5	# has relicensed it under the GPLv2. Therefore this program is free software;
				6	# you can redistribute it and/or modify it under the terms of the GNU General
				7	# Public License version 2 as published by the Free Software Foundation.
				8	#
				9	# The original headers, including the original license headers, are
				10	# included below for completeness.
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	11
				12	# ====================================================================
				13	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				14	# project. The module is, however, dual licensed under OpenSSL and
				15	# CRYPTOGAMS licenses depending on where you obtain it. For further
Alexander A. Klimov	9332a9e	2020-07-19 18:49:59 +0200	[diff] [blame]	16	# details see https://www.openssl.org/~appro/cryptogams/.
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	17	# ====================================================================
				18
				19	# SHA512 block procedure for ARMv4. September 2007.
				20
				21	# This code is ~4.5 (four and a half) times faster than code generated
				22	# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
				23	# Xscale PXA250 core].
				24	#
				25	# July 2010.
				26	#
				27	# Rescheduling for dual-issue pipeline resulted in 6% improvement on
				28	# Cortex A8 core and ~40 cycles per processed byte.
				29
				30	# February 2011.
				31	#
				32	# Profiler-assisted and platform-specific optimization resulted in 7%
				33	# improvement on Cortex A8 core and ~38 cycles per byte.
				34
				35	# March 2011.
				36	#
				37	# Add NEON implementation. On Cortex A8 it was measured to process
				38	# one byte in 23.3 cycles or ~60% faster than integer-only code.
				39
				40	# August 2012.
				41	#
				42	# Improve NEON performance by 12% on Snapdragon S4. In absolute
				43	# terms it's 22.6 cycles per byte, which is a disappointing result.
				44	# Technical writers asserted that 3-way S4 pipeline can sustain
				45	# multiple NEON instructions per cycle, but dual NEON issue could
Alexander A. Klimov	9332a9e	2020-07-19 18:49:59 +0200	[diff] [blame]	46	# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	47	# for further details. On a side note Cortex-A15 processes one byte in
				48	# 16 cycles.
				49
				50	# Byte order [in]dependence. =========================================
				51	#
				52	# Originally caller was expected to maintain specific dword order in
				53	# h[0-7], namely with most significant dword at lower address, which
				54	# was reflected in below two parameters as 0 and 4. Now caller is
				55	# expected to maintain native byte order for whole 64-bit values.
					# $hi/$lo are the symbolic names "HI"/"LO"; they are interpolated into
					# the generated assembly, where the emitted LO/HI #defines give them
					# the values 0 or 4 depending on __ARMEL__.
				56	$hi="HI";
				57	$lo="LO";
				58	# ====================================================================
				59
					# Consume command-line arguments until one looks like a file name
					# (word characters/dashes, a dot, an extension) or the list is
					# exhausted; that argument is kept in $output.
				60	while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
					# Redirect STDOUT to $output (two-argument open, result not checked).
				61	open STDOUT,">$output";
				62
					# Register names used by the integer-only code path.
				63	$ctx="r0"; # parameter block
				64	$inp="r1";
				65	$len="r2";
				66
				67	$Tlo="r3";
				68	$Thi="r4";
				69	$Alo="r5";
				70	$Ahi="r6";
				71	$Elo="r7";
				72	$Ehi="r8";
				73	$t0="r9";
				74	$t1="r10";
				75	$t2="r11";
				76	$t3="r12";
				77	############ r13 is stack pointer
				78	$Ktbl="r14";
				79	############ r15 is program counter
				80
					# Byte offsets of the eight 64-bit working variables a..h; they are
					# used relative to both $ctx and sp. $Xoff is the offset of the X[]
					# area that follows them and is used relative to sp.
				81	$Aoff=8*0;
				82	$Boff=8*1;
				83	$Coff=8*2;
				84	$Doff=8*3;
				85	$Eoff=8*4;
				86	$Foff=8*5;
				87	$Goff=8*6;
				88	$Hoff=8*7;
				89	$Xoff=8*8;
				90
					# BODY_00_15($magic): append the integer-only code for one SHA-512
					# round to $code.
					#
					# The emitted sequence stores T at X[0] on the stack, accumulates
					# Sigma1(e), h, Ch(e,f,g) and K[i] into T, adds T to d, then adds
					# Sigma0(a) and Maj(a,b,c). It finishes by lowering sp by 8 and
					# advancing $Ktbl by 8.
					#
					# $magic is compared with the low byte of K[i].lo; on a match bit 0 of
					# $Ktbl is set, and the trailing "tst $Ktbl,#1" leaves the flags for the
					# caller's loop branch. The callers pass 0x94 and 0x17, the low bytes
					# of the 16th and the last K512 entries.
					#
					# The sub is declared with an empty prototype but is invoked as
					# &BODY_00_15(...), so the argument is still delivered in @_.
				91	sub BODY_00_15() {
					# Low byte of the K[i].lo value that marks the last round of the loop.
				92	my $magic = shift;
				93	$code.=<<___;
				94	@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
				95	@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
				96	@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
				97	mov $t0,$Elo,lsr#14
				98	str $Tlo,[sp,#$Xoff+0]
				99	mov $t1,$Ehi,lsr#14
				100	str $Thi,[sp,#$Xoff+4]
				101	eor $t0,$t0,$Ehi,lsl#18
				102	ldr $t2,[sp,#$Hoff+0] @ h.lo
				103	eor $t1,$t1,$Elo,lsl#18
				104	ldr $t3,[sp,#$Hoff+4] @ h.hi
				105	eor $t0,$t0,$Elo,lsr#18
				106	eor $t1,$t1,$Ehi,lsr#18
				107	eor $t0,$t0,$Ehi,lsl#14
				108	eor $t1,$t1,$Elo,lsl#14
				109	eor $t0,$t0,$Ehi,lsr#9
				110	eor $t1,$t1,$Elo,lsr#9
				111	eor $t0,$t0,$Elo,lsl#23
				112	eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
				113	adds $Tlo,$Tlo,$t0
				114	ldr $t0,[sp,#$Foff+0] @ f.lo
				115	adc $Thi,$Thi,$t1 @ T += Sigma1(e)
				116	ldr $t1,[sp,#$Foff+4] @ f.hi
				117	adds $Tlo,$Tlo,$t2
				118	ldr $t2,[sp,#$Goff+0] @ g.lo
				119	adc $Thi,$Thi,$t3 @ T += h
				120	ldr $t3,[sp,#$Goff+4] @ g.hi
				121
				122	eor $t0,$t0,$t2
				123	str $Elo,[sp,#$Eoff+0]
				124	eor $t1,$t1,$t3
				125	str $Ehi,[sp,#$Eoff+4]
				126	and $t0,$t0,$Elo
				127	str $Alo,[sp,#$Aoff+0]
				128	and $t1,$t1,$Ehi
				129	str $Ahi,[sp,#$Aoff+4]
				130	eor $t0,$t0,$t2
				131	ldr $t2,[$Ktbl,#$lo] @ K[i].lo
				132	eor $t1,$t1,$t3 @ Ch(e,f,g)
				133	ldr $t3,[$Ktbl,#$hi] @ K[i].hi
				134
				135	adds $Tlo,$Tlo,$t0
				136	ldr $Elo,[sp,#$Doff+0] @ d.lo
				137	adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
				138	ldr $Ehi,[sp,#$Doff+4] @ d.hi
				139	adds $Tlo,$Tlo,$t2
				140	and $t0,$t2,#0xff
				141	adc $Thi,$Thi,$t3 @ T += K[i]
				142	adds $Elo,$Elo,$Tlo
				143	ldr $t2,[sp,#$Boff+0] @ b.lo
				144	adc $Ehi,$Ehi,$Thi @ d += T
				145	teq $t0,#$magic
				146
				147	ldr $t3,[sp,#$Coff+0] @ c.lo
				148	#if __ARM_ARCH__>=7
				149	it eq @ Thumb2 thing, sanity check in ARM
				150	#endif
				151	orreq $Ktbl,$Ktbl,#1
				152	@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
				153	@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
				154	@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
				155	mov $t0,$Alo,lsr#28
				156	mov $t1,$Ahi,lsr#28
				157	eor $t0,$t0,$Ahi,lsl#4
				158	eor $t1,$t1,$Alo,lsl#4
				159	eor $t0,$t0,$Ahi,lsr#2
				160	eor $t1,$t1,$Alo,lsr#2
				161	eor $t0,$t0,$Alo,lsl#30
				162	eor $t1,$t1,$Ahi,lsl#30
				163	eor $t0,$t0,$Ahi,lsr#7
				164	eor $t1,$t1,$Alo,lsr#7
				165	eor $t0,$t0,$Alo,lsl#25
				166	eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
				167	adds $Tlo,$Tlo,$t0
				168	and $t0,$Alo,$t2
				169	adc $Thi,$Thi,$t1 @ T += Sigma0(a)
				170
				171	ldr $t1,[sp,#$Boff+4] @ b.hi
				172	orr $Alo,$Alo,$t2
				173	ldr $t2,[sp,#$Coff+4] @ c.hi
				174	and $Alo,$Alo,$t3
				175	and $t3,$Ahi,$t1
				176	orr $Ahi,$Ahi,$t1
				177	orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
				178	and $Ahi,$Ahi,$t2
				179	adds $Alo,$Alo,$Tlo
				180	orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
				181	sub sp,sp,#8
				182	adc $Ahi,$Ahi,$Thi @ h += T
				183	tst $Ktbl,#1
				184	add $Ktbl,$Ktbl,#8
				185	___
				186	}
				187	$code=<<___;
				188	#ifndef __KERNEL__
				189	# include "arm_arch.h"
				190	# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
				191	# define VFP_ABI_POP vldmia sp!,{d8-d15}
				192	#else
				193	# define __ARM_ARCH__ __LINUX_ARM_ARCH__
				194	# define __ARM_MAX_ARCH__ 7
				195	# define VFP_ABI_PUSH
				196	# define VFP_ABI_POP
				197	#endif
				198
				199	#ifdef __ARMEL__
				200	# define LO 0
				201	# define HI 4
				202	# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
				203	#else
				204	# define HI 0
				205	# define LO 4
				206	# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
				207	#endif
				208
				209	.text
				210	#if __ARM_ARCH__<7
				211	.code 32
				212	#else
				213	.syntax unified
				214	# ifdef __thumb2__
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	215	.thumb
				216	# else
				217	.code 32
				218	# endif
				219	#endif
				220
				221	.type K512,%object
				222	.align 5
				223	K512:
				224	WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
				225	WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
				226	WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
				227	WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
				228	WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
				229	WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
				230	WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
				231	WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
				232	WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
				233	WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
				234	WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
				235	WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
				236	WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
				237	WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
				238	WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
				239	WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
				240	WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
				241	WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
				242	WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
				243	WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
				244	WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
				245	WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
				246	WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
				247	WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
				248	WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
				249	WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
				250	WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
				251	WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
				252	WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
				253	WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
				254	WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
				255	WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
				256	WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
				257	WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
				258	WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
				259	WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
				260	WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
				261	WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
				262	WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
				263	WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
				264	.size K512,.-K512
				265	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
				266	.LOPENSSL_armcap:
				267	.word OPENSSL_armcap_P-sha512_block_data_order
				268	.skip 32-4
				269	#else
				270	.skip 32
				271	#endif
				272
				273	.global sha512_block_data_order
				274	.type sha512_block_data_order,%function
				275	sha512_block_data_order:
Ard Biesheuvel	c643165	2019-02-16 14:51:26 +0100	[diff] [blame]	276	.Lsha512_block_data_order:
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	277	#if __ARM_ARCH__<7
				278	sub r3,pc,#8 @ sha512_block_data_order
				279	#else
Ard Biesheuvel	c643165	2019-02-16 14:51:26 +0100	[diff] [blame]	280	adr r3,.Lsha512_block_data_order
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	281	#endif
				282	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
				283	ldr r12,.LOPENSSL_armcap
				284	ldr r12,[r3,r12] @ OPENSSL_armcap_P
				285	tst r12,#1
				286	bne .LNEON
				287	#endif
				288	add $len,$inp,$len,lsl#7 @ len to point at the end of inp
				289	stmdb sp!,{r4-r12,lr}
				290	sub $Ktbl,r3,#672 @ K512
				291	sub sp,sp,#9*8
				292
				293	ldr $Elo,[$ctx,#$Eoff+$lo]
				294	ldr $Ehi,[$ctx,#$Eoff+$hi]
				295	ldr $t0, [$ctx,#$Goff+$lo]
				296	ldr $t1, [$ctx,#$Goff+$hi]
				297	ldr $t2, [$ctx,#$Hoff+$lo]
				298	ldr $t3, [$ctx,#$Hoff+$hi]
				299	.Loop:
				300	str $t0, [sp,#$Goff+0]
				301	str $t1, [sp,#$Goff+4]
				302	str $t2, [sp,#$Hoff+0]
				303	str $t3, [sp,#$Hoff+4]
				304	ldr $Alo,[$ctx,#$Aoff+$lo]
				305	ldr $Ahi,[$ctx,#$Aoff+$hi]
				306	ldr $Tlo,[$ctx,#$Boff+$lo]
				307	ldr $Thi,[$ctx,#$Boff+$hi]
				308	ldr $t0, [$ctx,#$Coff+$lo]
				309	ldr $t1, [$ctx,#$Coff+$hi]
				310	ldr $t2, [$ctx,#$Doff+$lo]
				311	ldr $t3, [$ctx,#$Doff+$hi]
				312	str $Tlo,[sp,#$Boff+0]
				313	str $Thi,[sp,#$Boff+4]
				314	str $t0, [sp,#$Coff+0]
				315	str $t1, [sp,#$Coff+4]
				316	str $t2, [sp,#$Doff+0]
				317	str $t3, [sp,#$Doff+4]
				318	ldr $Tlo,[$ctx,#$Foff+$lo]
				319	ldr $Thi,[$ctx,#$Foff+$hi]
				320	str $Tlo,[sp,#$Foff+0]
				321	str $Thi,[sp,#$Foff+4]
				322
				323	.L00_15:
				324	#if __ARM_ARCH__<7
				325	ldrb $Tlo,[$inp,#7]
				326	ldrb $t0, [$inp,#6]
				327	ldrb $t1, [$inp,#5]
				328	ldrb $t2, [$inp,#4]
				329	ldrb $Thi,[$inp,#3]
				330	ldrb $t3, [$inp,#2]
				331	orr $Tlo,$Tlo,$t0,lsl#8
				332	ldrb $t0, [$inp,#1]
				333	orr $Tlo,$Tlo,$t1,lsl#16
				334	ldrb $t1, [$inp],#8
				335	orr $Tlo,$Tlo,$t2,lsl#24
				336	orr $Thi,$Thi,$t3,lsl#8
				337	orr $Thi,$Thi,$t0,lsl#16
				338	orr $Thi,$Thi,$t1,lsl#24
				339	#else
				340	ldr $Tlo,[$inp,#4]
				341	ldr $Thi,[$inp],#8
				342	#ifdef __ARMEL__
				343	rev $Tlo,$Tlo
				344	rev $Thi,$Thi
				345	#endif
				346	#endif
				347	___
				348	&BODY_00_15(0x94);
				349	$code.=<<___;
				350	tst $Ktbl,#1
				351	beq .L00_15
				352	ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
				353	ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
				354	bic $Ktbl,$Ktbl,#1
				355	.L16_79:
				356	@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
				357	@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
				358	@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
				359	mov $Tlo,$t0,lsr#1
				360	ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
				361	mov $Thi,$t1,lsr#1
				362	ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
				363	eor $Tlo,$Tlo,$t1,lsl#31
				364	eor $Thi,$Thi,$t0,lsl#31
				365	eor $Tlo,$Tlo,$t0,lsr#8
				366	eor $Thi,$Thi,$t1,lsr#8
				367	eor $Tlo,$Tlo,$t1,lsl#24
				368	eor $Thi,$Thi,$t0,lsl#24
				369	eor $Tlo,$Tlo,$t0,lsr#7
				370	eor $Thi,$Thi,$t1,lsr#7
				371	eor $Tlo,$Tlo,$t1,lsl#25
				372
				373	@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
				374	@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
				375	@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
				376	mov $t0,$t2,lsr#19
				377	mov $t1,$t3,lsr#19
				378	eor $t0,$t0,$t3,lsl#13
				379	eor $t1,$t1,$t2,lsl#13
				380	eor $t0,$t0,$t3,lsr#29
				381	eor $t1,$t1,$t2,lsr#29
				382	eor $t0,$t0,$t2,lsl#3
				383	eor $t1,$t1,$t3,lsl#3
				384	eor $t0,$t0,$t2,lsr#6
				385	eor $t1,$t1,$t3,lsr#6
				386	ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
				387	eor $t0,$t0,$t3,lsl#26
				388
				389	ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
				390	adds $Tlo,$Tlo,$t0
				391	ldr $t0,[sp,#`$Xoff+8*16`+0]
				392	adc $Thi,$Thi,$t1
				393
				394	ldr $t1,[sp,#`$Xoff+8*16`+4]
				395	adds $Tlo,$Tlo,$t2
				396	adc $Thi,$Thi,$t3
				397	adds $Tlo,$Tlo,$t0
				398	adc $Thi,$Thi,$t1
				399	___
				400	&BODY_00_15(0x17);
				401	$code.=<<___;
				402	#if __ARM_ARCH__>=7
				403	ittt eq @ Thumb2 thing, sanity check in ARM
				404	#endif
				405	ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
				406	ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
				407	beq .L16_79
				408	bic $Ktbl,$Ktbl,#1
				409
				410	ldr $Tlo,[sp,#$Boff+0]
				411	ldr $Thi,[sp,#$Boff+4]
				412	ldr $t0, [$ctx,#$Aoff+$lo]
				413	ldr $t1, [$ctx,#$Aoff+$hi]
				414	ldr $t2, [$ctx,#$Boff+$lo]
				415	ldr $t3, [$ctx,#$Boff+$hi]
				416	adds $t0,$Alo,$t0
				417	str $t0, [$ctx,#$Aoff+$lo]
				418	adc $t1,$Ahi,$t1
				419	str $t1, [$ctx,#$Aoff+$hi]
				420	adds $t2,$Tlo,$t2
				421	str $t2, [$ctx,#$Boff+$lo]
				422	adc $t3,$Thi,$t3
				423	str $t3, [$ctx,#$Boff+$hi]
				424
				425	ldr $Alo,[sp,#$Coff+0]
				426	ldr $Ahi,[sp,#$Coff+4]
				427	ldr $Tlo,[sp,#$Doff+0]
				428	ldr $Thi,[sp,#$Doff+4]
				429	ldr $t0, [$ctx,#$Coff+$lo]
				430	ldr $t1, [$ctx,#$Coff+$hi]
				431	ldr $t2, [$ctx,#$Doff+$lo]
				432	ldr $t3, [$ctx,#$Doff+$hi]
				433	adds $t0,$Alo,$t0
				434	str $t0, [$ctx,#$Coff+$lo]
				435	adc $t1,$Ahi,$t1
				436	str $t1, [$ctx,#$Coff+$hi]
				437	adds $t2,$Tlo,$t2
				438	str $t2, [$ctx,#$Doff+$lo]
				439	adc $t3,$Thi,$t3
				440	str $t3, [$ctx,#$Doff+$hi]
				441
				442	ldr $Tlo,[sp,#$Foff+0]
				443	ldr $Thi,[sp,#$Foff+4]
				444	ldr $t0, [$ctx,#$Eoff+$lo]
				445	ldr $t1, [$ctx,#$Eoff+$hi]
				446	ldr $t2, [$ctx,#$Foff+$lo]
				447	ldr $t3, [$ctx,#$Foff+$hi]
				448	adds $Elo,$Elo,$t0
				449	str $Elo,[$ctx,#$Eoff+$lo]
				450	adc $Ehi,$Ehi,$t1
				451	str $Ehi,[$ctx,#$Eoff+$hi]
				452	adds $t2,$Tlo,$t2
				453	str $t2, [$ctx,#$Foff+$lo]
				454	adc $t3,$Thi,$t3
				455	str $t3, [$ctx,#$Foff+$hi]
				456
				457	ldr $Alo,[sp,#$Goff+0]
				458	ldr $Ahi,[sp,#$Goff+4]
				459	ldr $Tlo,[sp,#$Hoff+0]
				460	ldr $Thi,[sp,#$Hoff+4]
				461	ldr $t0, [$ctx,#$Goff+$lo]
				462	ldr $t1, [$ctx,#$Goff+$hi]
				463	ldr $t2, [$ctx,#$Hoff+$lo]
				464	ldr $t3, [$ctx,#$Hoff+$hi]
				465	adds $t0,$Alo,$t0
				466	str $t0, [$ctx,#$Goff+$lo]
				467	adc $t1,$Ahi,$t1
				468	str $t1, [$ctx,#$Goff+$hi]
				469	adds $t2,$Tlo,$t2
				470	str $t2, [$ctx,#$Hoff+$lo]
				471	adc $t3,$Thi,$t3
				472	str $t3, [$ctx,#$Hoff+$hi]
				473
				474	add sp,sp,#640
				475	sub $Ktbl,$Ktbl,#640
				476
				477	teq $inp,$len
				478	bne .Loop
				479
				480	add sp,sp,#8*9 @ destroy frame
				481	#if __ARM_ARCH__>=5
				482	ldmia sp!,{r4-r12,pc}
				483	#else
				484	ldmia sp!,{r4-r12,lr}
				485	tst lr,#1
				486	moveq pc,lr @ be binary compatible with V4, yet
				487	bx lr @ interoperable with Thumb ISA:-)
				488	#endif
				489	.size sha512_block_data_order,.-sha512_block_data_order
				490	___
				491
				492	{
				493	my @Sigma0=(28,34,39);
				494	my @Sigma1=(14,18,41);
				495	my @sigma0=(1, 8, 7);
				496	my @sigma1=(19,61,6);
				497
				498	my $Ktbl="r3";
				499	my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
				500
				501	my @X=map("d$_",(0..15));
				502	my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
				503
				504	sub NEON_00_15() {
				505	my $i=shift;
				506	my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
				507	my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
				508
				509	$code.=<<___ if ($i<16 \|\| $i&1);
				510	vshr.u64 $t0,$e,#@Sigma1[0] @ $i
				511	#if $i<16
				512	vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
				513	#endif
				514	vshr.u64 $t1,$e,#@Sigma1[1]
				515	#if $i>0
				516	vadd.i64 $a,$Maj @ h+=Maj from the past
				517	#endif
				518	vshr.u64 $t2,$e,#@Sigma1[2]
				519	___
				520	$code.=<<___;
				521	vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
				522	vsli.64 $t0,$e,#`64-@Sigma1[0]`
				523	vsli.64 $t1,$e,#`64-@Sigma1[1]`
				524	vmov $Ch,$e
				525	vsli.64 $t2,$e,#`64-@Sigma1[2]`
				526	#if $i<16 && defined(__ARMEL__)
				527	vrev64.8 @X[$i],@X[$i]
				528	#endif
				529	veor $t1,$t0
				530	vbsl $Ch,$f,$g @ Ch(e,f,g)
				531	vshr.u64 $t0,$a,#@Sigma0[0]
				532	veor $t2,$t1 @ Sigma1(e)
				533	vadd.i64 $T1,$Ch,$h
				534	vshr.u64 $t1,$a,#@Sigma0[1]
				535	vsli.64 $t0,$a,#`64-@Sigma0[0]`
				536	vadd.i64 $T1,$t2
				537	vshr.u64 $t2,$a,#@Sigma0[2]
				538	vadd.i64 $K,@X[$i%16]
				539	vsli.64 $t1,$a,#`64-@Sigma0[1]`
				540	veor $Maj,$a,$b
				541	vsli.64 $t2,$a,#`64-@Sigma0[2]`
				542	veor $h,$t0,$t1
				543	vadd.i64 $T1,$K
				544	vbsl $Maj,$c,$b @ Maj(a,b,c)
				545	veor $h,$t2 @ Sigma0(a)
				546	vadd.i64 $d,$T1
				547	vadd.i64 $Maj,$T1
				548	@ vadd.i64 $h,$Maj
				549	___
				550	}
				551
					# NEON_16_79($i, $a..$h): append the NEON code for round $i >= 16,
					# including the message-schedule update.
					#
					# Odd rounds are delegated unchanged to NEON_00_15. Even rounds first
					# emit a schedule update on the q-register view of X[] (each q register
					# overlays two consecutive d registers), interleaved with the deferred
					# Maj addition and the three Sigma1(e) shifts that NEON_00_15 skips for
					# even rounds >= 16, and then call NEON_00_15 for the rest of the round.
				552	sub NEON_16_79() {
				553	my $i=shift;
				554
				555	if ($i&1) { &NEON_00_15($i,@_); return; }
				556
				557	# 2x-vectorized, therefore runs every 2nd round
				558	my @X=map("q$_",(0..7)); # view @X as 128-bit vector
				559	my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
				560	my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
				561	my $e=@_[4]; # $e from NEON_00_15
					# From here on $i indexes q registers (pairs of X words), not rounds.
				562	$i /= 2;
				563	$code.=<<___;
				564	vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
				565	vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
				566	vadd.i64 @_[0],d30 @ h+=Maj from the past
				567	vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
				568	vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
				569	vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
				570	vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
				571	veor $s1,$t0
				572	vshr.u64 $t0,$s0,#@sigma0[0]
				573	veor $s1,$t1 @ sigma1(X[i+14])
				574	vshr.u64 $t1,$s0,#@sigma0[1]
				575	vadd.i64 @X[$i%8],$s1
				576	vshr.u64 $s1,$s0,#@sigma0[2]
				577	vsli.64 $t0,$s0,#`64-@sigma0[0]`
				578	vsli.64 $t1,$s0,#`64-@sigma0[1]`
				579	vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
				580	veor $s1,$t0
				581	vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
				582	vadd.i64 @X[$i%8],$s0
				583	vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
				584	veor $s1,$t1 @ sigma0(X[i+1])
				585	vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
				586	vadd.i64 @X[$i%8],$s1
				587	___
					# Restore the round number for the common part of the round.
				588	&NEON_00_15(2*$i,@_);
				589	}
				590
					# Emit the NEON entry point: prologue, K512 address computation and
					# context load, followed by the generated rounds and the epilogue.
				591	$code.=<<___;
				592	#if __ARM_MAX_ARCH__>=7
				593	.arch armv7-a
				594	.fpu neon
				595
				596	.global sha512_block_data_order_neon
				597	.type sha512_block_data_order_neon,%function
				598	.align 4
				599	sha512_block_data_order_neon:
				600	.LNEON:
				601	dmb @ errata #451034 on early Cortex A8
				602	add $len,$inp,$len,lsl#7 @ len to point at the end of inp
				603	VFP_ABI_PUSH
Ard Biesheuvel	0f5e832	2020-09-16 09:14:18 +0300	[diff] [blame]	604	adr $Ktbl,.Lsha512_block_data_order
				605	sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
Ard Biesheuvel	c80ae7c	2015-05-08 10:46:21 +0200	[diff] [blame]	606	vldmia $ctx,{$A-$H} @ load context
				607	.Loop_neon:
				608	___
					# Rounds 0..15 are emitted straight-line. After each round @V is
					# rotated right by one so that the register names passed as a..h shift
					# by one position for the next round.
				609	for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
				610	$code.=<<___;
				611	mov $cnt,#4
				612	.L16_79_neon:
				613	subs $cnt,#1
				614	___
					# Sixteen rounds (i = 16..31) are emitted once; the generated code
					# repeats them via the $cnt counter, which is initialised to 4 above.
				615	for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
				616	$code.=<<___;
				617	bne .L16_79_neon
				618
				619	vadd.i64 $A,d30 @ h+=Maj from the past
				620	vldmia $ctx,{d24-d31} @ load context to temp
				621	vadd.i64 q8,q12 @ vectorized accumulate
				622	vadd.i64 q9,q13
				623	vadd.i64 q10,q14
				624	vadd.i64 q11,q15
				625	vstmia $ctx,{$A-$H} @ save context
				626	teq $inp,$len
				627	sub $Ktbl,#640 @ rewind K512
				628	bne .Loop_neon
				629
				630	VFP_ABI_POP
				631	ret @ bx lr
				632	.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
				633	#endif
				634	___
				635	}
				636	$code.=<<___;
				637	.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
				638	.align 2
				639	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
				640	.comm OPENSSL_armcap_P,4,4
				641	#endif
				642	___
				643
					# Evaluate every back-quoted expression embedded in the generated text
					# and substitute its result.
				644	$code =~ s/\`([^\`]*)\`/eval $1/gem;
					# Order matters: literal "bx lr" is replaced by its encoding first, so
					# the "bx lr" produced from "ret" on the next line stays a mnemonic.
				645	$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
				646	$code =~ s/\bret\b/bx lr/gm;
				647
					# Copy this script's leading comment block to the output: skip the
					# shebang line, rewrite a leading '#' to '@' on each line, and stop at
					# the first line that neither starts with '#' nor is empty.
					# The open is two-argument and its result is not checked.
				648	open SELF,$0;
				649	while(<SELF>) {
				650	next if (/^#!/);
				651	last if (!s/^#/@/ and !/^$/);
				652	print;
				653	}
				654	close SELF;
				655
					# Emit the generated assembly after the copied header.
				656	print $code;
				657	close STDOUT; # enforce flush