#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

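# Register allocation. The eight working variables A..H live in r4-r11
# for the whole compression loop. The scalar temporaries deliberately
# alias the argument registers ($t0/$ctx, $t4/$inp, $t1/$len), which is
# safe because ctx, inp and len are spilled to the stack in the prologue
# and re-loaded from there ("pull ctx"/"pull inp" below); K256 is
# addressed through r14, which the prologue/epilogue also save/restore.
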
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

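# Per FIPS 180-4, the rotation/shift amounts above implement:
#
#   Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#   Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# The round bodies fold the three rotations into two eor instructions:
# x ^ (x ror (r2-r1)) ^ (x ror (r3-r1)), rotated right by r1 at the
# point of use (e.g. "add h,h,t0,ror#6"), equals
# (x ror r1) ^ (x ror r2) ^ (x ror r3).
#
# BODY_00_15 emits one SHA-256 round. Note the one-round deferral of the
# Maj() term: "add $a,$a,$t2 @ h+=Maj(a,b,c) from the past" retires the
# previous round's Maj so it can be computed off the critical path.
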
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

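# BODY_16_XX extends the message schedule for rounds 16..63:
#
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# with X[] kept as a 16-word circular buffer on the stack (hence the
# (i+k)%16 indexing), then falls through to BODY_00_15 for the round
# proper.
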
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

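@ void sha256_block_data_order(u32 *state, const u8 *data, size_t blocks)
@
@ Informal sketch of the calling convention (the C prototype above is
@ illustrative, not quoted from a header): r0 points at the eight 32-bit
@ state words, r1 at the input, and r2 holds the number of 64-byte
@ blocks; the entry code below turns r2 into an end-of-input pointer
@ via "add $len,$inp,$len,lsl#6".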
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
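# Dlo/Dhi map a NEON quad register to its two aliased double registers
# (q1 -> d2/d3), letting the 64-bit halves of the X[] vectors be
# addressed individually.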

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

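# AUTOLOAD catches any undefined sub call and turns it into one line of
# assembly: underscores become dots and a bare numeric last argument gets
# a '#' prefix, so &vshr_u32($T2,$T0,7) emits "vshr.u32 q10,q8,#7".
#
# Xupdate computes sigma0/sigma1 with shift+insert (vshr/vsli) pairs to
# synthesize 32-bit rotates, and interleaves the vector message-schedule
# update with the scalar round instructions supplied by $body (two or
# three eval(shift(@insns)) per vector op) to keep both pipelines busy.
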
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

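# Xpreload covers the last 16 rounds of a block: no schedule update is
# needed, so it just byte-swaps the freshly loaded input words and adds
# the next K256 quadword, again interleaved with the scalar round code.
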
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

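# body_00_15 is the scalar SHA-256 round expressed as a list of code
# strings; Xupdate/Xpreload eval them a few at a time between vector
# instructions. $j counts rounds so the trailing ldr can fetch the next
# X word from the stack, the K256 terminator word ($j==15), or the saved
# ctx pointer at sp+64 ($j==31).
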
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,.Lsha256_block_data_order
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

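# The ARMv8 path drives the SHA-256 crypto extensions directly:
# sha256su0/sha256su1 extend the message schedule and each
# sha256h/sha256h2 pair retires four rounds on the (ABCD, EFGH) halves
# of the state. The instructions are emitted through the INST() byte
# macro below, since pre-ARMv8 assemblers do not know these mnemonics.
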
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

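# Copy this script's own leading comment block (license and history)
# into the generated file, converting '#' comments to the assembler's
# '@' comment syntax.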
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

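# Hand-assemble the sha256* mnemonics into .byte sequences (via the
# INST() macro) for assemblers that predate the ARMv8 extensions; the
# base opcode is OR-ed with the split d/n/m register-number fields. As
# an illustration derived from the bit-packing below (not quoted from an
# architecture manual), "sha256h q0,q1,q12" should come out as
# 0xf3020c68, i.e. INST(0x68,0x0c,0x02,0xf3).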
{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

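# Post-process the accumulated code: evaluate `...` arithmetic embedded
# in operand fields, byte-encode the sha256* mnemonics, and translate
# "ret"/"bx lr" so the output still assembles with -march=armv4.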
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush