#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# Select the output file from the command line: arguments are consumed
# until one is empty/false or looks like "name.ext" (word characters and
# dashes, a dot, then an extension).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found.  Without one
# the generated assembly goes to the inherited STDOUT (e.g. a shell
# redirection) instead of attempting to open an empty file name.
if ($output) {
    # Three-argument open: the name can never be mistaken for a mode or a
    # pipe, and a failure is reported instead of being silently ignored.
    open STDOUT,'>',$output or die "can't open $output: $!";
}
49
# Register allocation for the integer-only code path.  The scratch names
# $t0, $t4, $t1 and $t3 share r0-r3 with $ctx, $inp, $len and $T1.
($ctx, $inp, $len, $T1) = ("r0", "r1", "r2", "r3");
($t0,  $t4,  $t1,  $t3) = ("r0", "r1", "r2", "r3");
$t2   = "r12";
$Ktbl = "r14";

# The eight working variables live in r4-r11; @V lists them in the order
# in which they are handed to the round generators.
($A, $B, $C, $D, $E, $F, $G, $H) = map { "r$_" } (4 .. 11);
@V = ($A, $B, $C, $D, $E, $F, $G, $H);

# Rotate amounts used by the round code; the third entry of @sigma0 and
# @sigma1 is applied as a logical shift right rather than a rotate.
@Sigma0 = ( 2, 13, 22);
@Sigma1 = ( 6, 11, 25);
@sigma0 = ( 7, 18,  3);
@sigma1 = (17, 19, 10);
70
# Append the assembly text for one round of the integer-only path to $code.
#   $i       round number; for $i<16 the emitted code also fetches input
#            word X[i] (byte-reversed with "rev" unless __ARMEB__ on
#            ARMv7+, assembled from single bytes otherwise)
#   $a..$h   register names currently playing the roles a..h
# The Maj(a,b,c) term is not added in the round that computes it: it is
# left in a scratch register and added at the start of the following round
# ("from the past"), which is why $t2 and $t3 trade places after each call.
71sub BODY_00_15 {
72my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
73
# Rounds 0..15 only: obtain X[i] in $t1 and start Sigma1(e) in $t0.
74$code.=<<___ if ($i<16);
75#if __ARM_ARCH__>=7
76 @ ldr $t1,[$inp],#4 @ $i
77# if $i==15
78 str $inp,[sp,#17*4] @ make room for $t4
79# endif
80 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
81 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
82 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
83# ifndef __ARMEB__
84 rev $t1,$t1
85# endif
86#else
87 @ ldrb $t1,[$inp,#3] @ $i
88 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
89 ldrb $t2,[$inp,#2]
90 ldrb $t0,[$inp,#1]
91 orr $t1,$t1,$t2,lsl#8
92 ldrb $t2,[$inp],#4
93 orr $t1,$t1,$t0,lsl#16
94# if $i==15
95 str $inp,[sp,#17*4] @ make room for $t4
96# endif
97 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
98 orr $t1,$t1,$t2,lsl#24
99 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
100#endif
101___
# Part shared by every round: store X[i] in the stack buffer, accumulate
# K256[i], Sigma1(e), Ch(e,f,g) and Sigma0(a) into h, and prefetch the
# operands of the next round.  In round 31 the low byte of the K256 word
# just used is compared with 0xf2 to detect the end of the table.
102$code.=<<___;
103 ldr $t2,[$Ktbl],#4 @ *K256++
104 add $h,$h,$t1 @ h+=X[i]
105 str $t1,[sp,#`$i%16`*4]
106 eor $t1,$f,$g
107 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
108 and $t1,$t1,$e
109 add $h,$h,$t2 @ h+=K256[i]
110 eor $t1,$t1,$g @ Ch(e,f,g)
111 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
112 add $h,$h,$t1 @ h+=Ch(e,f,g)
113#if $i==31
114 and $t2,$t2,#0xff
115 cmp $t2,#0xf2 @ done?
116#endif
117#if $i<15
118# if __ARM_ARCH__>=7
119 ldr $t1,[$inp],#4 @ prefetch
120# else
121 ldrb $t1,[$inp,#3]
122# endif
123 eor $t2,$a,$b @ a^b, b^c in next round
124#else
125 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
126 eor $t2,$a,$b @ a^b, b^c in next round
127 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
128#endif
129 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
130 and $t3,$t3,$t2 @ (b^c)&=(a^b)
131 add $d,$d,$h @ d+=h
132 eor $t3,$t3,$b @ Maj(a,b,c)
133 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
134 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
135___
# Swap the two scratch names so the next round finds this round's Maj
# value under $t2 and this round's a^b under $t3.
136 ($t2,$t3)=($t3,$t2);
137}
138
# Append the assembly text for one round with $i>=16.  The emitted code
# first derives the new X[i] from the words kept in the stack buffer
# (indexed modulo 16) using sigma0/sigma1, and pre-computes Sigma1(e);
# the rest of the round is then produced by BODY_00_15 with the same
# arguments.
#   $i       round number
#   $a..$h   register names currently playing the roles a..h
139sub BODY_16_XX {
140my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
141
142$code.=<<___;
143 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
144 @ ldr $t4,[sp,#`($i+14)%16`*4]
145 mov $t0,$t1,ror#$sigma0[0]
146 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
147 mov $t2,$t4,ror#$sigma1[0]
148 eor $t0,$t0,$t1,ror#$sigma0[1]
149 eor $t2,$t2,$t4,ror#$sigma1[1]
150 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
151 ldr $t1,[sp,#`($i+0)%16`*4]
152 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
153 ldr $t4,[sp,#`($i+9)%16`*4]
154
155 add $t2,$t2,$t0
156 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
157 add $t1,$t1,$t2
158 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
159 add $t1,$t1,$t4 @ X[i]
160___
# $i>=16 here, so BODY_00_15 skips its own input-loading template.
161 &BODY_00_15(@_);
162}
163
# Start of the generated file: preprocessor set-up, the K256 constant
# table followed by a zero terminator word, and the entry of
# sha256_block_data_order for the integer-only path up to the first input
# load of the per-block loop (.Loop).
164$code=<<___;
165#ifndef __KERNEL__
166# include "arm_arch.h"
167#else
168# define __ARM_ARCH__ __LINUX_ARM_ARCH__
169# define __ARM_MAX_ARCH__ 7
170#endif
171
172.text
173#if __ARM_ARCH__<7
174.code 32
175#else
176.syntax unified
177# ifdef __thumb2__
Sami Tolvanenf2f770d2015-04-03 18:03:40 +0800178.thumb
179# else
180.code 32
181# endif
182#endif
183
184.type K256,%object
185.align 5
186K256:
187.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
188.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
189.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
190.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
191.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
192.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
193.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
194.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
195.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
196.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
197.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
198.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
199.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
202.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
203.size K256,.-K256
204.word 0 @ terminator
205#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
206.LOPENSSL_armcap:
207.word OPENSSL_armcap_P-sha256_block_data_order
208#endif
209.align 5
210
211.global sha256_block_data_order
212.type sha256_block_data_order,%function
213sha256_block_data_order:
Ard Biesheuvel69216a52019-02-16 14:51:25 +0100214.Lsha256_block_data_order:
Sami Tolvanenf2f770d2015-04-03 18:03:40 +0800215#if __ARM_ARCH__<7
216 sub r3,pc,#8 @ sha256_block_data_order
217#else
Ard Biesheuvel69216a52019-02-16 14:51:25 +0100218 adr r3,.Lsha256_block_data_order
Sami Tolvanenf2f770d2015-04-03 18:03:40 +0800219#endif
220#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
221 ldr r12,.LOPENSSL_armcap
222 ldr r12,[r3,r12] @ OPENSSL_armcap_P
223 tst r12,#ARMV8_SHA256
224 bne .LARMv8
225 tst r12,#ARMV7_NEON
226 bne .LNEON
227#endif
228 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
229 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
230 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
231 sub $Ktbl,r3,#256+32 @ K256
232 sub sp,sp,#16*4 @ alloca(X[16])
233.Loop:
234# if __ARM_ARCH__>=7
235 ldr $t1,[$inp],#4
236# else
237 ldrb $t1,[$inp,#3]
238# endif
239 eor $t3,$B,$C @ magic
240 eor $t2,$t2,$t2
241___
# Rounds 0..15 are emitted once per block.  Rounds 16..31 form the body of
# .Lrounds_16_xx, which the emitted code repeats until round 31 sees the
# last K256 word.  @V is rotated after every round so the next generator
# call receives the working variables under their new roles.
242for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
243$code.=".Lrounds_16_xx:\n";
244for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop control and epilogue: add the working variables into the context,
# rewind the K256 pointer, continue with the next block until the end of
# the input is reached, then release the stack frame and return.
245$code.=<<___;
246#if __ARM_ARCH__>=7
247 ite eq @ Thumb2 thing, sanity check in ARM
248#endif
249 ldreq $t3,[sp,#16*4] @ pull ctx
250 bne .Lrounds_16_xx
251
252 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
253 ldr $t0,[$t3,#0]
254 ldr $t1,[$t3,#4]
255 ldr $t2,[$t3,#8]
256 add $A,$A,$t0
257 ldr $t0,[$t3,#12]
258 add $B,$B,$t1
259 ldr $t1,[$t3,#16]
260 add $C,$C,$t2
261 ldr $t2,[$t3,#20]
262 add $D,$D,$t0
263 ldr $t0,[$t3,#24]
264 add $E,$E,$t1
265 ldr $t1,[$t3,#28]
266 add $F,$F,$t2
267 ldr $inp,[sp,#17*4] @ pull inp
268 ldr $t2,[sp,#18*4] @ pull inp+len
269 add $G,$G,$t0
270 add $H,$H,$t1
271 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
272 cmp $inp,$t2
273 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
274 bne .Loop
275
276 add sp,sp,#`16+3`*4 @ destroy frame
277#if __ARM_ARCH__>=5
278 ldmia sp!,{r4-r11,pc}
279#else
280 ldmia sp!,{r4-r11,lr}
281 tst lr,#1
282 moveq pc,lr @ be binary compatible with V4, yet
283 bx lr @ interoperable with Thumb ISA:-)
284#endif
285.size sha256_block_data_order,.-sha256_block_data_order
286___
287######################################################################
288# NEON stuff
289#
290{{{
# Names used by the NEON path.  @X holds the four quad registers carrying
# the 16 message words, $T0..$T3 are quad-word temporaries and $T4/$T5
# double-word ones.  This lexical $T1 hides the integer-path $T1 for the
# rest of the enclosing scope.
my @X = map { "q$_" } (0 .. 3);
my ($T0, $T1, $T2, $T3) = map { "q$_" } (8 .. 11);
my ($T4, $T5) = ("d24", "d25");
my $Xfer = $t4;    # register addressing the X[i]+K[i] area on the stack
my $j = 0;         # round counter advanced by the body_00_15 snippets
295
# Map a quad register name "qN" to the name of its low double-word half,
# "d(2N)".  Returns an empty string when the argument contains no
# q-register name.
sub Dlo() {
    my $reg = shift;
    if ($reg =~ m|q([1]?[0-9])|) {
        return "d" . ($1 * 2);
    }
    return "";
}
# Map a quad register name "qN" to the name of its high double-word half,
# "d(2N+1)".  Returns an empty string when the argument contains no
# q-register name.
sub Dhi() {
    my $reg = shift;
    if ($reg =~ m|q([1]?[0-9])|) {
        return "d" . ($1 * 2 + 1);
    }
    return "";
}
298
# Catch-all for calls to undefined subs: the sub name is taken as an
# instruction mnemonic and one line of assembly is appended to $code.
# The first underscore of the name becomes a dot (vext_8 -> vext.8), the
# arguments are joined with commas, and a last argument that is a plain
# number is prefixed with '#' to make it an immediate.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;    # strip the package qualifier
    $mnemonic =~ s/_/\./;

    my @operands = @_;
    my $last = pop(@operands);
    # true only when $last equals its own numeric value written as a string
    $last = "#$last" if ($last*1 eq $last);

    $code .= "\t$mnemonic\t" . join(',', @operands, $last) . "\n";
}
305
# Emit the NEON instructions that turn @X[0] into the next four message
# words (X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1 of the two most
# recent words, done in two halves), add the next K256 vector and store
# the sum through $Xfer.  The NEON instructions are interleaved with the
# scalar instructions of four rounds.
#   $body   code reference returning a list of Perl snippets for one
#           round; it is called four times and the snippets are eval'ed
#           one by one between the NEON instructions
# The lexicals $a..$h are declared here so that the eval'ed snippets, which
# assign them from @V, use variables private to this call.  @X is rotated
# on exit so the following call works on the next vector.
# NOTE(review): the empty prototype has no effect on the visible call
# sites, which all use the &Xupdate(...) form.
306sub Xupdate()
307{ use integer;
308 my $body = shift;
309 my @insns = (&$body,&$body,&$body,&$body);
310 my ($a,$b,$c,$d,$e,$f,$g,$h);
311
312 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
313 eval(shift(@insns));
314 eval(shift(@insns));
315 eval(shift(@insns));
316 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
317 eval(shift(@insns));
318 eval(shift(@insns));
319 eval(shift(@insns));
320 &vshr_u32 ($T2,$T0,$sigma0[0]);
321 eval(shift(@insns));
322 eval(shift(@insns));
323 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
324 eval(shift(@insns));
325 eval(shift(@insns));
326 &vshr_u32 ($T1,$T0,$sigma0[2]);
327 eval(shift(@insns));
328 eval(shift(@insns));
329 &vsli_32 ($T2,$T0,32-$sigma0[0]);
330 eval(shift(@insns));
331 eval(shift(@insns));
332 &vshr_u32 ($T3,$T0,$sigma0[1]);
333 eval(shift(@insns));
334 eval(shift(@insns));
335 &veor ($T1,$T1,$T2);
336 eval(shift(@insns));
337 eval(shift(@insns));
338 &vsli_32 ($T3,$T0,32-$sigma0[1]);
339 eval(shift(@insns));
340 eval(shift(@insns));
341 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
342 eval(shift(@insns));
343 eval(shift(@insns));
344 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
345 eval(shift(@insns));
346 eval(shift(@insns));
347 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
348 eval(shift(@insns));
349 eval(shift(@insns));
350 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
351 eval(shift(@insns));
352 eval(shift(@insns));
353 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
354 eval(shift(@insns));
355 eval(shift(@insns));
356 &veor ($T5,$T5,$T4);
357 eval(shift(@insns));
358 eval(shift(@insns));
359 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
360 eval(shift(@insns));
361 eval(shift(@insns));
362 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
363 eval(shift(@insns));
364 eval(shift(@insns));
365 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
366 eval(shift(@insns));
367 eval(shift(@insns));
368 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
369 eval(shift(@insns));
370 eval(shift(@insns));
371 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
372 eval(shift(@insns));
373 eval(shift(@insns));
374 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
375 eval(shift(@insns));
376 eval(shift(@insns));
377 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
378 eval(shift(@insns));
379 eval(shift(@insns));
380 &veor ($T5,$T5,$T4);
381 eval(shift(@insns));
382 eval(shift(@insns));
383 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
384 eval(shift(@insns));
385 eval(shift(@insns));
386 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
387 eval(shift(@insns));
388 eval(shift(@insns));
389 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
390 eval(shift(@insns));
391 eval(shift(@insns));
392 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
393 eval(shift(@insns));
394 eval(shift(@insns));
395 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
396 eval(shift(@insns));
397 eval(shift(@insns));
398 &vadd_i32 ($T0,$T0,@X[0]);
# Drain the scalar snippets except the last two, which follow the store.
399 while($#insns>=2) { eval(shift(@insns)); }
400 &vst1_32 ("{$T0}","[$Xfer,:128]!");
401 eval(shift(@insns));
402 eval(shift(@insns));
403
404 push(@X,shift(@X)); # "rotate" X[]
405}
406
# Emit the NEON instructions that prepare one vector of an already loaded
# input block: load the next K256 vector, byte-reverse the 32-bit words of
# @X[0], add the two and store the sum through $Xfer.  The instructions
# are interleaved with the scalar instructions of four rounds.
#   $body   code reference returning a list of Perl snippets for one
#           round; it is called four times and the snippets are eval'ed
# The lexicals $a..$h exist for the eval'ed snippets, which assign them
# from @V.  @X is rotated on exit.
# NOTE(review): the empty prototype has no effect on the visible call
# sites, which all use the &Xpreload(...) form.
407sub Xpreload()
408{ use integer;
409 my $body = shift;
410 my @insns = (&$body,&$body,&$body,&$body);
411 my ($a,$b,$c,$d,$e,$f,$g,$h);
412
413 eval(shift(@insns));
414 eval(shift(@insns));
415 eval(shift(@insns));
416 eval(shift(@insns));
417 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
418 eval(shift(@insns));
419 eval(shift(@insns));
420 eval(shift(@insns));
421 eval(shift(@insns));
422 &vrev32_8 (@X[0],@X[0]);
423 eval(shift(@insns));
424 eval(shift(@insns));
425 eval(shift(@insns));
426 eval(shift(@insns));
427 &vadd_i32 ($T0,$T0,@X[0]);
428 foreach (@insns) { eval; } # remaining instructions
429 &vst1_32 ("{$T0}","[$Xfer,:128]!");
430
431 push(@X,shift(@X)); # "rotate" X[]
432}
433
# Return the scalar instructions of one round as a list of Perl source
# snippets.  Xupdate and Xpreload eval the snippets one at a time; each
# "&mnemonic(...)" call in a snippet reaches AUTOLOAD, which appends the
# instruction to $code.  The first snippet binds $a..$h to the current
# @V; the last one increments $j, rotates @V and swaps $t2/$t3.
# The load of $t1 near the end depends on $j: normally the next
# pre-computed word from the stack, the word at $Ktbl when $j==15, and the
# word at [sp,#64] when $j==31.
# Single quotes keep the snippets from being interpolated here, so the
# variables are resolved when the snippets are eval'ed.
434sub body_00_15 () {
435 (
436 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
437 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
438 '&eor ($t1,$f,$g)',
439 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
440 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
441 '&and ($t1,$t1,$e)',
442 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
443 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
444 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
445 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
446 '&eor ($t2,$a,$b)', # a^b, b^c in next round
447 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
448 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
449 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
450 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
451 '&ldr ($t1,"[sp,#64]") if ($j==31)',
452 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
453 '&add ($d,$d,$h)', # d+=h
454 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
455 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
456 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
457 )
458}
459
# NEON code path (sha256_block_data_order_neon).  Entry: carve out a
# 16-byte aligned stack area, load and byte-reverse the first input block,
# pre-add the first four K256 vectors and store the sums on the stack,
# keep ctx, inp, end-of-input and the original sp at [sp,#64..#76], and
# load the working variables.
460$code.=<<___;
461#if __ARM_MAX_ARCH__>=7
462.arch armv7-a
463.fpu neon
464
465.global sha256_block_data_order_neon
466.type sha256_block_data_order_neon,%function
467.align 4
468sha256_block_data_order_neon:
469.LNEON:
470 stmdb sp!,{r4-r12,lr}
471
472 sub $H,sp,#16*4+16
Ard Biesheuvel54781932020-09-16 09:14:17 +0300473 adr $Ktbl,.Lsha256_block_data_order
474 sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
Sami Tolvanenf2f770d2015-04-03 18:03:40 +0800475 bic $H,$H,#15 @ align for 128-bit stores
476 mov $t2,sp
477 mov sp,$H @ alloca
478 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
479
480 vld1.8 {@X[0]},[$inp]!
481 vld1.8 {@X[1]},[$inp]!
482 vld1.8 {@X[2]},[$inp]!
483 vld1.8 {@X[3]},[$inp]!
484 vld1.32 {$T0},[$Ktbl,:128]!
485 vld1.32 {$T1},[$Ktbl,:128]!
486 vld1.32 {$T2},[$Ktbl,:128]!
487 vld1.32 {$T3},[$Ktbl,:128]!
488 vrev32.8 @X[0],@X[0] @ yes, even on
489 str $ctx,[sp,#64]
490 vrev32.8 @X[1],@X[1] @ big-endian
491 str $inp,[sp,#68]
492 mov $Xfer,sp
493 vrev32.8 @X[2],@X[2]
494 str $len,[sp,#72]
495 vrev32.8 @X[3],@X[3]
496 str $t2,[sp,#76] @ save original sp
497 vadd.i32 $T0,$T0,@X[0]
498 vadd.i32 $T1,$T1,@X[1]
499 vst1.32 {$T0},[$Xfer,:128]!
500 vadd.i32 $T2,$T2,@X[2]
501 vst1.32 {$T1},[$Xfer,:128]!
502 vadd.i32 $T3,$T3,@X[3]
503 vst1.32 {$T2},[$Xfer,:128]!
504 vst1.32 {$T3},[$Xfer,:128]!
505
506 ldmia $ctx,{$A-$H}
507 sub $Xfer,$Xfer,#64
508 ldr $t1,[sp,#0]
509 eor $t2,$t2,$t2
510 eor $t3,$B,$C
511 b .L_00_48
512
513.align 4
514.L_00_48:
515___
# Body of .L_00_48: four Xupdate steps, i.e. 16 rounds interleaved with the
# computation of the next 16 message words.
516 &Xupdate(\&body_00_15);
517 &Xupdate(\&body_00_15);
518 &Xupdate(\&body_00_15);
519 &Xupdate(\&body_00_15);
# Repeat .L_00_48 until the word fetched from $Ktbl in the 16th round of
# the pass is the zero terminator placed after K256; then rewind $Ktbl and
# load the next input block, stepping back 64 bytes at the end of the
# input so that the load stays inside the buffer.
520$code.=<<___;
521 teq $t1,#0 @ check for K256 terminator
522 ldr $t1,[sp,#0]
523 sub $Xfer,$Xfer,#64
524 bne .L_00_48
525
526 ldr $inp,[sp,#68]
527 ldr $t0,[sp,#72]
528 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
529 teq $inp,$t0
530 it eq
531 subeq $inp,$inp,#64 @ avoid SEGV
532 vld1.8 {@X[0]},[$inp]! @ load next input block
533 vld1.8 {@X[1]},[$inp]!
534 vld1.8 {@X[2]},[$inp]!
535 vld1.8 {@X[3]},[$inp]!
536 it ne
537 strne $inp,[sp,#68]
538 mov $Xfer,sp
539___
# Last 16 rounds of the current block, interleaved with the byte reversal
# and K256 pre-addition of the block just loaded.
540 &Xpreload(\&body_00_15);
541 &Xpreload(\&body_00_15);
542 &Xpreload(\&body_00_15);
543 &Xpreload(\&body_00_15);
# Add the working variables into the context, then either start the next
# block or restore the original sp and return.
544$code.=<<___;
545 ldr $t0,[$t1,#0]
546 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
547 ldr $t2,[$t1,#4]
548 ldr $t3,[$t1,#8]
549 ldr $t4,[$t1,#12]
550 add $A,$A,$t0 @ accumulate
551 ldr $t0,[$t1,#16]
552 add $B,$B,$t2
553 ldr $t2,[$t1,#20]
554 add $C,$C,$t3
555 ldr $t3,[$t1,#24]
556 add $D,$D,$t4
557 ldr $t4,[$t1,#28]
558 add $E,$E,$t0
559 str $A,[$t1],#4
560 add $F,$F,$t2
561 str $B,[$t1],#4
562 add $G,$G,$t3
563 str $C,[$t1],#4
564 add $H,$H,$t4
565 str $D,[$t1],#4
566 stmia $t1,{$E-$H}
567
568 ittte ne
569 movne $Xfer,sp
570 ldrne $t1,[sp,#0]
571 eorne $t2,$t2,$t2
572 ldreq sp,[sp,#76] @ restore original sp
573 itt ne
574 eorne $t3,$B,$C
575 bne .L_00_48
576
577 ldmia sp!,{r4-r12,pc}
578.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
579#endif
580___
581}}}
582######################################################################
583# ARMv8 stuff
584#
585{{{
# Names used by the ARMv8 path.  $Ktbl is a lexical bound to r3 here and
# hides the global $Ktbl inside this scope.
586my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
587my @MSG=map("q$_",(8..11));
588my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
589my $Ktbl="r3";
590
# Entry of sha256_block_data_order_armv8 and the head of the per-block
# loop.  The section is excluded when __KERNEL__ is defined.  The INST
# macro emits an instruction as four bytes, with a different byte order
# and an adjusted fourth byte under __thumb2__.
591$code.=<<___;
592#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
593
594# ifdef __thumb2__
595# define INST(a,b,c,d) .byte c,d|0xc,a,b
596# else
597# define INST(a,b,c,d) .byte a,b,c,d
598# endif
599
600.type sha256_block_data_order_armv8,%function
601.align 5
602sha256_block_data_order_armv8:
603.LARMv8:
604 vld1.32 {$ABCD,$EFGH},[$ctx]
605# ifdef __thumb2__
606 adr $Ktbl,.LARMv8
607 sub $Ktbl,$Ktbl,#.LARMv8-K256
608# else
609 adrl $Ktbl,K256
610# endif
611 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
612
613.Loop_v8:
614 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
615 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
616 vld1.32 {$W0},[$Ktbl]!
617 vrev32.8 @MSG[0],@MSG[0]
618 vrev32.8 @MSG[1],@MSG[1]
619 vrev32.8 @MSG[2],@MSG[2]
620 vrev32.8 @MSG[3],@MSG[3]
621 vmov $ABCD_SAVE,$ABCD @ offload
622 vmov $EFGH_SAVE,$EFGH
623 teq $inp,$len
624___
# Twelve groups that each consume one K256 vector and also update the
# message vectors with sha256su0/sha256su1.  $W0/$W1 are swapped and @MSG
# is rotated after every group.
625for($i=0;$i<12;$i++) {
626$code.=<<___;
627 vld1.32 {$W1},[$Ktbl]!
628 vadd.i32 $W0,$W0,@MSG[0]
629 sha256su0 @MSG[0],@MSG[1]
630 vmov $abcd,$ABCD
631 sha256h $ABCD,$EFGH,$W0
632 sha256h2 $EFGH,$abcd,$W0
633 sha256su1 @MSG[0],@MSG[2],@MSG[3]
634___
635 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
636}
# Four final groups without message updates, followed by accumulation
# into the saved state, the loop branch, the store of the result to the
# context and the return.
637$code.=<<___;
638 vld1.32 {$W1},[$Ktbl]!
639 vadd.i32 $W0,$W0,@MSG[0]
640 vmov $abcd,$ABCD
641 sha256h $ABCD,$EFGH,$W0
642 sha256h2 $EFGH,$abcd,$W0
643
644 vld1.32 {$W0},[$Ktbl]!
645 vadd.i32 $W1,$W1,@MSG[1]
646 vmov $abcd,$ABCD
647 sha256h $ABCD,$EFGH,$W1
648 sha256h2 $EFGH,$abcd,$W1
649
650 vld1.32 {$W1},[$Ktbl]
651 vadd.i32 $W0,$W0,@MSG[2]
652 sub $Ktbl,$Ktbl,#256-16 @ rewind
653 vmov $abcd,$ABCD
654 sha256h $ABCD,$EFGH,$W0
655 sha256h2 $EFGH,$abcd,$W0
656
657 vadd.i32 $W1,$W1,@MSG[3]
658 vmov $abcd,$ABCD
659 sha256h $ABCD,$EFGH,$W1
660 sha256h2 $EFGH,$abcd,$W1
661
662 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
663 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
664 it ne
665 bne .Loop_v8
666
667 vst1.32 {$ABCD,$EFGH},[$ctx]
668
669 ret @ bx lr
670.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
671#endif
672___
673}}}
# End of the generated file: an identification string and, when
# __KERNEL__ is not defined, the common symbol OPENSSL_armcap_P.
# The '@' is escaped so that Perl does not interpolate "@openssl".
674$code.=<<___;
675.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
676.align 2
677#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
678.comm OPENSSL_armcap_P,4,4
679#endif
680___
681
# Copy this script's own leading comment block into the output, turning
# each leading '#' into '@' so the lines become assembler comments.  The
# shebang line is skipped, blank lines are passed through, and copying
# stops at the first line that is neither a comment nor blank.
#
# A lexical handle and three-argument open are used so that $0 is always
# treated as a plain file name.  The banner is best effort: if the script
# cannot be re-read, a warning is issued and generation continues.
if (open my $self,'<',$0) {
    while(<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
} else {
    warn "$0: cannot re-read script for its header comments: $!\n";
}
689
{   # Base encodings of the SHA-256 instructions; the register fields are
    # OR-ed in by unsha256().
    my %opcode = (
        "sha256h"   => 0xf3000c40,  "sha256h2"  => 0xf3100c40,
        "sha256su0" => 0xf3ba03c0,  "sha256su1" => 0xf3200c40  );

    # Translate one SHA-256 instruction into an INST(...) byte sequence so
    # that assemblers without these mnemonics can build the output.
    #   $mnemonic  instruction name (a key of %opcode)
    #   $arg       operand text: two or three q registers, comma separated
    # Returns "INST(b0,b1,b2,b3)\t@ mnemonic operands", least significant
    # byte first.  When the mnemonic is unknown or the operands cannot be
    # parsed, the instruction is returned as "mnemonic\toperands" rather
    # than an empty value, so the caller's substitution never deletes it
    # silently and the assembler gets to report the problem.
    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if (exists $opcode{$mnemonic}
            && $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
            # The middle register is absent in the two-operand form and
            # then contributes zero bits.
            my ($qd,$qn,$qm) = ($1, (defined $2 ? $2 : 0), $3);
            my $word = $opcode{$mnemonic}|(($qd&7)<<13)|(($qd&8)<<19)
                                         |(($qn&7)<<17)|(($qn&8)<<4)
                                         |(($qm&7)<<1) |(($qm&8)<<2);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }
}
711
# Post-process the accumulated assembly line by line and print it.
foreach (split($/,$code)) {

    # Evaluate `...` spans as Perl expressions (constant arithmetic that
    # the templates left for this stage).
    s/\`([^\`]*)\`/eval $1/ge;

    # Replace SHA-256 mnemonics with their INST(...) byte encoding.
    s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

    # "ret" becomes "bx lr"; a "bx lr" that was already in the text is
    # emitted as a raw opcode word instead.
    s/\bret\b/bx lr/g or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

    print $_,"\n";
}

# Closing flushes the output buffer; a failure here (for example a full
# disk) would otherwise leave a truncated file behind without any error.
close STDOUT or die "error closing STDOUT: $!";