/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Assembler setup: unified ARM/Thumb syntax, NEON instructions enabled,
 * everything below is emitted into the .text section.
 */
.syntax unified
.fpu neon

.text

/* Context structure */

/*
 * Byte offsets of the five 32-bit chaining words inside the buffer that
 * r0 points to.  The words are contiguous, which is what allows the
 * function below to read and write all of them with a single ldm/stm.
 */
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16

/* Constants */

/* The four SHA-1 round constants (one per group of 20 rounds). */
#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6

/*
 * Each constant is replicated into four 32-bit lanes so that a single
 * q-register add applies K to four message-schedule words at once.
 * The table lives in .text so that the function can take its address
 * with `adr`.  On ARM targets GNU as interprets the .align operand as a
 * power of two, so the table is 16-byte aligned.
 */
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4

/* Register macros */

/* Arguments (r0-r2) and bookkeeping registers. */
#define RSTATE r0	/* pointer to the five chaining words */
#define RDATA r1	/* input pointer, post-incremented by the loads */
#define RNBLKS r2	/* number of 64-byte blocks still to process */
#define ROLDSTACK r3	/* caller's sp, kept while sp is realigned */
#define RWK lr		/* store pointer into the W+K buffer on the stack */

/* SHA-1 working variables a..e. */
#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

/* Integer scratch registers. */
#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

/*
 * Eight q registers, each holding four consecutive message-schedule
 * words.  The mapping is deliberately not W<n> == q<n>: W0/W7 are q0/q1
 * and W6/W5 are q5/q6, i.e. adjacent registers, which is what lets the
 * block loads `vld1.32 {W0, W7}` and `vld1.32 {W6, W5}` name them in a
 * single register list.
 */
#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

/* NEON scratch registers. */
#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

/* Round constants, loaded once from .LK_VEC. */
#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

/*
 * ARM_LE(code) emits `code` only when CONFIG_CPU_BIG_ENDIAN is not
 * defined.  It wraps the vrev32.8 byte swaps that turn the big-endian
 * message words into native order on little-endian builds.
 */
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif

/* Round function macros. */

/*
 * Byte offset, relative to sp, of the slot holding W[i]+K.  The slots
 * form a 16-entry ring (64 bytes), hence the `& 15`.
 */
#define WK_offs(i) (((i) & 15) * 4)

/*
 * Common shape of the _R_Fn macros
 *
 *   a..e   registers holding the working variables for this round
 *   i      round number; selects the W+K slot to read
 *   pre1..pre3
 *          message-schedule macros expanded at three points inside the
 *          round so NEON work is interleaved with the integer work; each
 *          is invoked as preN(i16, W, W_m04, ..., W_m28)
 *
 * Every round performs
 *   e += rol(a, 5) + f(b, c, d) + (W[i] + K);   b = rol(b, 30);
 * using RT0, RT1 and RT3 as scratch.  No instruction sets the flags.
 */

/*
 * f = Ch(b,c,d) = (b & c) | (~b & d).  The two terms can never have the
 * same bit set, so they are accumulated with add instead of orr.
 */
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/* f = Parity(b,c,d) = b ^ c ^ d. */
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/*
 * f = Maj(b,c,d), computed as (b & c) + ((b ^ c) & d).  As in _R_F1 the
 * two terms are bitwise disjoint, so add is equivalent to orr.
 */
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

/* The fourth round group reuses the parity function of _R_F2. */
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* One round with interleaved schedule work; f is one of F1..F4. */
#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
	   W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/*
 * One round without schedule work.  All three hooks are `dummy`, which
 * swallows its arguments, so the trailing i16/W... names are only
 * placeholder tokens and never reach the assembler.
 */
#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* Hook that expands to nothing. */
#define dummy(...)

/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

/*
 * Load one 64-byte block from RDATA (post-incremented), byte-swap each
 * 32-bit word on little-endian builds, add curK to every word and store
 * the sixteen W+K values to the stack buffer starting at WK_offs(0).
 *
 * After this macro: W0 = words 0-3, W7 = words 4-7, W6 = words 8-11,
 * W5 = words 12-15.  Clobbers RWK and tmp0-tmp3.  `curK` must be
 * #defined to the q register holding the constant for rounds 0-15.
 */
#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0)); \
	\
	vld1.32   {W0, W7}, [RDATA]!; \
	ARM_LE(vrev32.8  W0, W0; )	/* big => little */ \
	vld1.32   {W6, W5}, [RDATA]!; \
	vadd.u32  tmp0, W0, curK; \
	ARM_LE(vrev32.8  W7, W7; )	/* big => little */ \
	ARM_LE(vrev32.8  W6, W6; )	/* big => little */ \
	vadd.u32  tmp1, W7, curK; \
	ARM_LE(vrev32.8  W5, W5; )	/* big => little */ \
	vadd.u32  tmp2, W6, curK; \
	vst1.32   {tmp0, tmp1}, [RWK]!; \
	vadd.u32  tmp3, W5, curK; \
	vst1.32   {tmp2, tmp3}, [RWK];

/*
 * The same work as W_PRECALC_00_15, cut into thirteen single-instruction
 * pieces so that it can be passed as pre-hooks to rounds 64-79 and
 * thereby prepare the next block while the current one finishes.  The
 * pieces take the standard hook argument list but ignore all of it.
 * They must be used in numeric order.
 */
#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!;

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ARM_LE(vrev32.8  W0, W0; )	/* big => little */

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!;

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ARM_LE(vrev32.8  W7, W7; )	/* big => little */

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ARM_LE(vrev32.8  W6, W6; )	/* big => little */

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ARM_LE(vrev32.8  W5, W5; )	/* big => little */

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];

/********* Precalc macros for rounds 16-31 ************************************/

/*
 * Twelve pieces (_0 .. _11, to be used in order) that compute four
 * schedule words W[i..i+3] and store W+curK at WK_offs(i):
 *
 *   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
 *
 *   i       index of the first of the four words (a multiple of 4)
 *   W       destination q register
 *   W_m04.. registers holding the words 4, 8, 12 and 16 positions back
 *
 * W[i+3] depends on W[i], which belongs to the same vector and is not
 * available yet.  The W[t-3] vector is therefore built with a zero in
 * lane 3, and the missing contribution is added afterwards: tmp1 carries
 * the un-rotated lane 0 moved into lane 3, and rotating that by 2 equals
 * rol(W[i], 1) for the full W[i].
 *
 * Clobbers RWK, tmp0 and tmp1.
 */

/* tmp0 = 0; W = W[t-14] (upper half of W_m16, lower half of W_m12) */
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0; \
	vext.8    W, W_m16, W_m12, #8;

/* tmp0 = W[t-3] with lane 3 zero */
#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i)); \
	vext.8    tmp0, W_m04, tmp0, #4;

/* fold in W[t-16] and W[t-8] */
#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16; \
	veor.32   W, W, W_m08;

/* tmp1 = 0; W = full xor (lane 3 still lacks its W[t-3] term) */
#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1; \
	veor      W, W, tmp0;

/* _4 to _6: tmp0 = rol(W, 1), built from a shift pair and an orr */
#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;

/* tmp1 = { 0, 0, 0, lane 0 of W } */
#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12); \
	vshr.u32  W, W, #31;

/* _6 to _9: xor rol(tmp1, 2) into the result, half at a time */
#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W; \
	vshr.u32  W, tmp1, #30;

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;

/* W = finished schedule words */
#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];

/********* Precalc macros for rounds 32-79 ************************************/

/*
 * Ten pieces (_0 .. _9, to be used in order) that compute four schedule
 * words W[i..i+3] and store W+curK at WK_offs(i & ~3), using the form
 *
 *   W[t] = rol(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2)
 *
 * None of the inputs lies inside the vector being produced, so no lane
 * fix-up is needed here.  On entry the destination register W must
 * still hold the words from 32 positions back; it is updated in place.
 *
 * Clobbers RWK, tmp0 and tmp1.
 */

/* W = W[t-32] ^ W[t-28] */
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m28;

/* tmp0 = W[t-6] (upper half of W_m08, lower half of W_m04) */
#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp0, W_m08, W_m04, #8;

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m16;

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0;

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i&~3));

/* _5 to _7: W = rol(W, 2) */
#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, W, #2;

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32  tmp0, W, #30;

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      W, tmp0, tmp1;

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];

/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 *
 * In:   r0 = ctx   (five 32-bit chaining words, updated in place)
 *       r1 = data  (64*nblks bytes)
 *       r2 = nblks (0 is accepted and returns immediately)
 *
 * Stack: r4-r12 and lr are pushed, then sp is moved down by at least
 * 64 bytes and rounded down to a 16-byte boundary; that area is the
 * 16-entry W+K ring read by the round macros.  The caller's sp is kept
 * in r3 and restored before the final state update.
 *
 * Structure: the 80 rounds of a block are interleaved with the NEON
 * message-schedule computation for later rounds.  When more blocks
 * follow, rounds 64-79 are interleaved with loading the next block, so
 * the loop re-enters at round 0 with W+K for rounds 0-15 ready.
 *
 * NOTE(review): q4-q7 are written (W1, W4, W5, W6) while the
 * vpush/vpop pair that would preserve them is commented out, so the
 * caller is presumably responsible for saving NEON state - confirm.
 *
 * NOTE(review): no instruction writes r0, so the `unsigned int` return
 * type in the prototype above looks stale - confirm against the C
 * declaration.
 */
.align 3
ENTRY(sha1_transform_neon)
	/* input:
	 *	r0: ctx, CTX
	 *	r1: data (64*nblks bytes)
	 *	r2: nblks
	 */

	cmp RNBLKS, #0;
	beq .Ldo_nothing;

	push {r4-r12, lr};
	/*vpush {q4-q7};*/

	adr RT3, .LK_VEC;

	mov ROLDSTACK, sp;

	/* Align stack. */
	sub RT0, sp, #(16*4);
	and RT0, #(~(16-1));
	mov sp, RT0;

	vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

	/* Get the values of the chaining variables. */
	ldm RSTATE, {_a-_e};

	vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
	/* Precalc 0-15. */
	W_PRECALC_00_15();

.Loop:
	/* Transform 0-15 + Precalc 16-31. */
	_R( _a, _b, _c, _d, _e, F1,  0,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1,  1,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1,  2,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1,  3,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
	    W4, W5, W6, W7, W0, _, _, _ );

	/* Schedule words 20 and up belong to rounds that use K2. */
#undef curK
#define curK qK2
	_R( _b, _c, _d, _e, _a, F1,  4,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1,  5,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1,  6,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1,  7,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
	    W3, W4, W5, W6, W7, _, _, _ );

	_R( _c, _d, _e, _a, _b, F1,  8,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1,  9,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 10,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 11,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
	    W2, W3, W4, W5, W6, _, _, _ );

	_R( _d, _e, _a, _b, _c, F1, 12,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 13,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 14,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 15,
	    WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
	    W1, W2, W3, W4, W5, _, _, _ );

	/* Transform 16-63 + Precalc 32-79. */
	_R( _e, _a, _b, _c, _d, F1, 16,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _d, _e, _a, _b, _c, F1, 17,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _c, _d, _e, _a, _b, F1, 18,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F1, 19,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _a, _b, _c, _d, _e, F2, 20,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _e, _a, _b, _c, _d, F2, 21,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _d, _e, _a, _b, _c, F2, 22,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F2, 23,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);

	/* Schedule words 40 and up belong to rounds that use K3. */
#undef curK
#define curK qK3
	_R( _b, _c, _d, _e, _a, F2, 24,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _a, _b, _c, _d, _e, F2, 25,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _e, _a, _b, _c, _d, F2, 26,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F2, 27,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	_R( _c, _d, _e, _a, _b, F2, 28,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _b, _c, _d, _e, _a, F2, 29,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _a, _b, _c, _d, _e, F2, 30,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F2, 31,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	_R( _d, _e, _a, _b, _c, F2, 32,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _c, _d, _e, _a, _b, F2, 33,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _b, _c, _d, _e, _a, F2, 34,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _a, _b, _c, _d, _e, F2, 35,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);

	_R( _e, _a, _b, _c, _d, F2, 36,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _d, _e, _a, _b, _c, F2, 37,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _c, _d, _e, _a, _b, F2, 38,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _b, _c, _d, _e, _a, F2, 39,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);

	_R( _a, _b, _c, _d, _e, F3, 40,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _e, _a, _b, _c, _d, F3, 41,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _d, _e, _a, _b, _c, F3, 42,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _c, _d, _e, _a, _b, F3, 43,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);

	/* Schedule words 60 and up belong to rounds that use K4. */
#undef curK
#define curK qK4
	_R( _b, _c, _d, _e, _a, F3, 44,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _a, _b, _c, _d, _e, F3, 45,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _e, _a, _b, _c, _d, F3, 46,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _d, _e, _a, _b, _c, F3, 47,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);

	_R( _c, _d, _e, _a, _b, F3, 48,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F3, 49,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _a, _b, _c, _d, _e, F3, 50,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _e, _a, _b, _c, _d, F3, 51,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _d, _e, _a, _b, _c, F3, 52,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F3, 53,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _b, _c, _d, _e, _a, F3, 54,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _a, _b, _c, _d, _e, F3, 55,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);

	_R( _e, _a, _b, _c, _d, F3, 56,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F3, 57,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _c, _d, _e, _a, _b, F3, 58,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _b, _c, _d, _e, _a, F3, 59,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	/*
	 * Count the block down here; the Z flag is consumed by the beq
	 * after round 63.  That is safe because none of the instructions
	 * in the round and precalc macros carries an S suffix.
	 */
	subs RNBLKS, #1;

	_R( _a, _b, _c, _d, _e, F4, 60,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F4, 61,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _d, _e, _a, _b, _c, F4, 62,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _c, _d, _e, _a, _b, F4, 63,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	/* Last block: finish without touching RDATA again. */
	beq .Lend;

	/* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
	_R( _b, _c, _d, _e, _a, F4, 64,
	    WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 65,
	    WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 66,
	    WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 67,
	    WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _c, _d, _e, _a, _b, F4, 68,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 69,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 70,
	    WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 71,
	    WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _d, _e, _a, _b, _c, F4, 72,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 73,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 74,
	    WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 75,
	    WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _e, _a, _b, _c, _d, F4, 76,
	    WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 77,
	    WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 78,
	    WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	/*
	 * The final store of the next block's W+K is issued as pre3, i.e.
	 * after round 79 has already loaded its own slot.
	 */
	_R( _b, _c, _d, _e, _a, F4, 79,
	    WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

	/*
	 * Update the chaining variables.  The sums stay in _a.._e and are
	 * the starting values for the next block.
	 */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	add _e, RT0;
	stm RSTATE, {_a-_e};

	b .Loop;

.Lend:
	/* Transform 64-79 */
	R( _b, _c, _d, _e, _a, F4, 64 );
	R( _a, _b, _c, _d, _e, F4, 65 );
	R( _e, _a, _b, _c, _d, F4, 66 );
	R( _d, _e, _a, _b, _c, F4, 67 );
	R( _c, _d, _e, _a, _b, F4, 68 );
	R( _b, _c, _d, _e, _a, F4, 69 );
	R( _a, _b, _c, _d, _e, F4, 70 );
	R( _e, _a, _b, _c, _d, F4, 71 );
	R( _d, _e, _a, _b, _c, F4, 72 );
	R( _c, _d, _e, _a, _b, F4, 73 );
	R( _b, _c, _d, _e, _a, F4, 74 );
	R( _a, _b, _c, _d, _e, F4, 75 );
	R( _e, _a, _b, _c, _d, F4, 76 );
	R( _d, _e, _a, _b, _c, F4, 77 );
	R( _c, _d, _e, _a, _b, F4, 78 );
	R( _b, _c, _d, _e, _a, F4, 79 );

	/* The W+K buffer is no longer needed; drop the aligned frame. */
	mov sp, ROLDSTACK;

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	/*vpop {q4-q7};*/
	add _e, RT0;
	stm RSTATE, {_a-_e};

	pop {r4-r12, pc};

.Ldo_nothing:
	bx lr
ENDPROC(sha1_transform_neon)