Mathias Krause | 66be895 | 2011-08-04 20:19:25 +0200 | [diff] [blame] | 1 | /* |
| 2 | * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental |
| 3 | * SSE3 instruction set extensions introduced in Intel Core Microarchitecture |
| 4 | * processors. CPUs supporting Intel(R) AVX extensions will get an additional |
| 5 | * boost. |
| 6 | * |
| 7 | * This work was inspired by the vectorized implementation of Dean Gaudet. |
| 8 | * Additional information on it can be found at: |
| 9 | * http://www.arctic.org/~dean/crypto/sha1.html |
| 10 | * |
| 11 | * It was improved upon with more efficient vectorization of the message |
| 12 | * scheduling. This implementation has also been optimized for all current and |
| 13 | * several future generations of Intel CPUs. |
| 14 | * |
| 15 | * See this article for more information about the implementation details: |
| 16 | * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ |
| 17 | * |
| 18 | * Copyright (C) 2010, Intel Corp. |
| 19 | * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> |
| 20 | * Ronen Zohar <ronen.zohar@intel.com> |
| 21 | * |
| 22 | * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: |
| 23 | * Author: Mathias Krause <minipli@googlemail.com> |
| 24 | * |
| 25 | * This program is free software; you can redistribute it and/or modify |
| 26 | * it under the terms of the GNU General Public License as published by |
| 27 | * the Free Software Foundation; either version 2 of the License, or |
| 28 | * (at your option) any later version. |
| 29 | */ |
| 30 | |
Jussi Kivilinna | ac9d55d | 2013-01-19 13:39:41 +0200 | [diff] [blame] | 31 | #include <linux/linkage.h> |
| 32 | |
Mathias Krause | 66be895 | 2011-08-04 20:19:25 +0200 | [diff] [blame] | 33 | #define CTX %rdi // arg1 |
| 34 | #define BUF %rsi // arg2 |
| 35 | #define CNT %rdx // arg3 |
| 36 | |
| 37 | #define REG_A %ecx |
| 38 | #define REG_B %esi |
| 39 | #define REG_C %edi |
| 40 | #define REG_D %ebp |
| 41 | #define REG_E %edx |
| 42 | |
| 43 | #define REG_T1 %eax |
| 44 | #define REG_T2 %ebx |
| 45 | |
| 46 | #define K_BASE %r8 |
| 47 | #define HASH_PTR %r9 |
| 48 | #define BUFFER_PTR %r10 |
| 49 | #define BUFFER_END %r11 |
| 50 | |
| 51 | #define W_TMP1 %xmm0 |
| 52 | #define W_TMP2 %xmm9 |
| 53 | |
| 54 | #define W0 %xmm1 |
| 55 | #define W4 %xmm2 |
| 56 | #define W8 %xmm3 |
| 57 | #define W12 %xmm4 |
| 58 | #define W16 %xmm5 |
| 59 | #define W20 %xmm6 |
| 60 | #define W24 %xmm7 |
| 61 | #define W28 %xmm8 |
| 62 | |
| 63 | #define XMM_SHUFB_BSWAP %xmm10 |
| 64 | |
| 65 | /* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ |
| 66 | #define WK(t) (((t) & 15) * 4)(%rsp) |
| 67 | #define W_PRECALC_AHEAD 16 |
| 68 | |
| 69 | /* |
| 70 | * This macro implements the SHA-1 function's body for single 64-byte block |
| 71 | * param: function's name |
| 72 | */ |
| 73 | .macro SHA1_VECTOR_ASM name |
Jussi Kivilinna | ac9d55d | 2013-01-19 13:39:41 +0200 | [diff] [blame] | 74 | ENTRY(\name) |
| 75 | |
Mathias Krause | 66be895 | 2011-08-04 20:19:25 +0200 | [diff] [blame] | 76 | push %rbx |
| 77 | push %rbp |
| 78 | push %r12 |
| 79 | |
| 80 | mov %rsp, %r12 |
| 81 | sub $64, %rsp # allocate workspace |
| 82 | and $~15, %rsp # align stack |
| 83 | |
| 84 | mov CTX, HASH_PTR |
| 85 | mov BUF, BUFFER_PTR |
| 86 | |
| 87 | shl $6, CNT # multiply by 64 |
| 88 | add BUF, CNT |
| 89 | mov CNT, BUFFER_END |
| 90 | |
| 91 | lea K_XMM_AR(%rip), K_BASE |
| 92 | xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP |
| 93 | |
| 94 | SHA1_PIPELINED_MAIN_BODY |
| 95 | |
| 96 | # cleanup workspace |
| 97 | mov $8, %ecx |
| 98 | mov %rsp, %rdi |
| 99 | xor %rax, %rax |
| 100 | rep stosq |
| 101 | |
| 102 | mov %r12, %rsp # deallocate workspace |
| 103 | |
| 104 | pop %r12 |
| 105 | pop %rbp |
| 106 | pop %rbx |
| 107 | ret |
| 108 | |
Jussi Kivilinna | ac9d55d | 2013-01-19 13:39:41 +0200 | [diff] [blame] | 109 | ENDPROC(\name) |
Mathias Krause | 66be895 | 2011-08-04 20:19:25 +0200 | [diff] [blame] | 110 | .endm |
| 111 | |
| 112 | /* |
| 113 | * This macro implements 80 rounds of SHA-1 for one 64-byte block |
| 114 | */ |
| 115 | .macro SHA1_PIPELINED_MAIN_BODY |
| 116 | INIT_REGALLOC |
| 117 | |
| 118 | mov (HASH_PTR), A |
| 119 | mov 4(HASH_PTR), B |
| 120 | mov 8(HASH_PTR), C |
| 121 | mov 12(HASH_PTR), D |
| 122 | mov 16(HASH_PTR), E |
| 123 | |
| 124 | .set i, 0 |
| 125 | .rept W_PRECALC_AHEAD |
| 126 | W_PRECALC i |
| 127 | .set i, (i+1) |
| 128 | .endr |
| 129 | |
| 130 | .align 4 |
| 131 | 1: |
| 132 | RR F1,A,B,C,D,E,0 |
| 133 | RR F1,D,E,A,B,C,2 |
| 134 | RR F1,B,C,D,E,A,4 |
| 135 | RR F1,E,A,B,C,D,6 |
| 136 | RR F1,C,D,E,A,B,8 |
| 137 | |
| 138 | RR F1,A,B,C,D,E,10 |
| 139 | RR F1,D,E,A,B,C,12 |
| 140 | RR F1,B,C,D,E,A,14 |
| 141 | RR F1,E,A,B,C,D,16 |
| 142 | RR F1,C,D,E,A,B,18 |
| 143 | |
| 144 | RR F2,A,B,C,D,E,20 |
| 145 | RR F2,D,E,A,B,C,22 |
| 146 | RR F2,B,C,D,E,A,24 |
| 147 | RR F2,E,A,B,C,D,26 |
| 148 | RR F2,C,D,E,A,B,28 |
| 149 | |
| 150 | RR F2,A,B,C,D,E,30 |
| 151 | RR F2,D,E,A,B,C,32 |
| 152 | RR F2,B,C,D,E,A,34 |
| 153 | RR F2,E,A,B,C,D,36 |
| 154 | RR F2,C,D,E,A,B,38 |
| 155 | |
| 156 | RR F3,A,B,C,D,E,40 |
| 157 | RR F3,D,E,A,B,C,42 |
| 158 | RR F3,B,C,D,E,A,44 |
| 159 | RR F3,E,A,B,C,D,46 |
| 160 | RR F3,C,D,E,A,B,48 |
| 161 | |
| 162 | RR F3,A,B,C,D,E,50 |
| 163 | RR F3,D,E,A,B,C,52 |
| 164 | RR F3,B,C,D,E,A,54 |
| 165 | RR F3,E,A,B,C,D,56 |
| 166 | RR F3,C,D,E,A,B,58 |
| 167 | |
| 168 | add $64, BUFFER_PTR # move to the next 64-byte block |
| 169 | cmp BUFFER_END, BUFFER_PTR # if the current is the last one use |
| 170 | cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun |
| 171 | |
| 172 | RR F4,A,B,C,D,E,60 |
| 173 | RR F4,D,E,A,B,C,62 |
| 174 | RR F4,B,C,D,E,A,64 |
| 175 | RR F4,E,A,B,C,D,66 |
| 176 | RR F4,C,D,E,A,B,68 |
| 177 | |
| 178 | RR F4,A,B,C,D,E,70 |
| 179 | RR F4,D,E,A,B,C,72 |
| 180 | RR F4,B,C,D,E,A,74 |
| 181 | RR F4,E,A,B,C,D,76 |
| 182 | RR F4,C,D,E,A,B,78 |
| 183 | |
| 184 | UPDATE_HASH (HASH_PTR), A |
| 185 | UPDATE_HASH 4(HASH_PTR), B |
| 186 | UPDATE_HASH 8(HASH_PTR), C |
| 187 | UPDATE_HASH 12(HASH_PTR), D |
| 188 | UPDATE_HASH 16(HASH_PTR), E |
| 189 | |
| 190 | RESTORE_RENAMED_REGS |
| 191 | cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end |
| 192 | jne 1b |
| 193 | .endm |
| 194 | |
| 195 | .macro INIT_REGALLOC |
| 196 | .set A, REG_A |
| 197 | .set B, REG_B |
| 198 | .set C, REG_C |
| 199 | .set D, REG_D |
| 200 | .set E, REG_E |
| 201 | .set T1, REG_T1 |
| 202 | .set T2, REG_T2 |
| 203 | .endm |
| 204 | |
| 205 | .macro RESTORE_RENAMED_REGS |
| 206 | # order is important (REG_C is where it should be) |
| 207 | mov B, REG_B |
| 208 | mov D, REG_D |
| 209 | mov A, REG_A |
| 210 | mov E, REG_E |
| 211 | .endm |
| 212 | |
| 213 | .macro SWAP_REG_NAMES a, b |
| 214 | .set _T, \a |
| 215 | .set \a, \b |
| 216 | .set \b, _T |
| 217 | .endm |
| 218 | |
| 219 | .macro F1 b, c, d |
| 220 | mov \c, T1 |
| 221 | SWAP_REG_NAMES \c, T1 |
| 222 | xor \d, T1 |
| 223 | and \b, T1 |
| 224 | xor \d, T1 |
| 225 | .endm |
| 226 | |
| 227 | .macro F2 b, c, d |
| 228 | mov \d, T1 |
| 229 | SWAP_REG_NAMES \d, T1 |
| 230 | xor \c, T1 |
| 231 | xor \b, T1 |
| 232 | .endm |
| 233 | |
| 234 | .macro F3 b, c ,d |
| 235 | mov \c, T1 |
| 236 | SWAP_REG_NAMES \c, T1 |
| 237 | mov \b, T2 |
| 238 | or \b, T1 |
| 239 | and \c, T2 |
| 240 | and \d, T1 |
| 241 | or T2, T1 |
| 242 | .endm |
| 243 | |
| 244 | .macro F4 b, c, d |
| 245 | F2 \b, \c, \d |
| 246 | .endm |
| 247 | |
| 248 | .macro UPDATE_HASH hash, val |
| 249 | add \hash, \val |
| 250 | mov \val, \hash |
| 251 | .endm |
| 252 | |
| 253 | /* |
| 254 | * RR does two rounds of SHA-1 back to back with W[] pre-calc |
| 255 | * t1 = F(b, c, d); e += w(i) |
| 256 | * e += t1; b <<= 30; d += w(i+1); |
| 257 | * t1 = F(a, b, c); |
| 258 | * d += t1; a <<= 5; |
| 259 | * e += a; |
| 260 | * t1 = e; a >>= 7; |
| 261 | * t1 <<= 5; |
| 262 | * d += t1; |
| 263 | */ |
| 264 | .macro RR F, a, b, c, d, e, round |
| 265 | add WK(\round), \e |
| 266 | \F \b, \c, \d # t1 = F(b, c, d); |
| 267 | W_PRECALC (\round + W_PRECALC_AHEAD) |
| 268 | rol $30, \b |
| 269 | add T1, \e |
| 270 | add WK(\round + 1), \d |
| 271 | |
| 272 | \F \a, \b, \c |
| 273 | W_PRECALC (\round + W_PRECALC_AHEAD + 1) |
| 274 | rol $5, \a |
| 275 | add \a, \e |
| 276 | add T1, \d |
| 277 | ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) |
| 278 | |
| 279 | mov \e, T1 |
| 280 | SWAP_REG_NAMES \e, T1 |
| 281 | |
| 282 | rol $5, T1 |
| 283 | add T1, \d |
| 284 | |
| 285 | # write: \a, \b |
| 286 | # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c |
| 287 | .endm |
| 288 | |
| 289 | .macro W_PRECALC r |
| 290 | .set i, \r |
| 291 | |
| 292 | .if (i < 20) |
| 293 | .set K_XMM, 0 |
| 294 | .elseif (i < 40) |
| 295 | .set K_XMM, 16 |
| 296 | .elseif (i < 60) |
| 297 | .set K_XMM, 32 |
| 298 | .elseif (i < 80) |
| 299 | .set K_XMM, 48 |
| 300 | .endif |
| 301 | |
| 302 | .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) |
| 303 | .set i, ((\r) % 80) # pre-compute for the next iteration |
| 304 | .if (i == 0) |
| 305 | W_PRECALC_RESET |
| 306 | .endif |
| 307 | W_PRECALC_00_15 |
| 308 | .elseif (i<32) |
| 309 | W_PRECALC_16_31 |
| 310 | .elseif (i < 80) // rounds 32-79 |
| 311 | W_PRECALC_32_79 |
| 312 | .endif |
| 313 | .endm |
| 314 | |
| 315 | .macro W_PRECALC_RESET |
| 316 | .set W, W0 |
| 317 | .set W_minus_04, W4 |
| 318 | .set W_minus_08, W8 |
| 319 | .set W_minus_12, W12 |
| 320 | .set W_minus_16, W16 |
| 321 | .set W_minus_20, W20 |
| 322 | .set W_minus_24, W24 |
| 323 | .set W_minus_28, W28 |
| 324 | .set W_minus_32, W |
| 325 | .endm |
| 326 | |
| 327 | .macro W_PRECALC_ROTATE |
| 328 | .set W_minus_32, W_minus_28 |
| 329 | .set W_minus_28, W_minus_24 |
| 330 | .set W_minus_24, W_minus_20 |
| 331 | .set W_minus_20, W_minus_16 |
| 332 | .set W_minus_16, W_minus_12 |
| 333 | .set W_minus_12, W_minus_08 |
| 334 | .set W_minus_08, W_minus_04 |
| 335 | .set W_minus_04, W |
| 336 | .set W, W_minus_32 |
| 337 | .endm |
| 338 | |
| 339 | .macro W_PRECALC_SSSE3 |
| 340 | |
| 341 | .macro W_PRECALC_00_15 |
| 342 | W_PRECALC_00_15_SSSE3 |
| 343 | .endm |
| 344 | .macro W_PRECALC_16_31 |
| 345 | W_PRECALC_16_31_SSSE3 |
| 346 | .endm |
| 347 | .macro W_PRECALC_32_79 |
| 348 | W_PRECALC_32_79_SSSE3 |
| 349 | .endm |
| 350 | |
| 351 | /* message scheduling pre-compute for rounds 0-15 */ |
| 352 | .macro W_PRECALC_00_15_SSSE3 |
| 353 | .if ((i & 3) == 0) |
| 354 | movdqu (i*4)(BUFFER_PTR), W_TMP1 |
| 355 | .elseif ((i & 3) == 1) |
| 356 | pshufb XMM_SHUFB_BSWAP, W_TMP1 |
| 357 | movdqa W_TMP1, W |
| 358 | .elseif ((i & 3) == 2) |
| 359 | paddd (K_BASE), W_TMP1 |
| 360 | .elseif ((i & 3) == 3) |
| 361 | movdqa W_TMP1, WK(i&~3) |
| 362 | W_PRECALC_ROTATE |
| 363 | .endif |
| 364 | .endm |
| 365 | |
| 366 | /* message scheduling pre-compute for rounds 16-31 |
| 367 | * |
| 368 | * - calculating last 32 w[i] values in 8 XMM registers |
| 369 | * - pre-calculate K+w[i] values and store to mem, for later load by ALU add |
| 370 | * instruction |
| 371 | * |
| 372 | * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] |
| 373 | * dependency, but improves for 32-79 |
| 374 | */ |
| 375 | .macro W_PRECALC_16_31_SSSE3 |
| 376 | # blended scheduling of vector and scalar instruction streams, one 4-wide |
| 377 | # vector iteration / 4 scalar rounds |
| 378 | .if ((i & 3) == 0) |
| 379 | movdqa W_minus_12, W |
| 380 | palignr $8, W_minus_16, W # w[i-14] |
| 381 | movdqa W_minus_04, W_TMP1 |
| 382 | psrldq $4, W_TMP1 # w[i-3] |
| 383 | pxor W_minus_08, W |
| 384 | .elseif ((i & 3) == 1) |
| 385 | pxor W_minus_16, W_TMP1 |
| 386 | pxor W_TMP1, W |
| 387 | movdqa W, W_TMP2 |
| 388 | movdqa W, W_TMP1 |
| 389 | pslldq $12, W_TMP2 |
| 390 | .elseif ((i & 3) == 2) |
| 391 | psrld $31, W |
| 392 | pslld $1, W_TMP1 |
| 393 | por W, W_TMP1 |
| 394 | movdqa W_TMP2, W |
| 395 | psrld $30, W_TMP2 |
| 396 | pslld $2, W |
| 397 | .elseif ((i & 3) == 3) |
| 398 | pxor W, W_TMP1 |
| 399 | pxor W_TMP2, W_TMP1 |
| 400 | movdqa W_TMP1, W |
| 401 | paddd K_XMM(K_BASE), W_TMP1 |
| 402 | movdqa W_TMP1, WK(i&~3) |
| 403 | W_PRECALC_ROTATE |
| 404 | .endif |
| 405 | .endm |
| 406 | |
| 407 | /* message scheduling pre-compute for rounds 32-79 |
| 408 | * |
| 409 | * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 |
| 410 | * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 |
| 411 | * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken |
| 412 | */ |
| 413 | .macro W_PRECALC_32_79_SSSE3 |
| 414 | .if ((i & 3) == 0) |
| 415 | movdqa W_minus_04, W_TMP1 |
| 416 | pxor W_minus_28, W # W is W_minus_32 before xor |
| 417 | palignr $8, W_minus_08, W_TMP1 |
| 418 | .elseif ((i & 3) == 1) |
| 419 | pxor W_minus_16, W |
| 420 | pxor W_TMP1, W |
| 421 | movdqa W, W_TMP1 |
| 422 | .elseif ((i & 3) == 2) |
| 423 | psrld $30, W |
| 424 | pslld $2, W_TMP1 |
| 425 | por W, W_TMP1 |
| 426 | .elseif ((i & 3) == 3) |
| 427 | movdqa W_TMP1, W |
| 428 | paddd K_XMM(K_BASE), W_TMP1 |
| 429 | movdqa W_TMP1, WK(i&~3) |
| 430 | W_PRECALC_ROTATE |
| 431 | .endif |
| 432 | .endm |
| 433 | |
| 434 | .endm // W_PRECALC_SSSE3 |
| 435 | |
| 436 | |
| 437 | #define K1 0x5a827999 |
| 438 | #define K2 0x6ed9eba1 |
| 439 | #define K3 0x8f1bbcdc |
| 440 | #define K4 0xca62c1d6 |
| 441 | |
| 442 | .section .rodata |
| 443 | .align 16 |
| 444 | |
| 445 | K_XMM_AR: |
| 446 | .long K1, K1, K1, K1 |
| 447 | .long K2, K2, K2, K2 |
| 448 | .long K3, K3, K3, K3 |
| 449 | .long K4, K4, K4, K4 |
| 450 | |
| 451 | BSWAP_SHUFB_CTL: |
| 452 | .long 0x00010203 |
| 453 | .long 0x04050607 |
| 454 | .long 0x08090a0b |
| 455 | .long 0x0c0d0e0f |
| 456 | |
| 457 | |
| 458 | .section .text |
| 459 | |
| 460 | W_PRECALC_SSSE3 |
| 461 | .macro xmm_mov a, b |
| 462 | movdqu \a,\b |
| 463 | .endm |
| 464 | |
| 465 | /* SSSE3 optimized implementation: |
| 466 | * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, |
| 467 | * unsigned int rounds); |
| 468 | */ |
| 469 | SHA1_VECTOR_ASM sha1_transform_ssse3 |
| 470 | |
Mathias Krause | 65df577 | 2012-05-24 11:13:42 +0200 | [diff] [blame] | 471 | #ifdef CONFIG_AS_AVX |
Mathias Krause | 66be895 | 2011-08-04 20:19:25 +0200 | [diff] [blame] | 472 | |
| 473 | .macro W_PRECALC_AVX |
| 474 | |
| 475 | .purgem W_PRECALC_00_15 |
| 476 | .macro W_PRECALC_00_15 |
| 477 | W_PRECALC_00_15_AVX |
| 478 | .endm |
| 479 | .purgem W_PRECALC_16_31 |
| 480 | .macro W_PRECALC_16_31 |
| 481 | W_PRECALC_16_31_AVX |
| 482 | .endm |
| 483 | .purgem W_PRECALC_32_79 |
| 484 | .macro W_PRECALC_32_79 |
| 485 | W_PRECALC_32_79_AVX |
| 486 | .endm |
| 487 | |
| 488 | .macro W_PRECALC_00_15_AVX |
| 489 | .if ((i & 3) == 0) |
| 490 | vmovdqu (i*4)(BUFFER_PTR), W_TMP1 |
| 491 | .elseif ((i & 3) == 1) |
| 492 | vpshufb XMM_SHUFB_BSWAP, W_TMP1, W |
| 493 | .elseif ((i & 3) == 2) |
| 494 | vpaddd (K_BASE), W, W_TMP1 |
| 495 | .elseif ((i & 3) == 3) |
| 496 | vmovdqa W_TMP1, WK(i&~3) |
| 497 | W_PRECALC_ROTATE |
| 498 | .endif |
| 499 | .endm |
| 500 | |
| 501 | .macro W_PRECALC_16_31_AVX |
| 502 | .if ((i & 3) == 0) |
| 503 | vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] |
| 504 | vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] |
| 505 | vpxor W_minus_08, W, W |
| 506 | vpxor W_minus_16, W_TMP1, W_TMP1 |
| 507 | .elseif ((i & 3) == 1) |
| 508 | vpxor W_TMP1, W, W |
| 509 | vpslldq $12, W, W_TMP2 |
| 510 | vpslld $1, W, W_TMP1 |
| 511 | .elseif ((i & 3) == 2) |
| 512 | vpsrld $31, W, W |
| 513 | vpor W, W_TMP1, W_TMP1 |
| 514 | vpslld $2, W_TMP2, W |
| 515 | vpsrld $30, W_TMP2, W_TMP2 |
| 516 | .elseif ((i & 3) == 3) |
| 517 | vpxor W, W_TMP1, W_TMP1 |
| 518 | vpxor W_TMP2, W_TMP1, W |
| 519 | vpaddd K_XMM(K_BASE), W, W_TMP1 |
| 520 | vmovdqu W_TMP1, WK(i&~3) |
| 521 | W_PRECALC_ROTATE |
| 522 | .endif |
| 523 | .endm |
| 524 | |
| 525 | .macro W_PRECALC_32_79_AVX |
| 526 | .if ((i & 3) == 0) |
| 527 | vpalignr $8, W_minus_08, W_minus_04, W_TMP1 |
| 528 | vpxor W_minus_28, W, W # W is W_minus_32 before xor |
| 529 | .elseif ((i & 3) == 1) |
| 530 | vpxor W_minus_16, W_TMP1, W_TMP1 |
| 531 | vpxor W_TMP1, W, W |
| 532 | .elseif ((i & 3) == 2) |
| 533 | vpslld $2, W, W_TMP1 |
| 534 | vpsrld $30, W, W |
| 535 | vpor W, W_TMP1, W |
| 536 | .elseif ((i & 3) == 3) |
| 537 | vpaddd K_XMM(K_BASE), W, W_TMP1 |
| 538 | vmovdqu W_TMP1, WK(i&~3) |
| 539 | W_PRECALC_ROTATE |
| 540 | .endif |
| 541 | .endm |
| 542 | |
| 543 | .endm // W_PRECALC_AVX |
| 544 | |
| 545 | W_PRECALC_AVX |
| 546 | .purgem xmm_mov |
| 547 | .macro xmm_mov a, b |
| 548 | vmovdqu \a,\b |
| 549 | .endm |
| 550 | |
| 551 | |
| 552 | /* AVX optimized implementation: |
| 553 | * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, |
| 554 | * unsigned int rounds); |
| 555 | */ |
| 556 | SHA1_VECTOR_ASM sha1_transform_avx |
| 557 | |
| 558 | #endif |