/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

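# ROT8/ROT16 are vpshufb byte-permutation masks rotating each 32-bit word
# left by 8 and 16 bits (byte shuffling performs slightly better than the
# shift+OR sequence used for the 7- and 12-bit rotations below).  CTRINC
# holds the per-block counter offsets 0..7 added to the broadcast block
# counter, so each of the eight lanes produces a consecutive block.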
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
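	#
	# Assumed C prototype on the glue-code side (illustrative only, not
	# defined in this file):
	#
	#	void chacha20_8block_xor_avx2(u32 *state, u8 *dst,
	#				      const u8 *src, unsigned int len);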

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix into AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XORing
	# step we transpose the matrix by interleaving 32-, 64- and then 128-bit
	# words, which allows us to do XOR in AVX registers. 8/16-bit word
	# rotation is done with the slightly better performing byte shuffling;
	# 7/12-bit word rotation uses the traditional shift+OR.
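	#
	# For reference only (not assembled): one ChaCha quarter-round on
	# state words (a, b, c, d), with rotl32() the usual 32-bit left
	# rotation:
	#
	#	a += b; d ^= a; d = rotl32(d, 16);
	#	c += d; b ^= c; b = rotl32(b, 12);
	#	a += b; d ^= a; d = rotl32(d,  8);
	#	c += d; b ^= c; b = rotl32(b,  7);
	#
	# A double round applies this to the four columns (0,4,8,12) ...
	# (3,7,11,15) and then to the four diagonals (0,5,10,15) ...
	# (3,4,9,14); .Ldoubleround8 below runs ten double rounds (20 rounds
	# total), with each %ymm register holding one state word for all
	# eight blocks.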

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea		8(%rsp),%r10
	and		$~31, %rsp
	sub		$0x80, %rsp
	mov		%rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa		%ymm0,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm3,0x60(%rsp)

	vmovdqa		CTRINC(%rip),%ymm1
	vmovdqa		ROT8(%rip),%ymm2
	vmovdqa		ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

	mov		$10,%ecx

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	dec		%ecx
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd		0x20(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd		0x40(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd		0x60(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd		%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd		%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd		%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd		%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd		%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd		%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd		%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd		%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd		%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm1,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa		%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa		%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa		%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa		%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa		0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp		$0x0020,%rax
	jl		.Lxorpart8
	vpxor		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rax
	jl		.Lxorpart8
	vpxor		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa		0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp		$0x0060,%rax
	jl		.Lxorpart8
	vpxor		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rax
	jl		.Lxorpart8
	vpxor		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa		0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rax
	jl		.Lxorpart8
	vpxor		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rax
	jl		.Lxorpart8
	vpxor		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa		0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp		$0x00e0,%rax
	jl		.Lxorpart8
	vpxor		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rax
	jl		.Lxorpart8
	vpxor		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa		%ymm4,%ymm0
	cmp		$0x0120,%rax
	jl		.Lxorpart8
	vpxor		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0100(%rsi)

	vmovdqa		%ymm12,%ymm0
	cmp		$0x0140,%rax
	jl		.Lxorpart8
	vpxor		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0120(%rsi)

	vmovdqa		%ymm6,%ymm0
	cmp		$0x0160,%rax
	jl		.Lxorpart8
	vpxor		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0140(%rsi)

	vmovdqa		%ymm14,%ymm0
	cmp		$0x0180,%rax
	jl		.Lxorpart8
	vpxor		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0160(%rsi)

	vmovdqa		%ymm5,%ymm0
	cmp		$0x01a0,%rax
	jl		.Lxorpart8
	vpxor		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0180(%rsi)

	vmovdqa		%ymm13,%ymm0
	cmp		$0x01c0,%rax
	jl		.Lxorpart8
	vpxor		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01a0(%rsi)

	vmovdqa		%ymm7,%ymm0
	cmp		$0x01e0,%rax
	jl		.Lxorpart8
	vpxor		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01c0(%rsi)

	vmovdqa		%ymm15,%ymm0
	cmp		$0x0200,%rax
	jl		.Lxorpart8
	vpxor		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea		-8(%r10),%rsp
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
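	#
	# %ymm0 already holds the keystream chunk that the length check
	# bailed out on.  Strategy: stage the 1..31 trailing input bytes in
	# the 32-byte stack scratch area, XOR the full register against it,
	# then copy only the trailing bytes back to the output.  The vector
	# XOR stays full-width, yet no input is read and no output is
	# written beyond the requested length.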
	mov		%rax,%r9
	and		$0x1f,%r9
	jz		.Ldone8
	and		$~0x1f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone8

ENDPROC(chacha20_8block_xor_avx2)