/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

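# The constants below drive the vectorized rotations and the per-block
# counters:
#
#  - ROT8/ROT16 are vpshufb control masks.  Within every 32-bit lane they
#    remap the bytes (b0,b1,b2,b3) to (b3,b0,b1,b2) and (b2,b3,b0,b1)
#    respectively, which is exactly rotl32 by 8 and by 16.
#  - CTRINC holds the dword values 0..7 that get added to the lanes of
#    x12, giving each of the eight blocks its own counter.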
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32-, 64- and then 128-bit
	# words, which allows us to do the XOR in AVX registers. 8/16-bit word
	# rotation is done with the slightly better performing byte shuffling;
	# 7/12-bit word rotation uses the traditional shift+OR.

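	# For reference, the ChaCha20 quarter-round from RFC 7539 that the
	# vector code below implements (C-like pseudocode on 32-bit words):
	#
	#	a += b; d ^= a; d = rotl32(d, 16);
	#	c += d; b ^= c; b = rotl32(b, 12);
	#	a += b; d ^= a; d = rotl32(d, 8);
	#	c += d; b ^= c; b = rotl32(b, 7);
	#
	# Each %ymm register (or its 32-byte stack slot for x0..x3) holds the
	# same state word of all eight blocks, so one vector instruction
	# advances all eight blocks at once.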
	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea	8(%rsp),%r10
	and	$~31, %rsp
	sub	$0x80, %rsp
	mov	%rcx,%rax
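	# Frame/register notes: %r10 preserves the incoming stack pointer
	# (plus 8) so the epilogue can restore it after the 32-byte alignment
	# above, the 0x80-byte frame holds the spilled state words x0..x3,
	# and %rax keeps the requested byte length for the partial-block
	# checks near the end.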

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa	%ymm0,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm3,0x60(%rsp)

	vmovdqa	CTRINC(%rip),%ymm1
	vmovdqa	ROT8(%rip),%ymm2
	vmovdqa	ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

	mov	$10,%ecx

.Ldoubleround8:
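	# %ecx counts 10 iterations of this loop; each iteration is one
	# "double round" (a column round followed by a diagonal round),
	# giving the 20 rounds of ChaCha20.  The first half below runs the
	# four column quarter-rounds (x0,x4,x8,x12), (x1,x5,x9,x13),
	# (x2,x6,x10,x14) and (x3,x7,x11,x15).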
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15

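	# No byte shuffle can express a 12-bit (or 7-bit) rotate, so these
	# use the plain identity rotl32(x, n) = (x << n) | (x >> (32 - n)),
	# i.e. vpslld + vpsrld + vpor with %ymm0 as scratch.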
	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

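	# Second half of the double round: the four diagonal quarter-rounds
	# (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13) and
	# (x3,x4,x9,x14).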
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	dec	%ecx
	jnz	.Ldoubleround8

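	# Feed-forward: add the original input state to the working state to
	# produce the keystream (RFC 7539).  x0..x3 live on the stack, so
	# their sums are written back there; x4..x15 stay in registers.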
	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd	0x20(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd	0x40(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd	0x60(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd	%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd	%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd	%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd	%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd	%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd	%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd	%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd	%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd	%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

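	# Note: the feed-forward above added only the broadcast s[12], so the
	# per-block counter offsets 0..7 (still in %ymm1) are re-applied to
	# x12 above.
	#
	# The state is now "word-sliced": each register/stack slot holds one
	# state word for all eight blocks.  The three interleave passes below
	# (32-, 64- and 128-bit) transpose this layout so that each 32-byte
	# vector written out is a contiguous piece of one block's keystream,
	# ready to be XORed against the input.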
	# interleave 32-bit words in state n, n+1
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm1,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa	%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa	%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa	%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa	%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
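	# Each vperm2i128 $0x20 combines the low 128-bit halves of its two
	# sources (belonging to the first four blocks); $0x31 combines the
	# high halves (the last four blocks).  Before each 32-byte store,
	# %rax is checked against the chunk's end offset; if the requested
	# length ends inside the chunk, the partial bytes are handled at
	# .Lxorpart8.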
	vmovdqa	0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp	$0x0020,%rax
	jl	.Lxorpart8
	vpxor	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rax
	jl	.Lxorpart8
	vpxor	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa	0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp	$0x0060,%rax
	jl	.Lxorpart8
	vpxor	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rax
	jl	.Lxorpart8
	vpxor	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa	0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rax
	jl	.Lxorpart8
	vpxor	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rax
	jl	.Lxorpart8
	vpxor	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa	0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp	$0x00e0,%rax
	jl	.Lxorpart8
	vpxor	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rax
	jl	.Lxorpart8
	vpxor	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa	%ymm4,%ymm0
	cmp	$0x0120,%rax
	jl	.Lxorpart8
	vpxor	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0100(%rsi)

	vmovdqa	%ymm12,%ymm0
	cmp	$0x0140,%rax
	jl	.Lxorpart8
	vpxor	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0120(%rsi)

	vmovdqa	%ymm6,%ymm0
	cmp	$0x0160,%rax
	jl	.Lxorpart8
	vpxor	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0140(%rsi)

	vmovdqa	%ymm14,%ymm0
	cmp	$0x0180,%rax
	jl	.Lxorpart8
	vpxor	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0160(%rsi)

	vmovdqa	%ymm5,%ymm0
	cmp	$0x01a0,%rax
	jl	.Lxorpart8
	vpxor	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0180(%rsi)

	vmovdqa	%ymm13,%ymm0
	cmp	$0x01c0,%rax
	jl	.Lxorpart8
	vpxor	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01a0(%rsi)

	vmovdqa	%ymm7,%ymm0
	cmp	$0x01e0,%rax
	jl	.Lxorpart8
	vpxor	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01c0(%rsi)

	vmovdqa	%ymm15,%ymm0
	cmp	$0x0200,%rax
	jl	.Lxorpart8
	vpxor	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea	-8(%r10),%rsp
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
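	# %ymm0 still holds the 32 bytes of keystream for the chunk whose
	# length check failed.  Round %rax down to a 32-byte boundary, copy
	# the remaining %r9 (< 32) input bytes into the stack scratch area
	# with rep movsb, XOR them with the keystream there, and copy the
	# result back to the output.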
	mov	%rax,%r9
	and	$0x1f,%r9
	jz	.Ldone8
	and	$~0x1f,%rax

	mov	%rsi,%r11

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	vpxor	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	jmp	.Ldone8

ENDPROC(chacha20_8block_xor_avx2)