blob: 3d09e3aca18dad33ba0c27c3673e49dcbfac0ce8 [file] [log] [blame]
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060034#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000035#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110036
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values, for FP
 * use movaps (move aligned packed single) or integer use movdqa (move double
 * quad aligned).  It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released.  However, the movaps
 * is a byte shorter, so that is the one we'll use for now. (same for
 * unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
47
#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
# mask used by the XTS tweak computation (multiply by x in GF(2^128),
# little-endian block convention)
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
# bit-reflected GHASH reduction polynomial, used by GHASH_MUL
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

# PSHUFB mask that byte-reverses a 16-byte value (big<->little endian)
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
# NOTE(review): this literal has only 31 hex digits (15.5 bytes), so the
# assembler will zero-pad the high nibble -- looks like a transcription
# artifact; verify the value against the upstream kernel source.
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

# table of 17 PSHUFB masks, 16 bytes each (.size = 272 = 17*16), indexed
# by (remaining AAD length * 16); used by INITIAL_BLOCKS_{ENC,DEC} to
# left-align the partial AAD block assembled from 4B/8B tail reads
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text


# byte offsets of the GCM per-request scratch area on the stack:
# powers of the hash key (HashKey^i << 1 mod poly) and their Karatsuba
# half-XORs, addressed relative to %rsp
#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

# GCM argument aliases (SysV AMD64 ABI): args 1-6 in registers, args
# 7-10 on the caller's stack, addressed relative to %r14 (which the GCM
# entry points use as a saved stack-frame pointer)
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
# round-key schedule size: 15 round keys each for enc and dec, 16B each
#define keysize 2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400151
152
# Register aliases for the plain (non-GCM) AES routines.  Note that some
# aliases deliberately share a register (STATE=STATE1, IN=IN1, UKEYP=OUTP,
# TKEYP=T1, TCTR_LOW=T2): the sharing pairs are never live at the same time.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

# shares %xmm10 with BSWAP_MASK (CTR mode and XTS are never mixed)
#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
# 32-bit build: only 8 GPRs, so OUTP aliases AREG and pointers are 32-bit
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100197
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400198
Mathias Krause559ad0f2010-11-29 08:35:39 +0800199#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* The 128x128 carry-less multiply is done Karatsuba-style with three
* PCLMULQDQ instructions; the 256-bit product (TMP1:GH) is then reduced
* modulo the GHASH polynomial in two shift/xor phases.
* Result is left in GH; TMP1-TMP5 are clobbered.
*/
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2	# swap the two qwords of GH
	pshufd	  $78, \HK, \TMP3	# swap the two qwords of HK
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = middle term a1*b0 + a0*b1
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# byte-shift TMP3 left 2 DWs
	psrldq	  $8, \TMP2		# byte-shift TMP2 right 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the 256-bit GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# byte-shift TMP5 right 1 DW
	pslldq	  $12, \TMP2		# byte-shift TMP2 left 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2		# fold in carry saved from first phase
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
259
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* decrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* FIX(review): in the 4-byte AAD tail path the original code had
* "sub $4, %r10", which undid the pointer advance instead of decrementing
* the remaining-length counter %r11 (compare the 8-byte path, which does
* "sub $8, %r11").  Corrected to "sub $4, %r11" below.
*/
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# %r11 = AAD bytes left to read
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2		# \XMM2 = running GHASH of the AAD

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	# consume the AAD a whole 16-byte block at a time
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1		# accumulate tail in high qword ...
	psrldq	   $8, %xmm\i		# ... shifting previous bytes down
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11		# was "sub $4, %r10": decrement length,
					# not the (already advanced) pointer
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11		# %r12 = AAD length mod 16
	salq	   $4, %r11		# *16 = byte index into aad_shift_arr
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index	# round 0 (AddRoundKey)
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	# decrypt: GHASH operates on the *ciphertext*, so restore it
	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)   (comment fixed: was "^3")
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1		# keep ciphertext for GHASH (decrypt)
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
517
518
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* FIX(review): in the 4-byte AAD tail path the original code had
* "sub $4, %r10", which undid the pointer advance instead of decrementing
* the remaining-length counter %r11 (compare the 8-byte path, which does
* "sub $8, %r11").  Corrected to "sub $4, %r11" below.
*/
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# %r11 = AAD bytes left to read
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2		# \XMM2 = running GHASH of the AAD

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	# consume the AAD a whole 16-byte block at a time
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1		# accumulate tail in high qword ...
	psrldq	   $8, %xmm\i		# ... shifting previous bytes down
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11		# was "sub $4, %r10": decrement length,
					# not the (already advanced) pointer
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11		# %r12 = AAD length mod 16
	salq	   $4, %r11		# *16 = byte index into aad_shift_arr
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index	# round 0 (AddRoundKey)
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_enc\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	# encrypt: GHASH operates on the ciphertext just produced
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)   (comment fixed: was "^3")
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
773
774/*
775* encrypt 4 blocks at a time
776* ghash the 4 previously encrypted ciphertext blocks
777* arg1, %arg2, %arg3 are used as pointers only, not modified
778* %r11 is the data offset value
779*/
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800780.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400781TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
782
783 movdqa \XMM1, \XMM5
784 movdqa \XMM2, \XMM6
785 movdqa \XMM3, \XMM7
786 movdqa \XMM4, \XMM8
787
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800788 movdqa SHUF_MASK(%rip), %xmm15
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400789 # multiply TMP5 * HashKey using karatsuba
790
791 movdqa \XMM5, \TMP4
792 pshufd $78, \XMM5, \TMP6
793 pxor \XMM5, \TMP6
794 paddd ONE(%rip), \XMM0 # INCR CNT
795 movdqa HashKey_4(%rsp), \TMP5
796 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
797 movdqa \XMM0, \XMM1
798 paddd ONE(%rip), \XMM0 # INCR CNT
799 movdqa \XMM0, \XMM2
800 paddd ONE(%rip), \XMM0 # INCR CNT
801 movdqa \XMM0, \XMM3
802 paddd ONE(%rip), \XMM0 # INCR CNT
803 movdqa \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800804 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400805 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800806 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
807 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
808 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
809
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400810 pxor (%arg1), \XMM1
811 pxor (%arg1), \XMM2
812 pxor (%arg1), \XMM3
813 pxor (%arg1), \XMM4
814 movdqa HashKey_4_k(%rsp), \TMP5
815 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
816 movaps 0x10(%arg1), \TMP1
817 AESENC \TMP1, \XMM1 # Round 1
818 AESENC \TMP1, \XMM2
819 AESENC \TMP1, \XMM3
820 AESENC \TMP1, \XMM4
821 movaps 0x20(%arg1), \TMP1
822 AESENC \TMP1, \XMM1 # Round 2
823 AESENC \TMP1, \XMM2
824 AESENC \TMP1, \XMM3
825 AESENC \TMP1, \XMM4
826 movdqa \XMM6, \TMP1
827 pshufd $78, \XMM6, \TMP2
828 pxor \XMM6, \TMP2
829 movdqa HashKey_3(%rsp), \TMP5
830 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
831 movaps 0x30(%arg1), \TMP3
832 AESENC \TMP3, \XMM1 # Round 3
833 AESENC \TMP3, \XMM2
834 AESENC \TMP3, \XMM3
835 AESENC \TMP3, \XMM4
836 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
837 movaps 0x40(%arg1), \TMP3
838 AESENC \TMP3, \XMM1 # Round 4
839 AESENC \TMP3, \XMM2
840 AESENC \TMP3, \XMM3
841 AESENC \TMP3, \XMM4
842 movdqa HashKey_3_k(%rsp), \TMP5
843 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
844 movaps 0x50(%arg1), \TMP3
845 AESENC \TMP3, \XMM1 # Round 5
846 AESENC \TMP3, \XMM2
847 AESENC \TMP3, \XMM3
848 AESENC \TMP3, \XMM4
849 pxor \TMP1, \TMP4
850# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
851 pxor \XMM6, \XMM5
852 pxor \TMP2, \TMP6
853 movdqa \XMM7, \TMP1
854 pshufd $78, \XMM7, \TMP2
855 pxor \XMM7, \TMP2
856 movdqa HashKey_2(%rsp ), \TMP5
857
858 # Multiply TMP5 * HashKey using karatsuba
859
860 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
861 movaps 0x60(%arg1), \TMP3
862 AESENC \TMP3, \XMM1 # Round 6
863 AESENC \TMP3, \XMM2
864 AESENC \TMP3, \XMM3
865 AESENC \TMP3, \XMM4
866 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
867 movaps 0x70(%arg1), \TMP3
868 AESENC \TMP3, \XMM1 # Round 7
869 AESENC \TMP3, \XMM2
870 AESENC \TMP3, \XMM3
871 AESENC \TMP3, \XMM4
872 movdqa HashKey_2_k(%rsp), \TMP5
873 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
874 movaps 0x80(%arg1), \TMP3
875 AESENC \TMP3, \XMM1 # Round 8
876 AESENC \TMP3, \XMM2
877 AESENC \TMP3, \XMM3
878 AESENC \TMP3, \XMM4
879 pxor \TMP1, \TMP4
880# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
881 pxor \XMM7, \XMM5
882 pxor \TMP2, \TMP6
883
884 # Multiply XMM8 * HashKey
885 # XMM8 and TMP5 hold the values for the two operands
886
887 movdqa \XMM8, \TMP1
888 pshufd $78, \XMM8, \TMP2
889 pxor \XMM8, \TMP2
890 movdqa HashKey(%rsp), \TMP5
891 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
892 movaps 0x90(%arg1), \TMP3
893 AESENC \TMP3, \XMM1 # Round 9
894 AESENC \TMP3, \XMM2
895 AESENC \TMP3, \XMM3
896 AESENC \TMP3, \XMM4
897 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500898 lea 0xa0(%arg1),%r10
899 mov keysize,%eax
900 shr $2,%eax # 128->4, 192->6, 256->8
901 sub $4,%eax # 128->0, 192->2, 256->4
902 jz aes_loop_par_enc_done
903
904aes_loop_par_enc:
905 MOVADQ (%r10),\TMP3
906.irpc index, 1234
907 AESENC \TMP3, %xmm\index
908.endr
909 add $16,%r10
910 sub $1,%eax
911 jnz aes_loop_par_enc
912
913aes_loop_par_enc_done:
914 MOVADQ (%r10), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400915 AESENCLAST \TMP3, \XMM1 # Round 10
916 AESENCLAST \TMP3, \XMM2
917 AESENCLAST \TMP3, \XMM3
918 AESENCLAST \TMP3, \XMM4
919 movdqa HashKey_k(%rsp), \TMP5
920 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
921 movdqu (%arg3,%r11,1), \TMP3
922 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400923 movdqu 16(%arg3,%r11,1), \TMP3
924 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400925 movdqu 32(%arg3,%r11,1), \TMP3
926 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400927 movdqu 48(%arg3,%r11,1), \TMP3
928 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800929 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
930 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
931 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
932 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
933 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
934 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
935 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
936 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
937
938 pxor \TMP4, \TMP1
939 pxor \XMM8, \XMM5
940 pxor \TMP6, \TMP2
941 pxor \TMP1, \TMP2
942 pxor \XMM5, \TMP2
943 movdqa \TMP2, \TMP3
944 pslldq $8, \TMP3 # left shift TMP3 2 DWs
945 psrldq $8, \TMP2 # right shift TMP2 2 DWs
946 pxor \TMP3, \XMM5
947 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
948
949 # first phase of reduction
950
951 movdqa \XMM5, \TMP2
952 movdqa \XMM5, \TMP3
953 movdqa \XMM5, \TMP4
954# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
955 pslld $31, \TMP2 # packed right shift << 31
956 pslld $30, \TMP3 # packed right shift << 30
957 pslld $25, \TMP4 # packed right shift << 25
958 pxor \TMP3, \TMP2 # xor the shifted versions
959 pxor \TMP4, \TMP2
960 movdqa \TMP2, \TMP5
961 psrldq $4, \TMP5 # right shift T5 1 DW
962 pslldq $12, \TMP2 # left shift T2 3 DWs
963 pxor \TMP2, \XMM5
964
965 # second phase of reduction
966
967 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
968 movdqa \XMM5,\TMP3
969 movdqa \XMM5,\TMP4
970 psrld $1, \TMP2 # packed left shift >>1
971 psrld $2, \TMP3 # packed left shift >>2
972 psrld $7, \TMP4 # packed left shift >>7
973 pxor \TMP3,\TMP2 # xor the shifted versions
974 pxor \TMP4,\TMP2
975 pxor \TMP5, \TMP2
976 pxor \TMP2, \XMM5
977 pxor \TMP1, \XMM5 # result is in TMP1
978
979 pxor \XMM5, \XMM1
980.endm
981
982/*
983* decrypt 4 blocks at a time
984* ghash the 4 previously decrypted ciphertext blocks
985* arg1, %arg2, %arg3 are used as pointers only, not modified
986* %r11 is the data offset value
987*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Decrypt 4 blocks while GHASHing the 4 ciphertext blocks of the
# previous iteration (passed in \XMM1-\XMM4).  Identical counter-mode
# AES pipeline to the _ENC variant; the tail differs: after XORing the
# keystream with the ciphertext, the original ciphertext (not the
# plaintext) is kept in \XMM1-\XMM4 for the next GHASH round.
# In:  \XMM0 = current counter (byte-swapped), \XMM1-\XMM4 = previous
#      ciphertext blocks, %r11 = data offset, %arg1 = key schedule,
#      %arg2 = dst, %arg3 = src; HashKey*/HashKey*_k live on the stack.
# Clobbers: %rax, %r10, %xmm15, flags, and all \TMP*/\XMM5-\XMM8 args.
# NOTE(review): the aes_loop_par_dec* labels are global (no \@), so this
# macro can only be expanded once in the file — confirm before reusing.

	movdqa	  \XMM1, \XMM5		# stash previous ciphertext for GHASH
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6	# swap the two qwords of XMM5
	pxor	  \XMM5, \TMP6		# TMP6 = a1+a0 (Karatsuba middle operand)
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1	# round 0: XOR with first round key
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	# Extra AES rounds for 192/256-bit keys; keysize selects the count.
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		# 128->4, 192->6, 256->8
	sub	  $4,%eax		# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1		# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1		# keep ciphertext block for next GHASH
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2		# keep ciphertext block for next GHASH
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3		# keep ciphertext block for next GHASH
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4		# keep ciphertext block for next GHASH
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	# Combine high/low/middle Karatsuba partial products.
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# reduced result is in XMM5

	pxor	  \XMM5, \XMM1		# fold GHASH state into XMM1
.endm
1193
1194/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# GHASH the final 4 (byte-swapped) ciphertext blocks \XMM1-\XMM4
# against HashKey^4..HashKey^1 (Karatsuba, then one reduction).
# Out: reduced GHASH value in \XMMDst.  Clobbers all \TMP* and
# \XMM1-\XMM4.  Reads HashKey*/HashKey*_k from the stack frame.

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm
1290
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001291
1292/* Encryption of a single block
1293* uses eax & r10
1294*/
1295
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
# AES-encrypt the single block in \XMM0 in place using the expanded key
# schedule at %arg1.  Supports 128/192/256-bit keys via 'keysize'.
# In:  \XMM0 = input block, %arg1 = key schedule pointer
# Out: \XMM0 = encrypted block
# Clobbers: \TMP1, %eax, %r10, flags

	pxor		(%arg1), \XMM0		# round 0: XOR with first round key
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:					# \@ makes the label unique per expansion
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0		# final round
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001314/*****************************************************************************
1315* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1316* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1317* const u8 *in, // Ciphertext input
1318* u64 plaintext_len, // Length of data in bytes for decryption.
1319* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1320* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1321* // concatenated with 0x00000001. 16-byte aligned pointer.
1322* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1323* const u8 *aad, // Additional Authentication Data (AAD)
1324* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1325* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1326* // given authentication tag and only return the plaintext if they match.
1327* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1328* // (most likely), 12 or 8.
1329*
1330* Assumptions:
1331*
1332* keys:
1333* keys are pre-expanded and aligned to 16 bytes. we are using the first
1334* set of 11 keys in the data structure void *aes_ctx
1335*
1336* iv:
1337* 0 1 2 3
1338* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1339* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1340* | Salt (From the SA) |
1341* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1342* | Initialization Vector |
1343* | (This is the sequence number from IPSec header) |
1344* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1345* | 0x1 |
1346* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1347*
1348*
1349*
1350* AAD:
1351* AAD padded to 128 bits with 0
1352* for example, assume AAD is a u32 vector
1353*
1354* if AAD is 8 bytes:
1355* AAD[3] = {A0, A1};
1356* padded AAD in xmm register = {A1 A0 0 0}
1357*
1358* 0 1 2 3
1359* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1360* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1361* | SPI (A1) |
1362* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1363* | 32-bit Sequence Number (A0) |
1364* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1365* | 0x0 |
1366* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1367*
1368* AAD Format with 32-bit Sequence Number
1369*
1370* if AAD is 12 bytes:
1371* AAD[3] = {A0, A1, A2};
1372* padded AAD in xmm register = {A2 A1 A0 0}
1373*
* 0                   1                   2                   3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |                               SPI (A2)                        |
1380* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1381* | 64-bit Extended Sequence Number {A1,A0} |
1382* | |
1383* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1384* | 0x0 |
1385* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1386*
1387* AAD Format with 64-bit Extended Sequence Number
1388*
1389* aadLen:
1390* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1391* The code supports 16 too but for other sizes, the code will fail.
1392*
1393* TLen:
1394* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1395* For other sizes, the code will fail.
1396*
1397* poly = x^128 + x^127 + x^126 + x^121 + 1
1398*
1399*****************************************************************************/
ENTRY(aesni_gcm_dec)
# Register roles: %r11 = running data offset, %r13 = remaining byte
# count, %r12/%r10/%rax = scratch, %r14 = saved %rsp (frame is realigned
# to 64 bytes for the HashKey* spill slots).
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13			# shift whole 128-bit value left by 1...
	psrlq	$63, %xmm2			# ...carrying each qword's top bit
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2		# conditionally XOR in the GCM polynomial
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)		# store HashKey<<1 (mod poly)
	mov	%arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13			# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12			# number of initial blocks = (len/16) mod 4
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	# Main loop: 4 blocks per iteration, AES interleaved with GHASH.
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13				# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0		# increment CNT to get Yn
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1	# E(K, Yn)
	sub $16, %r11
	add %r13, %r11
	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu (%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes

	movdqa  %xmm1, %xmm2
	pxor %xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8		# absorb the (masked) ciphertext into GHASH
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
		  # GHASH computation for the last <16 byte block
	sub %r13, %r11
	add $16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	# byte-at-a-time tail store of the remaining plaintext
	mov	%al,  (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	# Append the GCM length block len(A)||len(C) and finish the tag.
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
		 # final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0		  # tag = GHASH XOR E(K, Y0)
_return_T_decrypt:
	# Copy out auth_tag_len bytes of the tag (16/12/8 expected; any
	# length up to 16 is handled by the 8/4/2/1-byte fallthroughs).
	mov	arg9, %r10		  # %r10 = authTag
	mov	arg10, %r11		  # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$8, %r11
	jl	_T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_decrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp			# restore the pre-alignment stack
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001592
1593
1594/*****************************************************************************
1595* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1596* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1597* const u8 *in, // Plaintext input
1598* u64 plaintext_len, // Length of data in bytes for encryption.
1599* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1600* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1601* // concatenated with 0x00000001. 16-byte aligned pointer.
1602* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1603* const u8 *aad, // Additional Authentication Data (AAD)
1604* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1605* u8 *auth_tag, // Authenticated Tag output.
1606* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1607* // 12 or 8.
1608*
1609* Assumptions:
1610*
1611* keys:
1612* keys are pre-expanded and aligned to 16 bytes. we are using the
1613* first set of 11 keys in the data structure void *aes_ctx
1614*
1615*
1616* iv:
1617* 0 1 2 3
1618* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1619* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1620* | Salt (From the SA) |
1621* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1622* | Initialization Vector |
1623* | (This is the sequence number from IPSec header) |
1624* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1625* | 0x1 |
1626* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1627*
1628*
1629*
1630* AAD:
1631* AAD padded to 128 bits with 0
1632* for example, assume AAD is a u32 vector
1633*
1634* if AAD is 8 bytes:
1635* AAD[3] = {A0, A1};
1636* padded AAD in xmm register = {A1 A0 0 0}
1637*
1638* 0 1 2 3
1639* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1640* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641* | SPI (A1) |
1642* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643* | 32-bit Sequence Number (A0) |
1644* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645* | 0x0 |
1646* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1647*
1648* AAD Format with 32-bit Sequence Number
1649*
1650* if AAD is 12 bytes:
1651* AAD[3] = {A0, A1, A2};
1652* padded AAD in xmm register = {A2 A1 A0 0}
1653*
1654* 0 1 2 3
1655* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1656* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657* | SPI (A2) |
1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659* | 64-bit Extended Sequence Number {A1,A0} |
1660* | |
1661* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1662* | 0x0 |
1663* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1664*
1665* AAD Format with 64-bit Extended Sequence Number
1666*
1667* aadLen:
1668* from the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports an aadLen of 16 bytes; for any other size the code will fail.
1670*
1671* TLen:
1672* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1673* For other sizes, the code will fail.
1674*
1675* poly = x^128 + x^127 + x^126 + x^121 + 1
1676***************************************************************************/
1677ENTRY(aesni_gcm_enc)
1678 push %r12
1679 push %r13
1680 push %r14
1681 mov %rsp, %r14
1682#
1683# states of %xmm registers %xmm6:%xmm15 not saved
1684# all %xmm registers are clobbered
1685#
1686 sub $VARIABLE_OFFSET, %rsp
1687 and $~63, %rsp
1688 mov %arg6, %r12
1689 movdqu (%r12), %xmm13
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001690 movdqa SHUF_MASK(%rip), %xmm2
1691 PSHUFB_XMM %xmm2, %xmm13
1692
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001693
1694# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1695
1696 movdqa %xmm13, %xmm2
1697 psllq $1, %xmm13
1698 psrlq $63, %xmm2
1699 movdqa %xmm2, %xmm1
1700 pslldq $8, %xmm2
1701 psrldq $8, %xmm1
1702 por %xmm2, %xmm13
1703
1704 # reduce HashKey<<1
1705
1706 pshufd $0x24, %xmm1, %xmm2
1707 pcmpeqd TWOONE(%rip), %xmm2
1708 pand POLY(%rip), %xmm2
1709 pxor %xmm2, %xmm13
1710 movdqa %xmm13, HashKey(%rsp)
1711 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
1712 and $-16, %r13
1713 mov %r13, %r12
1714
1715 # Encrypt first few blocks
1716
1717 and $(3<<4), %r12
1718 jz _initial_num_blocks_is_0_encrypt
1719 cmp $(2<<4), %r12
1720 jb _initial_num_blocks_is_1_encrypt
1721 je _initial_num_blocks_is_2_encrypt
1722_initial_num_blocks_is_3_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001723 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001724%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1725 sub $48, %r13
1726 jmp _initial_blocks_encrypted
1727_initial_num_blocks_is_2_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001728 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001729%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1730 sub $32, %r13
1731 jmp _initial_blocks_encrypted
1732_initial_num_blocks_is_1_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001733 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001734%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1735 sub $16, %r13
1736 jmp _initial_blocks_encrypted
1737_initial_num_blocks_is_0_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001738 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001739%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1740_initial_blocks_encrypted:
1741
1742 # Main loop - Encrypt remaining blocks
1743
1744 cmp $0, %r13
1745 je _zero_cipher_left_encrypt
1746 sub $64, %r13
1747 je _four_cipher_left_encrypt
1748_encrypt_by_4_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001749 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001750%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1751 add $64, %r11
1752 sub $64, %r13
1753 jne _encrypt_by_4_encrypt
1754_four_cipher_left_encrypt:
1755 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1756%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1757_zero_cipher_left_encrypt:
1758 mov %arg4, %r13
1759 and $15, %r13 # %r13 = arg4 (mod 16)
1760 je _multiple_of_16_bytes_encrypt
1761
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001762 # Handle the last <16 Byte block separately
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001763 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001764 movdqa SHUF_MASK(%rip), %xmm10
1765 PSHUFB_XMM %xmm10, %xmm0
1766
Tadeusz Struk60af5202011-03-13 16:56:17 +08001767
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001768 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1769 sub $16, %r11
1770 add %r13, %r11
1771 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1772 lea SHIFT_MASK+16(%rip), %r12
1773 sub %r13, %r12
1774 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1775 # (%r13 is the number of bytes in plaintext mod 16)
1776 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001777 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001778 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1779 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1780 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1781 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001782 movdqa SHUF_MASK(%rip), %xmm10
1783 PSHUFB_XMM %xmm10,%xmm0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001784
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001785 pxor %xmm0, %xmm8
1786 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1787 # GHASH computation for the last <16 byte block
1788 sub %r13, %r11
1789 add $16, %r11
Tadeusz Struk60af5202011-03-13 16:56:17 +08001790
1791 movdqa SHUF_MASK(%rip), %xmm10
1792 PSHUFB_XMM %xmm10, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001793
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001794 # shuffle xmm0 back to output as ciphertext
1795
1796 # Output %r13 bytes
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001797 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001798 cmp $8, %r13
1799 jle _less_than_8_bytes_left_encrypt
1800 mov %rax, (%arg2 , %r11, 1)
1801 add $8, %r11
1802 psrldq $8, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001803 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001804 sub $8, %r13
1805_less_than_8_bytes_left_encrypt:
1806 mov %al, (%arg2, %r11, 1)
1807 add $1, %r11
1808 shr $8, %rax
1809 sub $1, %r13
1810 jne _less_than_8_bytes_left_encrypt
1811_multiple_of_16_bytes_encrypt:
1812 mov arg8, %r12 # %r12 = addLen (number of bytes)
1813 shl $3, %r12
1814 movd %r12d, %xmm15 # len(A) in %xmm15
1815 shl $3, %arg4 # len(C) in bits (*128)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001816 MOVQ_R64_XMM %arg4, %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001817 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1818 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1819 pxor %xmm15, %xmm8
1820 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1821 # final GHASH computation
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001822 movdqa SHUF_MASK(%rip), %xmm10
1823 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001824
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001825 mov %arg5, %rax # %rax = *Y0
1826 movdqu (%rax), %xmm0 # %xmm0 = Y0
1827 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1828 pxor %xmm8, %xmm0
1829_return_T_encrypt:
1830 mov arg9, %r10 # %r10 = authTag
1831 mov arg10, %r11 # %r11 = auth_tag_len
1832 cmp $16, %r11
1833 je _T_16_encrypt
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001834 cmp $8, %r11
1835 jl _T_4_encrypt
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001836_T_8_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001837 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001838 mov %rax, (%r10)
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001839 add $8, %r10
1840 sub $8, %r11
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001841 psrldq $8, %xmm0
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001842 cmp $0, %r11
1843 je _return_T_done_encrypt
1844_T_4_encrypt:
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001845 movd %xmm0, %eax
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001846 mov %eax, (%r10)
1847 add $4, %r10
1848 sub $4, %r11
1849 psrldq $4, %xmm0
1850 cmp $0, %r11
1851 je _return_T_done_encrypt
1852_T_123_encrypt:
1853 movd %xmm0, %eax
1854 cmp $2, %r11
1855 jl _T_1_encrypt
1856 mov %ax, (%r10)
1857 cmp $2, %r11
1858 je _return_T_done_encrypt
1859 add $2, %r10
1860 sar $16, %eax
1861_T_1_encrypt:
1862 mov %al, (%r10)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001863 jmp _return_T_done_encrypt
1864_T_16_encrypt:
1865 movdqu %xmm0, (%r10)
1866_return_T_done_encrypt:
1867 mov %r14, %rsp
1868 pop %r14
1869 pop %r13
1870 pop %r12
1871 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001872ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001873
Mathias Krause559ad0f2010-11-29 08:35:39 +08001874#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001875
1876
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * One AES-128 key-schedule step (also the even step of AES-256).
 * input:
 *	%xmm0: previous round key
 *	%xmm1: AESKEYGENASSIST result (RotWord/SubWord + rcon)
 *	%xmm4: assumed zero on entry (caller clears it once)
 *	TKEYP: pointer to the next round-key slot
 * output:
 *	%xmm0: new round key, also stored at (TKEYP)
 *	TKEYP: advanced by 0x10
 * changed:
 *	%xmm4 (scratch), flags
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygen-assist dword
	# accumulate the sliding XOR of the previous key's dwords via %xmm4
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# fold in the assist word
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001891
/*
 * _key_expansion_192a: internal ABI
 * AES-192 key-schedule step that emits two 16-byte round-key slots.
 * input:
 *	%xmm0: low 16 bytes of the previous expanded key
 *	%xmm1: AESKEYGENASSIST result
 *	%xmm2: high 8 bytes of the 24-byte key (low qword)
 *	%xmm4: assumed zero on entry
 *	TKEYP: pointer to the next round-key slot
 * output:
 *	%xmm0/%xmm2: updated key state; two round keys stored at (TKEYP)
 *	TKEYP: advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6 (scratch), flags
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen-assist dword
	# sliding XOR of the previous key's dwords via %xmm4
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	# extend the expansion into the upper 8 key bytes held in %xmm2
	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	# repack %xmm6/%xmm0/%xmm2 into two consecutive 16-byte round keys
	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001916
/*
 * _key_expansion_192b: internal ABI
 * AES-192 key-schedule step that emits a single 16-byte round-key slot
 * (alternates with _key_expansion_192a, which emits two).
 * input:
 *	%xmm0: low 16 bytes of the previous expanded key
 *	%xmm1: AESKEYGENASSIST result
 *	%xmm2: upper key state
 *	%xmm4: assumed zero on entry
 *	TKEYP: pointer to the next round-key slot
 * output:
 *	%xmm0/%xmm2: updated key state; one round key stored at (TKEYP)
 *	TKEYP: advanced by 0x10
 * changed:
 *	%xmm3, %xmm4, %xmm5 (scratch), flags
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen-assist dword
	# sliding XOR of the previous key's dwords via %xmm4
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	# propagate the expansion into the upper key state in %xmm2
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001936
/*
 * _key_expansion_256b: internal ABI
 * Odd AES-256 key-schedule step: expands the second 16-byte half of the
 * key held in %xmm2 (the even step, _key_expansion_256a, handles %xmm0).
 * input:
 *	%xmm1: AESKEYGENASSIST result
 *	%xmm2: previous odd round key
 *	%xmm4: assumed zero on entry
 *	TKEYP: pointer to the next round-key slot
 * output:
 *	%xmm2: new round key, also stored at (TKEYP)
 *	TKEYP: advanced by 0x10
 * changed:
 *	%xmm4 (scratch), flags
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast SubWord dword (no rot)
	# sliding XOR of the previous key's dwords via %xmm4
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001949
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 *
 * Expand the user key into the encryption round-key schedule at the start
 * of ctx, then derive the decryption schedule (AESIMC-transformed, stored
 * in reverse order around offset 240).  key_len (16, 24 or 32) is stored
 * at ctx offset 480.  Returns 0 in AREG.
 * NOTE(review): no key_len validation here — presumably the C caller has
 * already rejected invalid lengths; confirm against the glue code.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)		# round-0 key = raw user key
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# remember key_len for enc/dec entry
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	# AES-256: alternate 256a/256b steps over both key halves
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	# AES-192: eight a/b expansion steps over the 24-byte key
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	# AES-128: ten expansion steps with round constants 0x01..0x36
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule: swap first/last round keys across
	# the 240-byte boundary, then AESIMC-transform the middle keys into
	# the mirror position (walking KEYP up and UKEYP down).
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1			# InvMixColumns for equivalent-inverse cipher
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG				# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002064
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt one 16-byte block: load the key length from ctx offset 480,
 * load the block with an unaligned move, run it through _aesni_enc1 and
 * store the result.  On x86-32 the args come from the stack and the
 * register aliases KEYP/KLEN must be preserved for the caller.
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002088
/*
 * _aesni_enc1: internal ABI
 * Encrypt one block with the expanded key schedule at KEYP.
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (16/24/32, as stored at ctx offset 480)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0 (AddRoundKey)
	add $0x30, TKEYP
	# Dispatch on key length: AES-256 runs 14 rounds, AES-192 12,
	# AES-128 10; the longer variants fall through into the shorter tails.
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	# AES-256: two extra rounds before the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	# AES-192: two extra rounds before the 128 tail
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	# Common tail: nine full rounds plus the final AESENCLAST
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002146
/*
 * _aesni_enc4: internal ABI
 * Encrypt four independent blocks in parallel (interleaved to hide the
 * AESENC latency); same round structure as _aesni_enc1.
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0 (AddRoundKey, all four blocks)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	# Dispatch on key length; longer variants fall through to shorter tails.
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	# AES-256: two extra rounds
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	# AES-192: two extra rounds
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	# Common tail: nine full rounds plus AESENCLAST, four blocks each
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002255
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt one 16-byte block: load the key length from ctx offset 480,
 * skip forward 240 bytes to the decryption key schedule (built by
 * aesni_set_key), run the block through _aesni_dec1 and store the result.
 * On x86-32 the args come from the stack and KEYP/KLEN are preserved.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# advance to the decryption key schedule
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002280
/*
 * _aesni_dec1: internal ABI
 * Decrypt one block with the inverse key schedule at KEYP.
 * input:
 *	KEYP: key struct pointer (already pointing at the decryption schedule)
 *	KLEN: key length (16/24/32)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0 (AddRoundKey)
	add $0x30, TKEYP
	# Dispatch on key length; longer variants fall through to shorter tails.
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	# AES-256: two extra rounds before the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	# AES-192: two extra rounds before the 128 tail
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	# Common tail: nine full rounds plus the final AESDECLAST
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002338
2339/*
2340 * _aesni_dec4: internal ABI
2341 * input:
2342 * KEYP: key struct pointer
2343 * KLEN: key length
2344 * STATE1: initial state (input)
2345 * STATE2
2346 * STATE3
2347 * STATE4
2348 * output:
 * STATE1: final state (output)
2350 * STATE2
2351 * STATE3
2352 * STATE4
2353 * changed:
2354 * KEY
2355 * TKEYP (T1)
2356 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002357.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002358_aesni_dec4:
2359 movaps (KEYP), KEY # key
2360 mov KEYP, TKEYP
2361 pxor KEY, STATE1 # round 0
2362 pxor KEY, STATE2
2363 pxor KEY, STATE3
2364 pxor KEY, STATE4
2365 add $0x30, TKEYP
2366 cmp $24, KLEN
2367 jb .L4dec128
2368 lea 0x20(TKEYP), TKEYP
2369 je .L4dec192
2370 add $0x20, TKEYP
2371 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002372 AESDEC KEY STATE1
2373 AESDEC KEY STATE2
2374 AESDEC KEY STATE3
2375 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002376 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002377 AESDEC KEY STATE1
2378 AESDEC KEY STATE2
2379 AESDEC KEY STATE3
2380 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002381.align 4
2382.L4dec192:
2383 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002384 AESDEC KEY STATE1
2385 AESDEC KEY STATE2
2386 AESDEC KEY STATE3
2387 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002388 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002389 AESDEC KEY STATE1
2390 AESDEC KEY STATE2
2391 AESDEC KEY STATE3
2392 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002393.align 4
2394.L4dec128:
2395 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002396 AESDEC KEY STATE1
2397 AESDEC KEY STATE2
2398 AESDEC KEY STATE3
2399 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002400 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002401 AESDEC KEY STATE1
2402 AESDEC KEY STATE2
2403 AESDEC KEY STATE3
2404 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002405 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002406 AESDEC KEY STATE1
2407 AESDEC KEY STATE2
2408 AESDEC KEY STATE3
2409 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002410 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002411 AESDEC KEY STATE1
2412 AESDEC KEY STATE2
2413 AESDEC KEY STATE3
2414 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002415 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002416 AESDEC KEY STATE1
2417 AESDEC KEY STATE2
2418 AESDEC KEY STATE3
2419 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002420 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002421 AESDEC KEY STATE1
2422 AESDEC KEY STATE2
2423 AESDEC KEY STATE3
2424 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002425 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002426 AESDEC KEY STATE1
2427 AESDEC KEY STATE2
2428 AESDEC KEY STATE3
2429 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002430 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002431 AESDEC KEY STATE1
2432 AESDEC KEY STATE2
2433 AESDEC KEY STATE3
2434 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002435 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002436 AESDEC KEY STATE1
2437 AESDEC KEY STATE2
2438 AESDEC KEY STATE3
2439 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002440 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002441 AESDECLAST KEY STATE1 # last round
2442 AESDECLAST KEY STATE2
2443 AESDECLAST KEY STATE3
2444 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002445 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002446ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002447
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt whole 16-byte blocks from src to dst; a tail shorter than
 * one block is ignored.  Handles four blocks per iteration while at
 * least 64 bytes remain, then one block at a time.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: args arrive on the stack; free the regs aliased to LEN/KEYP/KLEN */
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +16 = 3 pushes (12 bytes) + return address, above FRAME_OFFSET */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	cmp $16, LEN
	jb .Lecb_enc_ret	# less than one full block left: done
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:		# fast path: four blocks per trip
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:		# tail: one block per trip
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002507
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt whole 16-byte blocks from src to dst; a tail shorter than
 * one block is ignored.  Same loop structure as aesni_ecb_enc but runs
 * the inverse cipher.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: args arrive on the stack; free the regs aliased to LEN/KEYP/KLEN */
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +16 = 3 pushes (12 bytes) + return address, above FRAME_OFFSET */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	add $240, KEYP		# NOTE(review): decryption round keys appear to
				# follow the 240-byte encryption schedule in ctx
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:		# fast path: four blocks per trip
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:		# tail: one block per trip
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002568
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt whole 16-byte blocks from src to dst.  Each plaintext
 * block is XORed with the previous ciphertext block (the IV for the
 * first) before encryption; CBC chaining is inherently serial, so there
 * is no four-block fast path.  On exit *iv holds the last ciphertext
 * block, ready to chain the next call.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: args arrive on the stack; free the regs aliased below */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = 4 pushes (16 bytes) + return address, above FRAME_OFFSET */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret	# nothing to do for a partial block
	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: P ^ previous ciphertext (or IV)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# save last ciphertext block as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002612
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt whole 16-byte blocks from src to dst.  Unlike encryption,
 * decryption parallelizes: P[i] = D(C[i]) ^ C[i-1], so four blocks are
 * decrypted at once while at least 64 bytes remain.  On exit *iv holds
 * the last ciphertext block processed, ready to chain the next call.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: args arrive on the stack; free the regs aliased below */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = 4 pushes (16 bytes) + return address, above FRAME_OFFSET */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# partial block: don't touch *iv either
	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	add $240, KEYP		# NOTE(review): decryption round keys appear to
				# follow the 240-byte encryption schedule in ctx
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	/* keep ciphertext copies (INx) for the chaining XOR after decryption */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* i386 has only 8 XMM regs: reuse IN1/IN2 for blocks 2 and 3; blocks
	 * 0 and 1 are re-read from memory after _aesni_dec4 returns */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# P0 = D(C0) ^ previous IV
#ifdef __x86_64__
	pxor IN1, STATE2	# P1 = D(C1) ^ C0
	pxor IN2, STATE3	# P2 = D(C2) ^ C1
	pxor IN3, STATE4	# P3 = D(C3) ^ C2
	movaps IN4, IV		# next IV = C3
#else
	pxor IN1, STATE4	# P3 = D(C3) ^ C2 (IN1 still holds C2)
	movaps IN2, IV		# next IV = C3
	movups (INP), IN1	# reload C0 (its register was recycled above)
	pxor IN1, STATE2	# P1 = D(C1) ^ C0
	movups 0x10(INP), IN2	# reload C1
	pxor IN2, STATE3	# P2 = D(C2) ^ C1
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:		# tail: one block per trip
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE		# P = D(C) ^ previous ciphertext (or IV)
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext chains into the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# export chaining value for the next call
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002705
Mathias Krause0d258ef2010-11-27 16:34:46 +08002706#ifdef __x86_64__
.pushsection .rodata
.align 16
/* PSHUFB shuffle mask that reverses the byte order of an XMM register
 * (big-endian <-> little-endian conversion of a 128-bit value). */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002712
/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# CTR = byte-reversed IV (little endian)
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in an XMM reg, for paddq
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror CTR's low qword in a GPR so
					# _aesni_inc can detect carry-out
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002734
/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR		# add 1 to low qword (paddq has no carry
				# across qwords)
	add $1, TCTR_LOW	# shadow add in a GPR just to get the carry flag
	jnc .Linc_low		# no carry-out of the low qword: done
	pslldq $8, INC		# move the 1 into the high qword...
	paddq INC, CTR		# ...and propagate the carry
	psrldq $8, INC		# restore INC = 1 for the next call
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert back to big endian
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002763
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode (x86_64 only): encrypt the big-endian counter in *iv, XOR
 * the keystream with src into dst, incrementing the counter per block.
 * Four counter blocks are processed per iteration while at least 64
 * bytes remain.  On exit *iv holds the counter following the last one
 * used.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# partial block: don't touch *iv
	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	movups (IVP), IV
	call _aesni_inc_init	# set up CTR/TCTR_LOW/INC/BSWAP_MASK
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* for each block: take the current counter, then advance IV */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4	# keystream = E(counter blocks)
	pxor IN1, STATE1	# ciphertext = plaintext ^ keystream
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:		# tail: one block per trip
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# export next counter value
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002826
/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * Computes IV := IV * x (i.e. doubles the XTS tweak): paddq shifts each
 * qword left by one, then the mask built by pshufd/psrad/pand supplies
 * the two missing carries — the bit shifted out of the low qword is fed
 * into bit 64 (the 0x01 byte of GF128MUL_MASK), and the bit shifted out
 * of bit 127 wraps around as the reduction polynomial (the 0x87 byte).
 * NOTE(review): relies on the exact lane layout of GF128MUL_MASK, which
 * is defined elsewhere — confirm against its definition.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2844
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * XTS en/decrypt eight consecutive 16-byte blocks (x86_64 only):
 * out[i] = E_or_D(in[i] ^ T[i]) ^ T[i], where T[0] = *iv and
 * T[i+1] = T[i] * x in GF(2^128).  Blocks are processed as two batches
 * of four through _aesni_enc4/_aesni_dec4; each block's tweak is
 * stashed in its slot of the output buffer while the cipher runs, then
 * XORed back into the result.  On exit *iv holds the ninth tweak.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl		# %cl = enc flag (4th argument)
	movl $0, %ecx		# mov, not xor: the cmovs below still need ZF
	movl $240, %r10d	# offset of the decryption key schedule
	leaq _aesni_enc4, %r11	# default: encrypt
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx	# enc == 0: select decrypt key offset...
	cmoveq %rax, %r11	# ...and the _aesni_dec4 routine

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV	# T[0]

	mov 480(KEYP), KLEN	# key length, stored at offset 480 in ctx
	addq %rcx, KEYP		# +0 (encrypt) or +240 (decrypt)

	/* first batch: whiten blocks 0-3 with tweaks T[0..3]; each tweak is
	 * parked in the corresponding output slot until after the call */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11	# retpoline-safe indirect call: enc4 or dec4

	/* finish blocks 0-3 (XOR stashed tweak back in) while interleaving
	 * the whitening of blocks 4-7 with tweaks T[4..7] */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# export T[8] for the caller's next chunk

	CALL_NOSPEC %r11	# second batch through the same routine

	/* finish blocks 4-7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2954
Mathias Krause0d258ef2010-11-27 16:34:46 +08002955#endif