/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
31
#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values, for FP
 * use movaps (move aligned packed single) or integer use movdqa (move double
 * quad aligned). It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released. However, the movaps
 * is a byte shorter, so that is the one we'll use for now. (same for
 * unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
46
#ifdef __x86_64__

# Constants live in mergeable .rodata.cst16 sections so the linker can
# reorder and merge identical 16-byte entries across objects.
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:	.octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:	.octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

# aad_shift_arr[n] is a pshufb mask that left-aligns the last n bytes of a
# partial AAD block read past its end (17 entries of 16 bytes = 272 bytes).
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

// SysV AMD64 argument registers; args 7-10 are read from the stack
// relative to %r14, which the GCM entry code points at the frame.
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400150
151
// Register aliases shared by the non-GCM AES helpers below.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100196
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400197
Mathias Krause559ad0f2010-11-29 08:35:39 +0800198#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba: split each 128-bit operand into 64-bit halves and
	# compute a1*b1, a0*b0 and (a1+a0)*(b1+b0) with three PCLMULQDQs.
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle terms)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the 256-bit GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform independent
					# shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH, \TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform independent
					# shifts
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/

.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# %r11 = AAD bytes remaining
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	# hash whole 16-byte AAD blocks
	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11		# fix: decrement the remaining-length
					# counter (was "sub $4, %r10", which
					# undid the pointer advance; benign
					# since both regs are dead below, but
					# wrong — the parallel 8-byte path
					# above decrements %r11 here)
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as
					# zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%rip),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index	# decrypt: GHASH the *ciphertext*
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1		# keep ciphertext for GHASH
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/

.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# %r11 = AAD bytes remaining
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	# hash whole 16-byte AAD blocks
	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11		# fix: decrement the remaining-length
					# counter (was "sub $4, %r10", which
					# undid the pointer advance; benign
					# since both regs are dead below, but
					# wrong — the parallel 8-byte path
					# above decrements %r11 here)
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as
					# zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%rip),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_enc\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM %xmm14, %xmm\index	# encrypt: GHASH the ciphertext
					# just produced

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Encrypt 4 counter blocks and, in parallel, GHASH the 4 ciphertext blocks
# produced by the PREVIOUS iteration (held in \XMM1-\XMM4 on entry).
# In:  %arg1 = key schedule, %arg2 = out, %arg3 = in, %r11 = data offset,
#      \XMM0 = byte-swapped counter, %xmm14/%xmm15 = SHUF_MASK scratch.
# Out: \XMM1-\XMM4 = byte-swapped ciphertext of this iteration,
#      \XMM5 accumulates the reduced GHASH, folded into \XMM1 at the end.
# Clobbers: \TMP1-\TMP6, %xmm15, %rax, %r10, flags.
# NOTE: loop labels carry \@ so the macro can be expanded more than once
# without duplicate-symbol errors.

        movdqa    \XMM1, \XMM5          # save previous ciphertext for GHASH
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      (%arg1), \XMM1        # whiten counters with round-0 key
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax               # 128->4, 192->6, 256->8
        sub       $4,%eax               # 128->0, 192->2, 256->4
        jz        aes_loop_par_enc_done\@

aes_loop_par_enc\@:                     # extra rounds for AES-192/256 keys
        MOVADQ    (%r10),\TMP3
.irpc index, 1234
        AESENC    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       aes_loop_par_enc\@

aes_loop_par_enc_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1         # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
        movdqu    16(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
        movdqu    32(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
        movdqu    48(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg2,%r11,1)    # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg2,%r11,1)  # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg2,%r11,1)  # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg2,%r11,1)  # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift << 31 (per dword)
        pslld     $30, \TMP3            # packed left shift << 30 (per dword)
        pslld     $25, \TMP4            # packed left shift << 25 (per dword)
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift T5 1 DW
        pslldq    $12, \TMP2            # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2             # packed right shift >> 1 (per dword)
        psrld     $2, \TMP3             # packed right shift >> 2 (per dword)
        psrld     $7, \TMP4             # packed right shift >> 7 (per dword)
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5          # result is in XMM5

        pxor      \XMM5, \XMM1          # fold GHASH state into next XMM1
.endm
980
981/*
982* decrypt 4 blocks at a time
983* ghash the 4 previously decrypted ciphertext blocks
984* arg1, %arg2, %arg3 are used as pointers only, not modified
985* %r11 is the data offset value
986*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Decrypt 4 blocks at a time and, in parallel, GHASH the 4 ciphertext blocks
# decrypted by the PREVIOUS iteration (held in \XMM1-\XMM4 on entry).
# Unlike the _ENC variant, the incoming CIPHERTEXT is preserved in
# \XMM1-\XMM4 afterwards (movdqa \TMP3, \XMMn) since GHASH runs over
# ciphertext in both directions.
# In:  %arg1 = key schedule, %arg2 = out, %arg3 = in, %r11 = data offset,
#      \XMM0 = byte-swapped counter.
# Out: \XMM1-\XMM4 = byte-swapped ciphertext, GHASH folded into \XMM1.
# Clobbers: \TMP1-\TMP6, %xmm15, %rax, %r10, flags.
# NOTE: loop labels carry \@ so the macro can be expanded more than once
# without duplicate-symbol errors.

        movdqa    \XMM1, \XMM5          # save previous ciphertext for GHASH
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      (%arg1), \XMM1        # whiten counters with round-0 key
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax               # 128->4, 192->6, 256->8
        sub       $4,%eax               # 128->0, 192->2, 256->4
        jz        aes_loop_par_dec_done\@

aes_loop_par_dec\@:                     # extra rounds for AES-192/256 keys
        MOVADQ    (%r10),\TMP3
.irpc index, 1234
        AESENC    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       aes_loop_par_dec\@

aes_loop_par_dec_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1         # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg2,%r11,1)    # Write to plaintext buffer
        movdqa    \TMP3, \XMM1          # keep ciphertext for next GHASH
        movdqu    16(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM2, 16(%arg2,%r11,1)  # Write to plaintext buffer
        movdqa    \TMP3, \XMM2          # keep ciphertext for next GHASH
        movdqu    32(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM3, 32(%arg2,%r11,1)  # Write to plaintext buffer
        movdqa    \TMP3, \XMM3          # keep ciphertext for next GHASH
        movdqu    48(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
        movdqu    \XMM4, 48(%arg2,%r11,1)  # Write to plaintext buffer
        movdqa    \TMP3, \XMM4          # keep ciphertext for next GHASH
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift << 31 (per dword)
        pslld     $30, \TMP3            # packed left shift << 30 (per dword)
        pslld     $25, \TMP4            # packed left shift << 25 (per dword)
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift T5 1 DW
        pslldq    $12, \TMP2            # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2             # packed right shift >> 1 (per dword)
        psrld     $2, \TMP3             # packed right shift >> 2 (per dword)
        psrld     $7, \TMP4             # packed right shift >> 7 (per dword)
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5          # result is in XMM5

        pxor      \XMM5, \XMM1          # fold GHASH state into next XMM1
.endm
1192
1193/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# GHASH the last 4 byte-swapped ciphertext blocks \XMM1-\XMM4 against
# HashKey^4..HashKey^1 (Karatsuba, precomputed on the stack) and reduce
# modulo the GCM polynomial. Final reduced digest is left in \XMMDst.
# Clobbers: \TMP1-\TMP7, \XMM1-\XMM4, flags are untouched by the XMM ops.

        # Multiply XMM1 * HashKey_4 (using Karatsuba)

        movdqa    \XMM1, \TMP6
        pshufd    $78, \XMM1, \TMP2
        pxor      \XMM1, \TMP2
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6    # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1    # XMM1 = a0*b0
        movdqa    HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1          # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 * HashKey_3 (using Karatsuba)

        movdqa    \XMM2, \TMP1
        pshufd    $78, \XMM2, \TMP2
        pxor      \XMM2, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2    # XMM2 = a0*b0
        movdqa    HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM2, \XMMDst
        pxor      \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 * HashKey_2 (using Karatsuba)

        movdqa    \XMM3, \TMP1
        pshufd    $78, \XMM3, \TMP2
        pxor      \XMM3, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3    # XMM3 = a0*b0
        movdqa    HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM3, \XMMDst
        pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 * HashKey (using Karatsuba)
        movdqa    \XMM4, \TMP1
        pshufd    $78, \XMM4, \TMP2
        pxor      \XMM4, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4    # XMM4 = a0*b0
        movdqa    HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM4, \XMMDst
        pxor      \XMM1, \TMP2
        pxor      \TMP6, \TMP2
        pxor      \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa    \TMP2, \TMP4
        pslldq    $8, \TMP4             # left shift TMP4 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP4, \XMMDst
        pxor      \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa    \XMMDst, \TMP2
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld     $31, \TMP2            # packed left shifting << 31 (per dword)
        pslld     $30, \TMP3            # packed left shifting << 30 (per dword)
        pslld     $25, \TMP4            # packed left shifting << 25 (per dword)
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP7
        psrldq    $4, \TMP7             # right shift TMP7 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \XMMDst

        # second phase of the reduction
        movdqa    \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        psrld     $1, \TMP2             # packed right shift >> 1 (per dword)
        psrld     $2, \TMP3             # packed right shift >> 2 (per dword)
        psrld     $7, \TMP4             # packed right shift >> 7 (per dword)
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        pxor      \TMP7, \TMP2
        pxor      \TMP2, \XMMDst
        pxor      \TMP6, \XMMDst        # reduced result is in XMMDst
.endm
1289
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001290
1291/* Encryption of a single block
1292* uses eax & r10
1293*/
1294
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
# AES-encrypt the single block in \XMM0 in place with the expanded key
# schedule at %arg1; key length is read from the in-memory `keysize` field.
# In:  \XMM0 = plaintext block, %arg1 = key schedule base.
# Out: \XMM0 = E(K, block).
# Clobbers: \TMP1, %eax, %r10, flags.

        pxor           (%arg1), \XMM0   # round 0: whiten with first key
        mov            keysize,%eax
        shr            $2,%eax          # 128->4, 192->6, 256->8
        add            $5,%eax          # 128->9, 192->11, 256->13 (rounds-1)
        lea            16(%arg1), %r10  # get first expanded key address

_esb_loop_\@:                           # \@ makes the label unique per expansion
        MOVADQ         (%r10),\TMP1
        AESENC         \TMP1,\XMM0
        add            $16,%r10
        sub            $1,%eax
        jnz            _esb_loop_\@

        MOVADQ         (%r10),\TMP1
        AESENCLAST     \TMP1,\XMM0      # final round uses AESENCLAST
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001313/*****************************************************************************
1314* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1315* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1316* const u8 *in, // Ciphertext input
1317* u64 plaintext_len, // Length of data in bytes for decryption.
1318* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1319* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1320* // concatenated with 0x00000001. 16-byte aligned pointer.
1321* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1322* const u8 *aad, // Additional Authentication Data (AAD)
1323* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1324* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1325* // given authentication tag and only return the plaintext if they match.
1326* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1327* // (most likely), 12 or 8.
1328*
1329* Assumptions:
1330*
1331* keys:
1332* keys are pre-expanded and aligned to 16 bytes. we are using the first
1333* set of 11 keys in the data structure void *aes_ctx
1334*
1335* iv:
1336* 0 1 2 3
1337* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1338* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1339* | Salt (From the SA) |
1340* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1341* | Initialization Vector |
1342* | (This is the sequence number from IPSec header) |
1343* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1344* | 0x1 |
1345* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1346*
1347*
1348*
1349* AAD:
1350* AAD padded to 128 bits with 0
1351* for example, assume AAD is a u32 vector
1352*
1353* if AAD is 8 bytes:
1354* AAD[3] = {A0, A1};
1355* padded AAD in xmm register = {A1 A0 0 0}
1356*
1357* 0 1 2 3
1358* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1359* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1360* | SPI (A1) |
1361* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1362* | 32-bit Sequence Number (A0) |
1363* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1364* | 0x0 |
1365* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366*
1367* AAD Format with 32-bit Sequence Number
1368*
1369* if AAD is 12 bytes:
1370* AAD[3] = {A0, A1, A2};
1371* padded AAD in xmm register = {A2 A1 A0 0}
1372*
1373* 0 1 2 3
1374* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1378* | SPI (A2) |
1379* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1380* | 64-bit Extended Sequence Number {A1,A0} |
1381* | |
1382* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1383* | 0x0 |
1384* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1385*
1386* AAD Format with 64-bit Extended Sequence Number
1387*
1388* aadLen:
1389* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1390* The code supports 16 too but for other sizes, the code will fail.
1391*
1392* TLen:
1393* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1394* For other sizes, the code will fail.
1395*
1396* poly = x^128 + x^127 + x^126 + x^121 + 1
1397*
1398*****************************************************************************/
ENTRY(aesni_gcm_dec)
# RFC4106 AES-GCM decrypt + authenticate.
# Args per SysV AMD64: %arg1=aes_ctx, %arg2=out, %arg3=in, %arg4=len,
# %arg5=iv, %arg6=hash_subkey, arg7(stack)=aad, arg8=aad_len,
# arg9=auth_tag, arg10=auth_tag_len.
# %r12-%r14 are callee-saved and restored; %r14 preserves the original %rsp
# across the 64-byte realignment below.
        push    %r12
        push    %r13
        push    %r14
        mov     %rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes
        mov     %arg6, %r12
        movdqu  (%r12), %xmm13                  # %xmm13 = HashKey
        movdqa  SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13                # byte-swap to GHASH bit order


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

        movdqa  %xmm13, %xmm2
        psllq   $1, %xmm13
        psrlq   $63, %xmm2
        movdqa  %xmm2, %xmm1
        pslldq  $8, %xmm2
        psrldq  $8, %xmm1
        por     %xmm2, %xmm13                   # 128-bit shift-left by 1

        # Reduction

        pshufd  $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand    POLY(%rip), %xmm2
        pxor    %xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

        movdqa  %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
        mov     %arg4, %r13   # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
        mov     %r13, %r12
        and     $(3<<4), %r12                   # number of initial blocks mod 4
        jz      _initial_num_blocks_is_0_decrypt
        cmp     $(2<<4), %r12
        jb      _initial_num_blocks_is_1_decrypt
        je      _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        sub     $48, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        sub     $32, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        sub     $16, %r13
        jmp     _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        cmp     $0, %r13
        je      _zero_cipher_left_decrypt       # no full blocks remain
        sub     $64, %r13
        je      _four_cipher_left_decrypt       # exactly 4 blocks remain
_decrypt_by_4:
        GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
        add     $64, %r11
        sub     $64, %r13
        jne     _decrypt_by_4
_four_cipher_left_decrypt:
        GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        mov     %arg4, %r13
        and     $15, %r13                       # %r13 = arg4 (mod 16)
        je      _multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

        paddd ONE(%rip), %xmm0                  # increment CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # E(K, Yn)
        sub $16, %r11
        add %r13, %r11
        movdqu (%arg3,%r11,1), %xmm1    # receive the last <16 byte block
        lea SHIFT_MASK+16(%rip), %r12
        sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
        movdqu (%r12), %xmm2            # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # right shift 16-%r13 bytes

        movdqa  %xmm1, %xmm2
        pxor %xmm1, %xmm0               # Ciphertext XOR E(K, Yn)
        movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand %xmm1, %xmm0               # mask out top 16-%r13 bytes of %xmm0
        pand    %xmm1, %xmm2
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10 ,%xmm2

        pxor %xmm2, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
                  # GHASH computation for the last <16 byte block
        sub %r13, %r11
        add $16, %r11

        # output %r13 bytes
        MOVQ_R64_XMM    %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_decrypt
        mov     %rax, (%arg2 , %r11, 1)         # write low 8 bytes
        add     $8, %r11
        psrldq  $8, %xmm0
        MOVQ_R64_XMM    %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_decrypt:
        mov     %al,  (%arg2, %r11, 1)          # write one byte at a time
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov     arg8, %r12                # %r12 = aadLen (number of bytes)
        shl     $3, %r12                  # convert into number of bits
        movd    %r12d, %xmm15             # len(A) in %xmm15
        shl     $3, %arg4                 # len(C) in bits (*128)
        MOVQ_R64_XMM    %arg4, %xmm1
        pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
                 # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8

        mov     %arg5, %rax               # %rax = *Y0
        movdqu  (%rax), %xmm0             # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
        pxor    %xmm8, %xmm0              # tag = GHASH XOR E(K, Y0)
_return_T_decrypt:
        mov     arg9, %r10                # %r10 = authTag
        mov     arg10, %r11               # %r11 = auth_tag_len
        cmp     $16, %r11
        je      _T_16_decrypt
        cmp     $8, %r11
        jl      _T_4_decrypt
_T_8_decrypt:                             # emit tag 8, then 4, 2, 1 bytes
        MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        psrldq  $8, %xmm0
        cmp     $0, %r11
        je      _return_T_done_decrypt
_T_4_decrypt:
        movd    %xmm0, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        psrldq  $4, %xmm0
        cmp     $0, %r11
        je      _return_T_done_decrypt
_T_123_decrypt:                           # 1, 2 or 3 tag bytes remain
        movd    %xmm0, %eax
        cmp     $2, %r11
        jl      _T_1_decrypt
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done_decrypt
        add     $2, %r10
        sar     $16, %eax
_T_1_decrypt:
        mov     %al, (%r10)
        jmp     _return_T_done_decrypt
_T_16_decrypt:
        movdqu  %xmm0, (%r10)
_return_T_done_decrypt:
        mov     %r14, %rsp                # undo 64-byte realignment
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001591
1592
1593/*****************************************************************************
1594* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1595* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1596* const u8 *in, // Plaintext input
1597* u64 plaintext_len, // Length of data in bytes for encryption.
1598* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1599* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1600* // concatenated with 0x00000001. 16-byte aligned pointer.
1601* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1602* const u8 *aad, // Additional Authentication Data (AAD)
1603* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1604* u8 *auth_tag, // Authenticated Tag output.
1605* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1606* // 12 or 8.
1607*
1608* Assumptions:
1609*
1610* keys:
1611* keys are pre-expanded and aligned to 16 bytes. we are using the
1612* first set of 11 keys in the data structure void *aes_ctx
1613*
1614*
1615* iv:
1616* 0 1 2 3
1617* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1618* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1619* | Salt (From the SA) |
1620* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1621* | Initialization Vector |
1622* | (This is the sequence number from IPSec header) |
1623* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1624* | 0x1 |
1625* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1626*
1627*
1628*
1629* AAD:
1630* AAD padded to 128 bits with 0
1631* for example, assume AAD is a u32 vector
1632*
1633* if AAD is 8 bytes:
1634* AAD[3] = {A0, A1};
1635* padded AAD in xmm register = {A1 A0 0 0}
1636*
1637* 0 1 2 3
1638* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640* | SPI (A1) |
1641* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1642* | 32-bit Sequence Number (A0) |
1643* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1644* | 0x0 |
1645* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1646*
1647* AAD Format with 32-bit Sequence Number
1648*
1649* if AAD is 12 bytes:
1650* AAD[3] = {A0, A1, A2};
1651* padded AAD in xmm register = {A2 A1 A0 0}
1652*
1653* 0 1 2 3
1654* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656* | SPI (A2) |
1657* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658* | 64-bit Extended Sequence Number {A1,A0} |
1659* | |
1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661* | 0x0 |
1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663*
1664* AAD Format with 64-bit Extended Sequence Number
1665*
* aadLen:
*       per the RFC4106 specification, aadLen can only be 8 or 12 bytes.
*       The code also supports an aadLen of 16 bytes; any other size will fail.
1669*
1670* TLen:
1671* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1672* For other sizes, the code will fail.
1673*
1674* poly = x^128 + x^127 + x^126 + x^121 + 1
1675***************************************************************************/
#############################################################################
# aesni_gcm_enc: AES-GCM (RFC4106) encryption + authentication tag.
# Register roles as used below (x86_64):
#	%arg2 = out (ciphertext), %arg3 = in (plaintext), %arg4 = plaintext_len
#	%arg5 = iv (Y0), %arg6 = hash_subkey
#	arg8/arg9/arg10 = aad_len / auth_tag / auth_tag_len (stack args)
#	%r11 = running byte offset into in/out, %r13 = bytes remaining
#############################################################################
ENTRY(aesni_gcm_enc)
	push	%r12			# save callee-saved scratch registers
	push	%r13
	push	%r14
	mov	%rsp, %r14		# remember %rsp; restored on exit
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp	# local frame (HashKey table etc.)
	and	$~63, %rsp		# align the frame to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13		# %xmm13 = HashKey (H)
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13	# byte-swap H into GHASH bit order


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift each qword of H left by 1
	psrlq	$63, %xmm2		# bits shifted out of each qword
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2		# carry from low qword into high qword
	psrldq	$8, %xmm1		# %xmm1 = bit carried out of bit 127
	por	%xmm2, %xmm13		# %xmm13 = H<<1 (full 128-bit shift)

	# reduce HashKey<<1 mod poly = x^128 + x^127 + x^126 + x^121 + 1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2	# all-ones iff a bit was carried out
	pand	POLY(%rip), %xmm2	# conditionally select the poly
	pxor	%xmm2, %xmm13		# %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13
	and	$-16, %r13		# %r13 = byte count of full 16B blocks
	mov	%r13, %r12

	# Encrypt first few (0..3) blocks so the main loop sees a multiple of 4

	and	$(3<<4), %r12		# %r12 = (block count mod 4) * 16
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - encrypt and GHASH the remaining blocks, 4 at a time

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt	# flags from the 'and' above

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0	# counter back to big-endian byte order


	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	sub $16, %r11
	add %r13, %r11			# step back so a 16B load ends at msg end
	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM	%xmm2, %xmm1	# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0		# byte-swap for GHASH

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub %r13, %r11
	add $16, %r11			# restore %r11 to the partial block start

	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_encrypt
	mov %rax, (%arg2 , %r11, 1)	# write the low 8 ciphertext bytes
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_encrypt:
	mov %al,  (%arg2, %r11, 1)	# write the remaining bytes one at a time
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert byte count to bit count
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0		# %xmm0 = tag T = GHASH ^ E(K, Y0)
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt		# common case: full 16-byte tag
	cmp	$8, %r11
	jl	_T_4_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)		# copy 8 tag bytes
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_4_encrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)		# copy 4 tag bytes
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_123_encrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_encrypt
	mov	%ax, (%r10)		# copy 2 tag bytes
	cmp	$2, %r11
	je	_return_T_done_encrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_encrypt:
	mov	%al, (%r10)		# copy the final tag byte
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp		# discard the aligned local frame
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001872
Mathias Krause559ad0f2010-11-29 08:35:39 +08001873#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001874
1875
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 *	Derive the next 128-bit round key from %xmm0 and the
 *	AESKEYGENASSIST result in %xmm1, store it at (TKEYP) and
 *	advance TKEYP by 16.
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST output for this round
 *	%xmm4:	assumed zero by the shufps shifts (caller clears it once;
 *		its low dword is still zero on return)
 *	TKEYP:	destination pointer for the new round key
 * output:
 *	%xmm0:	new round key (also stored at (TKEYP)); TKEYP += 0x10
 * changed:
 *	%xmm4
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygenassist dword
	shufps $0b00010000, %xmm0, %xmm4	# xmm4 = xmm0 << 32 (via zeros)
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4	# fold in xmm0 << 64
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# xor in the round constant word
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001890
/*
 * _key_expansion_192a: internal ABI
 *	AES-192 key schedule step that produces TWO 16-byte round keys:
 *	the 192-bit schedule advances in 24-byte strides, so two xmm-sized
 *	outputs are assembled from %xmm0 (low 128 bits) and %xmm2 (high 64
 *	bits) and stored at (TKEYP) and 0x10(TKEYP).
 * input:
 *	%xmm0, %xmm2:	current 192-bit key material (128 + 64 bits)
 *	%xmm1:		AESKEYGENASSIST output for this round
 *	%xmm4:		assumed zero (see _key_expansion_128)
 *	TKEYP:		destination pointer
 * output:
 *	%xmm0, %xmm2:	updated key material; TKEYP += 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist dword
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# %xmm0 = next 128 key bits

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2			# %xmm2 = next 64 key bits

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# pack old high64 | new low64
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1	# pack new high64 | next 64 bits
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001915
/*
 * _key_expansion_192b: internal ABI
 *	AES-192 key schedule step producing ONE 16-byte round key
 *	(the companion of _key_expansion_192a; the two alternate so the
 *	24-byte schedule stays aligned with 16-byte round-key slots).
 * input:
 *	%xmm0, %xmm2:	current 192-bit key material (128 + 64 bits)
 *	%xmm1:		AESKEYGENASSIST output for this round
 *	%xmm4:		assumed zero (see _key_expansion_128)
 *	TKEYP:		destination pointer
 * output:
 *	%xmm0, %xmm2:	updated key material; TKEYP += 0x10
 * changed:
 *	%xmm3, %xmm4, %xmm5
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist dword
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# %xmm0 = next 128 key bits

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2			# %xmm2 = next 64 key bits

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001935
/*
 * _key_expansion_256b: internal ABI
 *	AES-256 odd key schedule step: derive the next round key into
 *	%xmm2 from the AESKEYGENASSIST result in %xmm1, store it at
 *	(TKEYP) and advance TKEYP by 16.
 * input:
 *	%xmm2:	previous odd round key
 *	%xmm1:	AESKEYGENASSIST output (of the even key in %xmm0)
 *	%xmm4:	assumed zero (see _key_expansion_128)
 *	TKEYP:	destination pointer
 * output:
 *	%xmm2:	new round key (also stored at (TKEYP)); TKEYP += 0x10
 * changed:
 *	%xmm4
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygenassist dword
	shufps $0b00010000, %xmm2, %xmm4	# xmm4 = xmm2 << 32 (via zeros)
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4	# fold in xmm2 << 64
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001948
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 *
 * Expand the user key into the full encryption key schedule, then derive
 * the decryption schedule (InvMixColumns of the encryption round keys,
 * stored in reverse order starting at offset 240).  The key length in
 * bytes (16/24/32) is stored at offset 480 of the context.
 * Returns 0 in AREG.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)		# round-0 key = raw user key
	lea 0x10(KEYP), TKEYP		# key addr: next round-key slot
	movl %edx, 480(KEYP)		# remember key length for enc/dec
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl			# 16 / 24 / 32 -> AES-128/192/256
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key (AES-256: 2nd 16 bytes)
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key (AES-192: 8 bytes)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule: swap first/last round keys, then
	# InvMixColumns every interior encryption round key, reversed.
	sub $0x10, TKEYP		# TKEYP -> last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# dec schedule ends with round-0 key
	movaps %xmm1, 240(KEYP)		# dec schedule starts with last key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec slots backwards
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1		# InvMixColumns of the enc round key
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002063
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt one 16-byte block from src into dst using the expanded
 * encryption key schedule in ctx (key length cached at offset 480).
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP			# 32-bit: KEYP/KLEN are callee-saved
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length in bytes (16/24/32)
	movups (INP), STATE		# input block (may be unaligned)
	call _aesni_enc1
	movups STATE, (OUTP)		# output block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002087
/*
 * _aesni_enc1: internal ABI
 *	Encrypt a single block already loaded in STATE with the expanded
 *	key at KEYP; runs 10/12/14 rounds for 128/192/256-bit keys.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length in bytes (16/24/32; 'cmp $24' selects the path)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0: AddRoundKey
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128			# AES-128: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .Lenc192			# AES-192: 12 rounds
	add $0x20, TKEYP		# AES-256: 14 rounds, 2 extra here
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# final round (no MixColumns)
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002145
/*
 * _aesni_enc4: internal ABI
 *	Encrypt four independent blocks in parallel (STATE1..STATE4) with
 *	the expanded key at KEYP, interleaving the rounds to hide AESENC
 *	latency.  Same round selection as _aesni_enc1.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length in bytes (16/24/32)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0: AddRoundKey, all 4 blocks
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128			# AES-128: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .L4enc192			# AES-192: 12 rounds
	add $0x20, TKEYP		# AES-256: 14 rounds
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002254
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt one 16-byte block from src into dst.  The decryption key
 * schedule lives at offset 240 of ctx (built by aesni_set_key).
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP			# 32-bit: KEYP/KLEN are callee-saved
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length in bytes (16/24/32)
	add $240, KEYP			# point at the decryption schedule
	movups (INP), STATE		# input block (may be unaligned)
	call _aesni_dec1
	movups STATE, (OUTP)		# output block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002279
/*
 * _aesni_dec1: internal ABI
 *	Decrypt a single block already loaded in STATE with the decryption
 *	key schedule at KEYP; runs 10/12/14 rounds for 128/192/256-bit keys.
 * input:
 *	KEYP:	key struct pointer (decryption schedule)
 *	KLEN:	key length in bytes (16/24/32; 'cmp $24' selects the path)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0: AddRoundKey
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# AES-128: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .Ldec192			# AES-192: 12 rounds
	add $0x20, TKEYP		# AES-256: 14 rounds, 2 extra here
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# final round (no InvMixColumns)
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002337
2338/*
2339 * _aesni_dec4: internal ABI
 * Decrypt four AES blocks in parallel with the same expanded key schedule.
2340 * input:
2341 * KEYP: key struct pointer
2342 * KLEN: key length
2343 * STATE1: initial state (input)
2344 * STATE2
2345 * STATE3
2346 * STATE4
2347 * output:
2348 * STATE1: final state (output)
2349 * STATE2
2350 * STATE3
2351 * STATE4
2352 * changed:
2353 * KEY
2354 * TKEYP (T1)
2355 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002356.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002357_aesni_dec4:
2358 movaps (KEYP), KEY # key
2359 mov KEYP, TKEYP
2360 pxor KEY, STATE1 # round 0
2361 pxor KEY, STATE2
2362 pxor KEY, STATE3
2363 pxor KEY, STATE4
2364 add $0x30, TKEYP
2365 cmp $24, KLEN # dispatch on key length in bytes (16/24/32)
2366 jb .L4dec128 # AES-128: 10 rounds
2367 lea 0x20(TKEYP), TKEYP
2368 je .L4dec192 # AES-192: 12 rounds
2369 add $0x20, TKEYP # AES-256: 14 rounds, two extra round pairs first
2370 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002371 AESDEC KEY STATE1
2372 AESDEC KEY STATE2
2373 AESDEC KEY STATE3
2374 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002375 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002376 AESDEC KEY STATE1
2377 AESDEC KEY STATE2
2378 AESDEC KEY STATE3
2379 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002380.align 4
2381.L4dec192: # AES-192 and AES-256 continue here
2382 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002383 AESDEC KEY STATE1
2384 AESDEC KEY STATE2
2385 AESDEC KEY STATE3
2386 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002387 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002388 AESDEC KEY STATE1
2389 AESDEC KEY STATE2
2390 AESDEC KEY STATE3
2391 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002392.align 4
2393.L4dec128: # common final 10 rounds for all key sizes
2394 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002395 AESDEC KEY STATE1
2396 AESDEC KEY STATE2
2397 AESDEC KEY STATE3
2398 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002399 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002400 AESDEC KEY STATE1
2401 AESDEC KEY STATE2
2402 AESDEC KEY STATE3
2403 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002404 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002405 AESDEC KEY STATE1
2406 AESDEC KEY STATE2
2407 AESDEC KEY STATE3
2408 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002409 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002410 AESDEC KEY STATE1
2411 AESDEC KEY STATE2
2412 AESDEC KEY STATE3
2413 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002414 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002415 AESDEC KEY STATE1
2416 AESDEC KEY STATE2
2417 AESDEC KEY STATE3
2418 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002419 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002420 AESDEC KEY STATE1
2421 AESDEC KEY STATE2
2422 AESDEC KEY STATE3
2423 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002424 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002425 AESDEC KEY STATE1
2426 AESDEC KEY STATE2
2427 AESDEC KEY STATE3
2428 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002429 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002430 AESDEC KEY STATE1
2431 AESDEC KEY STATE2
2432 AESDEC KEY STATE3
2433 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002434 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002435 AESDEC KEY STATE1
2436 AESDEC KEY STATE2
2437 AESDEC KEY STATE3
2438 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002439 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002440 AESDECLAST KEY STATE1 # last round
2441 AESDECLAST KEY STATE2
2442 AESDECLAST KEY STATE3
2443 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002444 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002445ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002446
2447/*
2448 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2449 * size_t len)
 *
 * ECB-encrypt len bytes from src to dst; a trailing partial block
 * (< 16 bytes) is ignored. Processes 4 blocks per iteration when len >= 64.
2450 */
2451ENTRY(aesni_ecb_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002452 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002453#ifndef __x86_64__
2454 pushl LEN
2455 pushl KEYP
2456 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002457 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2458 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2459 movl (FRAME_OFFSET+24)(%esp), INP # src
2460 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002461#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002462 test LEN, LEN # check length
2463 jz .Lecb_enc_ret
2464 mov 480(KEYP), KLEN # key length cached at byte 480 of ctx
2465 cmp $16, LEN
2466 jb .Lecb_enc_ret
2467 cmp $64, LEN
2468 jb .Lecb_enc_loop1
2469.align 4
2470.Lecb_enc_loop4: # 4 blocks per iteration
2471 movups (INP), STATE1
2472 movups 0x10(INP), STATE2
2473 movups 0x20(INP), STATE3
2474 movups 0x30(INP), STATE4
2475 call _aesni_enc4
2476 movups STATE1, (OUTP)
2477 movups STATE2, 0x10(OUTP)
2478 movups STATE3, 0x20(OUTP)
2479 movups STATE4, 0x30(OUTP)
2480 sub $64, LEN
2481 add $64, INP
2482 add $64, OUTP
2483 cmp $64, LEN
2484 jge .Lecb_enc_loop4
2485 cmp $16, LEN
2486 jb .Lecb_enc_ret
2487.align 4
2488.Lecb_enc_loop1: # single-block tail loop
2489 movups (INP), STATE1
2490 call _aesni_enc1
2491 movups STATE1, (OUTP)
2492 sub $16, LEN
2493 add $16, INP
2494 add $16, OUTP
2495 cmp $16, LEN
2496 jge .Lecb_enc_loop1
2497.Lecb_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002498#ifndef __x86_64__
2499 popl KLEN
2500 popl KEYP
2501 popl LEN
2502#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002503 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002504 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002505ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002506
2507/*
2508 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2509 * size_t len);
 *
 * ECB-decrypt len bytes from src to dst; a trailing partial block
 * (< 16 bytes) is ignored.
2510 */
2511ENTRY(aesni_ecb_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002512 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002513#ifndef __x86_64__
2514 pushl LEN
2515 pushl KEYP
2516 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002517 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2518 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2519 movl (FRAME_OFFSET+24)(%esp), INP # src
2520 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002521#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002522 test LEN, LEN
2523 jz .Lecb_dec_ret
2524 mov 480(KEYP), KLEN
2525 add $240, KEYP # point KEYP at the decryption key schedule (+240)
2526 cmp $16, LEN
2527 jb .Lecb_dec_ret
2528 cmp $64, LEN
2529 jb .Lecb_dec_loop1
2530.align 4
2531.Lecb_dec_loop4: # 4 blocks per iteration
2532 movups (INP), STATE1
2533 movups 0x10(INP), STATE2
2534 movups 0x20(INP), STATE3
2535 movups 0x30(INP), STATE4
2536 call _aesni_dec4
2537 movups STATE1, (OUTP)
2538 movups STATE2, 0x10(OUTP)
2539 movups STATE3, 0x20(OUTP)
2540 movups STATE4, 0x30(OUTP)
2541 sub $64, LEN
2542 add $64, INP
2543 add $64, OUTP
2544 cmp $64, LEN
2545 jge .Lecb_dec_loop4
2546 cmp $16, LEN
2547 jb .Lecb_dec_ret
2548.align 4
2549.Lecb_dec_loop1: # single-block tail loop
2550 movups (INP), STATE1
2551 call _aesni_dec1
2552 movups STATE1, (OUTP)
2553 sub $16, LEN
2554 add $16, INP
2555 add $16, OUTP
2556 cmp $16, LEN
2557 jge .Lecb_dec_loop1
2558.Lecb_dec_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002559#ifndef __x86_64__
2560 popl KLEN
2561 popl KEYP
2562 popl LEN
2563#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002564 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002565 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002566ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002567
2568/*
2569 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2570 * size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes; blocks are serially dependent, so only one block
 * is processed per iteration. The final ciphertext block is written back
 * through iv for chaining across calls.
2571 */
2572ENTRY(aesni_cbc_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002573 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002574#ifndef __x86_64__
2575 pushl IVP
2576 pushl LEN
2577 pushl KEYP
2578 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002579 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2580 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2581 movl (FRAME_OFFSET+28)(%esp), INP # src
2582 movl (FRAME_OFFSET+32)(%esp), LEN # len
2583 movl (FRAME_OFFSET+36)(%esp), IVP # iv
Mathias Krause0d258ef2010-11-27 16:34:46 +08002584#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002585 cmp $16, LEN
2586 jb .Lcbc_enc_ret
2587 mov 480(KEYP), KLEN
2588 movups (IVP), STATE # load iv as initial state
2589.align 4
2590.Lcbc_enc_loop:
2591 movups (INP), IN # load input
2592 pxor IN, STATE # chain: xor plaintext with previous ciphertext/IV
2593 call _aesni_enc1
2594 movups STATE, (OUTP) # store output
2595 sub $16, LEN
2596 add $16, INP
2597 add $16, OUTP
2598 cmp $16, LEN
2599 jge .Lcbc_enc_loop
2600 movups STATE, (IVP) # save last ciphertext block as next IV
2601.Lcbc_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002602#ifndef __x86_64__
2603 popl KLEN
2604 popl KEYP
2605 popl LEN
2606 popl IVP
2607#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002608 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002609 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002610ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002611
2612/*
2613 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2614 * size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes. Decryption has no serial dependency, so 4 blocks
 * are decrypted in parallel when len >= 64; each plaintext is then xored
 * with the preceding ciphertext block (or the IV). The last ciphertext
 * block is written back through iv for chaining.
2615 */
2616ENTRY(aesni_cbc_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002617 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002618#ifndef __x86_64__
2619 pushl IVP
2620 pushl LEN
2621 pushl KEYP
2622 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002623 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2624 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2625 movl (FRAME_OFFSET+28)(%esp), INP # src
2626 movl (FRAME_OFFSET+32)(%esp), LEN # len
2627 movl (FRAME_OFFSET+36)(%esp), IVP # iv
Mathias Krause0d258ef2010-11-27 16:34:46 +08002628#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002629 cmp $16, LEN
Huang Yinge6efaa02009-06-18 19:33:57 +08002630 jb .Lcbc_dec_just_ret
Huang Ying54b6a1b2009-01-18 16:28:34 +11002631 mov 480(KEYP), KLEN
2632 add $240, KEYP # point KEYP at the decryption key schedule (+240)
2633 movups (IVP), IV
2634 cmp $64, LEN
2635 jb .Lcbc_dec_loop1
2636.align 4
2637.Lcbc_dec_loop4:
2638 movups (INP), IN1
2639 movaps IN1, STATE1
2640 movups 0x10(INP), IN2
2641 movaps IN2, STATE2
Mathias Krause0d258ef2010-11-27 16:34:46 +08002642#ifdef __x86_64__
Huang Ying54b6a1b2009-01-18 16:28:34 +11002643 movups 0x20(INP), IN3
2644 movaps IN3, STATE3
2645 movups 0x30(INP), IN4
2646 movaps IN4, STATE4
Mathias Krause0d258ef2010-11-27 16:34:46 +08002647#else
2648 movups 0x20(INP), IN1 # 32-bit: only 8 XMM regs, so reuse IN1/IN2
2649 movaps IN1, STATE3
2650 movups 0x30(INP), IN2
2651 movaps IN2, STATE4
2652#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002653 call _aesni_dec4
2654 pxor IV, STATE1
Mathias Krause0d258ef2010-11-27 16:34:46 +08002655#ifdef __x86_64__
Huang Ying54b6a1b2009-01-18 16:28:34 +11002656 pxor IN1, STATE2
2657 pxor IN2, STATE3
2658 pxor IN3, STATE4
2659 movaps IN4, IV # next IV = last ciphertext block
Mathias Krause0d258ef2010-11-27 16:34:46 +08002660#else
Mathias Krause0d258ef2010-11-27 16:34:46 +08002661 pxor IN1, STATE4
2662 movaps IN2, IV # next IV = last ciphertext block
Mathias Krause7c8d5182012-05-30 01:43:08 +02002663 movups (INP), IN1 # reload ciphertext blocks 0-1 clobbered above
2664 pxor IN1, STATE2
2665 movups 0x10(INP), IN2
2666 pxor IN2, STATE3
Mathias Krause0d258ef2010-11-27 16:34:46 +08002667#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002668 movups STATE1, (OUTP)
2669 movups STATE2, 0x10(OUTP)
2670 movups STATE3, 0x20(OUTP)
2671 movups STATE4, 0x30(OUTP)
2672 sub $64, LEN
2673 add $64, INP
2674 add $64, OUTP
2675 cmp $64, LEN
2676 jge .Lcbc_dec_loop4
2677 cmp $16, LEN
2678 jb .Lcbc_dec_ret
2679.align 4
2680.Lcbc_dec_loop1: # single-block tail loop
2681 movups (INP), IN
2682 movaps IN, STATE
2683 call _aesni_dec1
2684 pxor IV, STATE
2685 movups STATE, (OUTP)
2686 movaps IN, IV # next IV = this ciphertext block
2687 sub $16, LEN
2688 add $16, INP
2689 add $16, OUTP
2690 cmp $16, LEN
2691 jge .Lcbc_dec_loop1
Huang Ying54b6a1b2009-01-18 16:28:34 +11002692.Lcbc_dec_ret:
Huang Yinge6efaa02009-06-18 19:33:57 +08002693 movups IV, (IVP) # write chaining value back for the caller
2694.Lcbc_dec_just_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002695#ifndef __x86_64__
2696 popl KLEN
2697 popl KEYP
2698 popl LEN
2699 popl IVP
2700#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002701 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002702 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002703ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002704
Mathias Krause0d258ef2010-11-27 16:34:46 +08002705#ifdef __x86_64__
Josh Poimboeuf1253cab2016-01-21 16:49:15 -06002706.pushsection .rodata
Huang Ying12387a42010-03-10 18:28:55 +08002707.align 16
2708.Lbswap_mask: # PSHUFB shuffle mask that reverses all 16 bytes of an XMM reg
2709 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
Josh Poimboeuf1253cab2016-01-21 16:49:15 -06002710.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002711
2712/*
2713 * _aesni_inc_init: internal ABI
2714 * setup registers used by _aesni_inc
 * Loads the byte-swap mask, converts the big-endian IV to a little-endian
 * counter, and caches its low qword in a GPR for carry detection.
2715 * input:
2716 * IV
2717 * output:
2718 * CTR: == IV, in little endian
2719 * TCTR_LOW: == lower qword of CTR
2720 * INC: == 1, in little endian
2721 * BSWAP_MASK == endian swapping mask
2722 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002723.align 4
Huang Ying12387a42010-03-10 18:28:55 +08002724_aesni_inc_init:
2725 movaps .Lbswap_mask, BSWAP_MASK
2726 movaps IV, CTR
2727 PSHUFB_XMM BSWAP_MASK CTR # counter in little endian
2728 mov $1, TCTR_LOW
Huang Ying32cbd7d2010-03-13 16:28:42 +08002729 MOVQ_R64_XMM TCTR_LOW INC # INC = 1 (low qword)
2730 MOVQ_R64_XMM CTR TCTR_LOW # keep low qword of counter in a GPR
Huang Ying12387a42010-03-10 18:28:55 +08002731 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002732ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002733
2734/*
2735 * _aesni_inc: internal ABI
2736 * Increase IV by 1, IV is in big endian
 * The 128-bit increment is done on the little-endian copy in CTR; the
 * parallel GPR add on TCTR_LOW exposes carry-out of the low qword in CF
 * so the carry can be propagated into the high qword when needed.
2737 * input:
2738 * IV
2739 * CTR: == IV, in little endian
2740 * TCTR_LOW: == lower qword of CTR
2741 * INC: == 1, in little endian
2742 * BSWAP_MASK == endian swapping mask
2743 * output:
2744 * IV: Increased by 1
2745 * changed:
2746 * CTR: == output IV, in little endian
2747 * TCTR_LOW: == lower qword of CTR
2748 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002749.align 4
Huang Ying12387a42010-03-10 18:28:55 +08002750_aesni_inc:
2751 paddq INC, CTR # bump low qword of the counter
2752 add $1, TCTR_LOW # mirror add in GPR so carry is visible in CF
2753 jnc .Linc_low
2754 pslldq $8, INC # carry: move the 1 into the high qword
2755 paddq INC, CTR # ... and propagate it
2756 psrldq $8, INC # restore INC = 1 in the low qword
2757.Linc_low:
2758 movaps CTR, IV
2759 PSHUFB_XMM BSWAP_MASK IV # back to big-endian IV
2760 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002761ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002762
2763/*
2764 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2765 * size_t len, u8 *iv)
 *
 * CTR-mode encrypt/decrypt (same operation): encrypts successive counter
 * values and xors them with src. 4 blocks per iteration when len >= 64;
 * the updated counter is written back through iv. Trailing bytes (< 16)
 * are ignored by this routine.
2766 */
2767ENTRY(aesni_ctr_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002768 FRAME_BEGIN
Huang Ying12387a42010-03-10 18:28:55 +08002769 cmp $16, LEN
2770 jb .Lctr_enc_just_ret
2771 mov 480(KEYP), KLEN
2772 movups (IVP), IV
2773 call _aesni_inc_init # set up CTR/INC/BSWAP_MASK/TCTR_LOW
2774 cmp $64, LEN
2775 jb .Lctr_enc_loop1
2776.align 4
2777.Lctr_enc_loop4: # 4 counter blocks per iteration
2778 movaps IV, STATE1
2779 call _aesni_inc
2780 movups (INP), IN1
2781 movaps IV, STATE2
2782 call _aesni_inc
2783 movups 0x10(INP), IN2
2784 movaps IV, STATE3
2785 call _aesni_inc
2786 movups 0x20(INP), IN3
2787 movaps IV, STATE4
2788 call _aesni_inc
2789 movups 0x30(INP), IN4
2790 call _aesni_enc4
2791 pxor IN1, STATE1 # keystream xor input
2792 movups STATE1, (OUTP)
2793 pxor IN2, STATE2
2794 movups STATE2, 0x10(OUTP)
2795 pxor IN3, STATE3
2796 movups STATE3, 0x20(OUTP)
2797 pxor IN4, STATE4
2798 movups STATE4, 0x30(OUTP)
2799 sub $64, LEN
2800 add $64, INP
2801 add $64, OUTP
2802 cmp $64, LEN
2803 jge .Lctr_enc_loop4
2804 cmp $16, LEN
2805 jb .Lctr_enc_ret
2806.align 4
2807.Lctr_enc_loop1: # single-block tail loop
2808 movaps IV, STATE
2809 call _aesni_inc
2810 movups (INP), IN
2811 call _aesni_enc1
2812 pxor IN, STATE
2813 movups STATE, (OUTP)
2814 sub $16, LEN
2815 add $16, INP
2816 add $16, OUTP
2817 cmp $16, LEN
2818 jge .Lctr_enc_loop1
2819.Lctr_enc_ret:
2820 movups IV, (IVP) # save updated counter for the caller
2821.Lctr_enc_just_ret:
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002822 FRAME_END
Huang Ying12387a42010-03-10 18:28:55 +08002823 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002824ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002825
2826/*
2827 * _aesni_gf128mul_x_ble: internal ABI
2828 * Multiply in GF(2^128) for XTS IVs
 * Computes IV := IV * x in little-endian ("ble") representation:
 * pshufd/psrad broadcasts the sign bits of both qwords, pand with
 * GF128MUL_MASK (0x87 / 0x01) builds the reduction/carry terms, paddq
 * doubles each qword, and pxor folds the carry and the 0x87 reduction
 * polynomial back in.
 * NOTE: comments inside the #define body are not possible because of the
 * backslash line continuations.
2829 * input:
2830 * IV: current IV
2831 * GF128MUL_MASK == mask with 0x87 and 0x01
2832 * output:
2833 * IV: next IV
2834 * changed:
2835 * CTR: == temporary value
2836 */
2837#define _aesni_gf128mul_x_ble() \
2838 pshufd $0x13, IV, CTR; \
2839 paddq IV, IV; \
2840 psrad $31, CTR; \
2841 pand GF128MUL_MASK, CTR; \
2842 pxor CTR, IV;
2843
2844/*
2845 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2846 * bool enc, u8 *iv)
 *
 * XTS encrypt or decrypt of eight consecutive 16-byte blocks (x86_64 only).
 * The enc flag selects _aesni_enc4/_aesni_dec4 and the matching half of the
 * key schedule branchlessly via cmov. For each block the tweak is stashed
 * in the dst buffer, xored into the state before the cipher, and xored
 * back in afterwards. The advanced tweak is written back through iv.
2847 */
2848ENTRY(aesni_xts_crypt8)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002849 FRAME_BEGIN
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002850 cmpb $0, %cl # enc flag (arg4): nonzero = encrypt
2851 movl $0, %ecx
2852 movl $240, %r10d
2853 leaq _aesni_enc4, %r11
2854 leaq _aesni_dec4, %rax
2855 cmovel %r10d, %ecx # decrypt: key schedule offset = 240
2856 cmoveq %rax, %r11 # decrypt: cipher routine = _aesni_dec4
2857
2858 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2859 movups (IVP), IV
2860
2861 mov 480(KEYP), KLEN
2862 addq %rcx, KEYP
2863
2864 movdqa IV, STATE1
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002865 movdqu 0x00(INP), INC
2866 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002867 movdqu IV, 0x00(OUTP) # stash tweak in dst; xored back after cipher
2868
2869 _aesni_gf128mul_x_ble()
2870 movdqa IV, STATE2
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002871 movdqu 0x10(INP), INC
2872 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002873 movdqu IV, 0x10(OUTP)
2874
2875 _aesni_gf128mul_x_ble()
2876 movdqa IV, STATE3
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002877 movdqu 0x20(INP), INC
2878 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002879 movdqu IV, 0x20(OUTP)
2880
2881 _aesni_gf128mul_x_ble()
2882 movdqa IV, STATE4
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002883 movdqu 0x30(INP), INC
2884 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002885 movdqu IV, 0x30(OUTP)
2886
2887 call *%r11 # cipher blocks 0-3
2888
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002889 movdqu 0x00(OUTP), INC # reload stashed tweak
2890 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002891 movdqu STATE1, 0x00(OUTP)
2892
2893 _aesni_gf128mul_x_ble()
2894 movdqa IV, STATE1
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002895 movdqu 0x40(INP), INC
2896 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002897 movdqu IV, 0x40(OUTP)
2898
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002899 movdqu 0x10(OUTP), INC
2900 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002901 movdqu STATE2, 0x10(OUTP)
2902
2903 _aesni_gf128mul_x_ble()
2904 movdqa IV, STATE2
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002905 movdqu 0x50(INP), INC
2906 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002907 movdqu IV, 0x50(OUTP)
2908
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002909 movdqu 0x20(OUTP), INC
2910 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002911 movdqu STATE3, 0x20(OUTP)
2912
2913 _aesni_gf128mul_x_ble()
2914 movdqa IV, STATE3
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002915 movdqu 0x60(INP), INC
2916 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002917 movdqu IV, 0x60(OUTP)
2918
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002919 movdqu 0x30(OUTP), INC
2920 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002921 movdqu STATE4, 0x30(OUTP)
2922
2923 _aesni_gf128mul_x_ble()
2924 movdqa IV, STATE4
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002925 movdqu 0x70(INP), INC
2926 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002927 movdqu IV, 0x70(OUTP)
2928
2929 _aesni_gf128mul_x_ble()
2930 movups IV, (IVP) # save advanced tweak for the caller
2931
2932 call *%r11 # cipher blocks 4-7
2933
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002934 movdqu 0x40(OUTP), INC
2935 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002936 movdqu STATE1, 0x40(OUTP)
2937
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002938 movdqu 0x50(OUTP), INC
2939 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002940 movdqu STATE2, 0x50(OUTP)
2941
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002942 movdqu 0x60(OUTP), INC
2943 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002944 movdqu STATE3, 0x60(OUTP)
2945
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002946 movdqu 0x70(OUTP), INC
2947 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002948 movdqu STATE4, 0x70(OUTP)
2949
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002950 FRAME_END
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002951 ret
2952ENDPROC(aesni_xts_crypt8)
2953
Mathias Krause0d258ef2010-11-27 16:34:46 +08002954#endif