blob: 251a65f841984355b55a1ff75892293178e6dba7 [file] [log] [blame]
Huang Ying54b6a1b2009-01-18 16:28:34 +11001/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040012 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080023 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060034#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000035#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110036
Timothy McCaffreye31ac322015-01-13 13:16:43 -050037/*
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can done for either FP or integer values, for FP use
40 * movaps (move aligned packed single) or integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
44 */
45#define MOVADQ movaps
46#define MOVUDQ movups
47
Mathias Krause559ad0f2010-11-29 08:35:39 +080048#ifdef __x86_64__
Timothy McCaffreye31ac322015-01-13 13:16:43 -050049
Denys Vlasenkoe1839142017-01-19 22:33:04 +010050# constants in mergeable sections, linker can reorder and merge
51.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +030052.align 16
53.Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
Denys Vlasenkoe1839142017-01-19 22:33:04 +010055.section .rodata.cst16.POLY, "aM", @progbits, 16
56.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040057POLY: .octa 0xC2000000000000000000000000000001
Denys Vlasenkoe1839142017-01-19 22:33:04 +010058.section .rodata.cst16.TWOONE, "aM", @progbits, 16
59.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040060TWOONE: .octa 0x00000001000000000000000000000001
61
Denys Vlasenkoe1839142017-01-19 22:33:04 +010062.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63.align 16
64SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65.section .rodata.cst16.MASK1, "aM", @progbits, 16
66.align 16
67MASK1: .octa 0x0000000000000000ffffffffffffffff
68.section .rodata.cst16.MASK2, "aM", @progbits, 16
69.align 16
70MASK2: .octa 0xffffffffffffffff0000000000000000
71.section .rodata.cst16.ONE, "aM", @progbits, 16
72.align 16
73ONE: .octa 0x00000000000000000000000000000001
74.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75.align 16
76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77.section .rodata.cst16.dec, "aM", @progbits, 16
78.align 16
79dec: .octa 0x1
80.section .rodata.cst16.enc, "aM", @progbits, 16
81.align 16
82enc: .octa 0x2
83
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040084# order of these constants should not change.
85# more specifically, ALL_F should follow SHIFT_MASK,
Denys Vlasenkoe1839142017-01-19 22:33:04 +010086# and zero should follow ALL_F
87.section .rodata, "a", @progbits
88.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040089SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90ALL_F: .octa 0xffffffffffffffffffffffffffffffff
Denys Vlasenkoe1839142017-01-19 22:33:04 +010091 .octa 0x00000000000000000000000000000000
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040092
Huang Ying54b6a1b2009-01-18 16:28:34 +110093.text
94
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040095
96#define STACK_OFFSET 8*3
97#define HashKey 16*0 // store HashKey <<1 mod poly here
98#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
99#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
100#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
101#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
102 // bits of HashKey <<1 mod poly here
103 //(for Karatsuba purposes)
104#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
105 // bits of HashKey^2 <<1 mod poly here
106 // (for Karatsuba purposes)
107#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^3 <<1 mod poly here
109 // (for Karatsuba purposes)
110#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^4 <<1 mod poly here
112 // (for Karatsuba purposes)
113#define VARIABLE_OFFSET 16*8
114
115#define arg1 rdi
116#define arg2 rsi
117#define arg3 rdx
118#define arg4 rcx
119#define arg5 r8
120#define arg6 r9
121#define arg7 STACK_OFFSET+8(%r14)
122#define arg8 STACK_OFFSET+16(%r14)
123#define arg9 STACK_OFFSET+24(%r14)
124#define arg10 STACK_OFFSET+32(%r14)
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500125#define keysize 2*15*16(%arg1)
Mathias Krause559ad0f2010-11-29 08:35:39 +0800126#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400127
128
Huang Ying54b6a1b2009-01-18 16:28:34 +1100129#define STATE1 %xmm0
130#define STATE2 %xmm4
131#define STATE3 %xmm5
132#define STATE4 %xmm6
133#define STATE STATE1
134#define IN1 %xmm1
135#define IN2 %xmm7
136#define IN3 %xmm8
137#define IN4 %xmm9
138#define IN IN1
139#define KEY %xmm2
140#define IV %xmm3
Mathias Krause0d258ef2010-11-27 16:34:46 +0800141
Huang Ying12387a42010-03-10 18:28:55 +0800142#define BSWAP_MASK %xmm10
143#define CTR %xmm11
144#define INC %xmm12
Huang Ying54b6a1b2009-01-18 16:28:34 +1100145
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +0300146#define GF128MUL_MASK %xmm10
147
Mathias Krause0d258ef2010-11-27 16:34:46 +0800148#ifdef __x86_64__
149#define AREG %rax
Huang Ying54b6a1b2009-01-18 16:28:34 +1100150#define KEYP %rdi
151#define OUTP %rsi
Mathias Krause0d258ef2010-11-27 16:34:46 +0800152#define UKEYP OUTP
Huang Ying54b6a1b2009-01-18 16:28:34 +1100153#define INP %rdx
154#define LEN %rcx
155#define IVP %r8
156#define KLEN %r9d
157#define T1 %r10
158#define TKEYP T1
159#define T2 %r11
Huang Ying12387a42010-03-10 18:28:55 +0800160#define TCTR_LOW T2
Mathias Krause0d258ef2010-11-27 16:34:46 +0800161#else
162#define AREG %eax
163#define KEYP %edi
164#define OUTP AREG
165#define UKEYP OUTP
166#define INP %edx
167#define LEN %esi
168#define IVP %ebp
169#define KLEN %ebx
170#define T1 %ecx
171#define TKEYP T1
172#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100173
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400174
Mathias Krause559ad0f2010-11-29 08:35:39 +0800175#ifdef __x86_64__
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
177*
178*
179* Input: A and B (128-bits each, bit-reflected)
180* Output: C = A*B*x mod poly, (i.e. >>1 )
181* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
182* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
183*
184*/
185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
186 movdqa \GH, \TMP1
187 pshufd $78, \GH, \TMP2
188 pshufd $78, \HK, \TMP3
189 pxor \GH, \TMP2 # TMP2 = a1+a0
190 pxor \HK, \TMP3 # TMP3 = b1+b0
191 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
192 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
193 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
194 pxor \GH, \TMP2
195 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
196 movdqa \TMP2, \TMP3
197 pslldq $8, \TMP3 # left shift TMP3 2 DWs
198 psrldq $8, \TMP2 # right shift TMP2 2 DWs
199 pxor \TMP3, \GH
200 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
201
202 # first phase of the reduction
203
204 movdqa \GH, \TMP2
205 movdqa \GH, \TMP3
206 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
207 # in in order to perform
208 # independent shifts
209 pslld $31, \TMP2 # packed right shift <<31
210 pslld $30, \TMP3 # packed right shift <<30
211 pslld $25, \TMP4 # packed right shift <<25
212 pxor \TMP3, \TMP2 # xor the shifted versions
213 pxor \TMP4, \TMP2
214 movdqa \TMP2, \TMP5
215 psrldq $4, \TMP5 # right shift TMP5 1 DW
216 pslldq $12, \TMP2 # left shift TMP2 3 DWs
217 pxor \TMP2, \GH
218
219 # second phase of the reduction
220
221 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
222 # in in order to perform
223 # independent shifts
224 movdqa \GH,\TMP3
225 movdqa \GH,\TMP4
226 psrld $1,\TMP2 # packed left shift >>1
227 psrld $2,\TMP3 # packed left shift >>2
228 psrld $7,\TMP4 # packed left shift >>7
229 pxor \TMP3,\TMP2 # xor the shifted versions
230 pxor \TMP4,\TMP2
231 pxor \TMP5, \TMP2
232 pxor \TMP2, \GH
233 pxor \TMP1, \GH # result is in TMP1
234.endm
235
Junaid Shahidb20209c2017-12-20 17:08:37 -0800236# Reads DLEN bytes starting at DPTR and stores in XMMDst
237# where 0 < DLEN < 16
238# Clobbers %rax, DLEN and XMM1
239.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
240 cmp $8, \DLEN
241 jl _read_lt8_\@
242 mov (\DPTR), %rax
243 MOVQ_R64_XMM %rax, \XMMDst
244 sub $8, \DLEN
245 jz _done_read_partial_block_\@
246 xor %eax, %eax
247_read_next_byte_\@:
248 shl $8, %rax
249 mov 7(\DPTR, \DLEN, 1), %al
250 dec \DLEN
251 jnz _read_next_byte_\@
252 MOVQ_R64_XMM %rax, \XMM1
253 pslldq $8, \XMM1
254 por \XMM1, \XMMDst
255 jmp _done_read_partial_block_\@
256_read_lt8_\@:
257 xor %eax, %eax
258_read_next_byte_lt8_\@:
259 shl $8, %rax
260 mov -1(\DPTR, \DLEN, 1), %al
261 dec \DLEN
262 jnz _read_next_byte_lt8_\@
263 MOVQ_R64_XMM %rax, \XMMDst
264_done_read_partial_block_\@:
265.endm
266
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400267/*
268* if a = number of total plaintext bytes
269* b = floor(a/16)
270* num_initial_blocks = b mod 4
271* encrypt the initial num_initial_blocks blocks and apply ghash on
272* the ciphertext
273* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
274* are clobbered
275* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
276*/
277
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400278
Dave Watsone1fd3162018-02-14 09:38:12 -0800279.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800280XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500281 MOVADQ SHUF_MASK(%rip), %xmm14
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800282 mov arg7, %r10 # %r10 = AAD
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800283 mov arg8, %r11 # %r11 = aadLen
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800284 pxor %xmm\i, %xmm\i
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200285 pxor \XMM2, \XMM2
286
287 cmp $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800288 jl _get_AAD_rest\@
289_get_AAD_blocks\@:
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200290 movdqu (%r10), %xmm\i
291 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
292 pxor %xmm\i, \XMM2
293 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
294 add $16, %r10
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200295 sub $16, %r11
296 cmp $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800297 jge _get_AAD_blocks\@
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200298
299 movdqu \XMM2, %xmm\i
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800300
301 /* read the last <16B of AAD */
Dave Watsone1fd3162018-02-14 09:38:12 -0800302_get_AAD_rest\@:
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200303 cmp $0, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800304 je _get_AAD_done\@
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200305
Junaid Shahid1ecdd372017-12-20 17:08:38 -0800306 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800307 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200308 pxor \XMM2, %xmm\i
309 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800310
Dave Watsone1fd3162018-02-14 09:38:12 -0800311_get_AAD_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800312 xor %r11, %r11 # initialise the data pointer offset as zero
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200313 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800314
315 mov %arg5, %rax # %rax = *Y0
316 movdqu (%rax), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800317 PSHUFB_XMM %xmm14, \XMM0
318
319.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800320
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500321 MOVADQ ONE(%RIP),\TMP1
322 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800323.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500324 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800325.ifc \operation, dec
326 movdqa \XMM0, %xmm\index
327.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500328 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800329.endif
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500330 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
331 pxor \TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800332.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500333 lea 0x10(%arg1),%r10
334 mov keysize,%eax
335 shr $2,%eax # 128->4, 192->6, 256->8
336 add $5,%eax # 128->9, 192->11, 256->13
337
Dave Watsone1fd3162018-02-14 09:38:12 -0800338aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500339 MOVADQ (%r10),\TMP1
340.irpc index, \i_seq
341 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800342.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500343 add $16,%r10
344 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800345 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500346
347 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800348.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500349 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800350.endr
351.irpc index, \i_seq
352 movdqu (%arg3 , %r11, 1), \TMP1
353 pxor \TMP1, %xmm\index
354 movdqu %xmm\index, (%arg2 , %r11, 1)
355 # write back plaintext/ciphertext for num_initial_blocks
356 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800357
358.ifc \operation, dec
359 movdqa \TMP1, %xmm\index
360.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800361 PSHUFB_XMM %xmm14, %xmm\index
362
363 # prepare plaintext/ciphertext for GHASH computation
364.endr
365.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200366
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800367 # apply GHASH on num_initial_blocks blocks
368
369.if \i == 5
370 pxor %xmm5, %xmm6
371 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
372 pxor %xmm6, %xmm7
373 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
374 pxor %xmm7, %xmm8
375 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
376.elseif \i == 6
377 pxor %xmm6, %xmm7
378 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
379 pxor %xmm7, %xmm8
380 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
381.elseif \i == 7
382 pxor %xmm7, %xmm8
383 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
384.endif
385 cmp $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800386 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800387 # no need for precomputed values
388/*
389*
390* Precomputations for HashKey parallel with encryption of first 4 blocks.
391* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
392*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500393 MOVADQ ONE(%RIP),\TMP1
394 paddd \TMP1, \XMM0 # INCR Y0
395 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800396 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
397
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500398 paddd \TMP1, \XMM0 # INCR Y0
399 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800400 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
401
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500402 paddd \TMP1, \XMM0 # INCR Y0
403 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800404 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
405
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500406 paddd \TMP1, \XMM0 # INCR Y0
407 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800408 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
409
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500410 MOVADQ 0(%arg1),\TMP1
411 pxor \TMP1, \XMM1
412 pxor \TMP1, \XMM2
413 pxor \TMP1, \XMM3
414 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800415 movdqa \TMP3, \TMP5
416 pshufd $78, \TMP3, \TMP1
417 pxor \TMP3, \TMP1
418 movdqa \TMP1, HashKey_k(%rsp)
419 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
420# TMP5 = HashKey^2<<1 (mod poly)
421 movdqa \TMP5, HashKey_2(%rsp)
422# HashKey_2 = HashKey^2<<1 (mod poly)
423 pshufd $78, \TMP5, \TMP1
424 pxor \TMP5, \TMP1
425 movdqa \TMP1, HashKey_2_k(%rsp)
426.irpc index, 1234 # do 4 rounds
427 movaps 0x10*\index(%arg1), \TMP1
428 AESENC \TMP1, \XMM1
429 AESENC \TMP1, \XMM2
430 AESENC \TMP1, \XMM3
431 AESENC \TMP1, \XMM4
432.endr
433 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
434# TMP5 = HashKey^3<<1 (mod poly)
435 movdqa \TMP5, HashKey_3(%rsp)
436 pshufd $78, \TMP5, \TMP1
437 pxor \TMP5, \TMP1
438 movdqa \TMP1, HashKey_3_k(%rsp)
439.irpc index, 56789 # do next 5 rounds
440 movaps 0x10*\index(%arg1), \TMP1
441 AESENC \TMP1, \XMM1
442 AESENC \TMP1, \XMM2
443 AESENC \TMP1, \XMM3
444 AESENC \TMP1, \XMM4
445.endr
446 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
447# TMP5 = HashKey^3<<1 (mod poly)
448 movdqa \TMP5, HashKey_4(%rsp)
449 pshufd $78, \TMP5, \TMP1
450 pxor \TMP5, \TMP1
451 movdqa \TMP1, HashKey_4_k(%rsp)
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500452 lea 0xa0(%arg1),%r10
453 mov keysize,%eax
454 shr $2,%eax # 128->4, 192->6, 256->8
455 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800456 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500457
Dave Watsone1fd3162018-02-14 09:38:12 -0800458aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500459 MOVADQ (%r10),\TMP2
460.irpc index, 1234
461 AESENC \TMP2, %xmm\index
462.endr
463 add $16,%r10
464 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800465 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500466
Dave Watsone1fd3162018-02-14 09:38:12 -0800467aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500468 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800469 AESENCLAST \TMP2, \XMM1
470 AESENCLAST \TMP2, \XMM2
471 AESENCLAST \TMP2, \XMM3
472 AESENCLAST \TMP2, \XMM4
473 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
474 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800475.ifc \operation, dec
476 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
477 movdqa \TMP1, \XMM1
478.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800479 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
480 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800481.ifc \operation, dec
482 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
483 movdqa \TMP1, \XMM2
484.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800485 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
486 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800487.ifc \operation, dec
488 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
489 movdqa \TMP1, \XMM3
490.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800491 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
492 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800493.ifc \operation, dec
494 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
495 movdqa \TMP1, \XMM4
496.else
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400497 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
498 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
499 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
500 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800501.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800502
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400503 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800504 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400505 pxor \XMMDst, \XMM1
506# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800507 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800508 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800509 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
510
Dave Watsone1fd3162018-02-14 09:38:12 -0800511_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800512
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400513.endm
514
515/*
516* encrypt 4 blocks at a time
517* ghash the 4 previously encrypted ciphertext blocks
518* arg1, %arg2, %arg3 are used as pointers only, not modified
519* %r11 is the data offset value
520*/
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800521.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400522TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
523
524 movdqa \XMM1, \XMM5
525 movdqa \XMM2, \XMM6
526 movdqa \XMM3, \XMM7
527 movdqa \XMM4, \XMM8
528
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800529 movdqa SHUF_MASK(%rip), %xmm15
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400530 # multiply TMP5 * HashKey using karatsuba
531
532 movdqa \XMM5, \TMP4
533 pshufd $78, \XMM5, \TMP6
534 pxor \XMM5, \TMP6
535 paddd ONE(%rip), \XMM0 # INCR CNT
536 movdqa HashKey_4(%rsp), \TMP5
537 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
538 movdqa \XMM0, \XMM1
539 paddd ONE(%rip), \XMM0 # INCR CNT
540 movdqa \XMM0, \XMM2
541 paddd ONE(%rip), \XMM0 # INCR CNT
542 movdqa \XMM0, \XMM3
543 paddd ONE(%rip), \XMM0 # INCR CNT
544 movdqa \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800545 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400546 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800547 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
548 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
549 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
550
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400551 pxor (%arg1), \XMM1
552 pxor (%arg1), \XMM2
553 pxor (%arg1), \XMM3
554 pxor (%arg1), \XMM4
555 movdqa HashKey_4_k(%rsp), \TMP5
556 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
557 movaps 0x10(%arg1), \TMP1
558 AESENC \TMP1, \XMM1 # Round 1
559 AESENC \TMP1, \XMM2
560 AESENC \TMP1, \XMM3
561 AESENC \TMP1, \XMM4
562 movaps 0x20(%arg1), \TMP1
563 AESENC \TMP1, \XMM1 # Round 2
564 AESENC \TMP1, \XMM2
565 AESENC \TMP1, \XMM3
566 AESENC \TMP1, \XMM4
567 movdqa \XMM6, \TMP1
568 pshufd $78, \XMM6, \TMP2
569 pxor \XMM6, \TMP2
570 movdqa HashKey_3(%rsp), \TMP5
571 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
572 movaps 0x30(%arg1), \TMP3
573 AESENC \TMP3, \XMM1 # Round 3
574 AESENC \TMP3, \XMM2
575 AESENC \TMP3, \XMM3
576 AESENC \TMP3, \XMM4
577 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
578 movaps 0x40(%arg1), \TMP3
579 AESENC \TMP3, \XMM1 # Round 4
580 AESENC \TMP3, \XMM2
581 AESENC \TMP3, \XMM3
582 AESENC \TMP3, \XMM4
583 movdqa HashKey_3_k(%rsp), \TMP5
584 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
585 movaps 0x50(%arg1), \TMP3
586 AESENC \TMP3, \XMM1 # Round 5
587 AESENC \TMP3, \XMM2
588 AESENC \TMP3, \XMM3
589 AESENC \TMP3, \XMM4
590 pxor \TMP1, \TMP4
591# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
592 pxor \XMM6, \XMM5
593 pxor \TMP2, \TMP6
594 movdqa \XMM7, \TMP1
595 pshufd $78, \XMM7, \TMP2
596 pxor \XMM7, \TMP2
597 movdqa HashKey_2(%rsp ), \TMP5
598
599 # Multiply TMP5 * HashKey using karatsuba
600
601 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
602 movaps 0x60(%arg1), \TMP3
603 AESENC \TMP3, \XMM1 # Round 6
604 AESENC \TMP3, \XMM2
605 AESENC \TMP3, \XMM3
606 AESENC \TMP3, \XMM4
607 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
608 movaps 0x70(%arg1), \TMP3
609 AESENC \TMP3, \XMM1 # Round 7
610 AESENC \TMP3, \XMM2
611 AESENC \TMP3, \XMM3
612 AESENC \TMP3, \XMM4
613 movdqa HashKey_2_k(%rsp), \TMP5
614 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
615 movaps 0x80(%arg1), \TMP3
616 AESENC \TMP3, \XMM1 # Round 8
617 AESENC \TMP3, \XMM2
618 AESENC \TMP3, \XMM3
619 AESENC \TMP3, \XMM4
620 pxor \TMP1, \TMP4
621# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
622 pxor \XMM7, \XMM5
623 pxor \TMP2, \TMP6
624
625 # Multiply XMM8 * HashKey
626 # XMM8 and TMP5 hold the values for the two operands
627
628 movdqa \XMM8, \TMP1
629 pshufd $78, \XMM8, \TMP2
630 pxor \XMM8, \TMP2
631 movdqa HashKey(%rsp), \TMP5
632 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
633 movaps 0x90(%arg1), \TMP3
634 AESENC \TMP3, \XMM1 # Round 9
635 AESENC \TMP3, \XMM2
636 AESENC \TMP3, \XMM3
637 AESENC \TMP3, \XMM4
638 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500639 lea 0xa0(%arg1),%r10
640 mov keysize,%eax
641 shr $2,%eax # 128->4, 192->6, 256->8
642 sub $4,%eax # 128->0, 192->2, 256->4
643 jz aes_loop_par_enc_done
644
645aes_loop_par_enc:
646 MOVADQ (%r10),\TMP3
647.irpc index, 1234
648 AESENC \TMP3, %xmm\index
649.endr
650 add $16,%r10
651 sub $1,%eax
652 jnz aes_loop_par_enc
653
654aes_loop_par_enc_done:
655 MOVADQ (%r10), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400656 AESENCLAST \TMP3, \XMM1 # Round 10
657 AESENCLAST \TMP3, \XMM2
658 AESENCLAST \TMP3, \XMM3
659 AESENCLAST \TMP3, \XMM4
660 movdqa HashKey_k(%rsp), \TMP5
661 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
662 movdqu (%arg3,%r11,1), \TMP3
663 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400664 movdqu 16(%arg3,%r11,1), \TMP3
665 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400666 movdqu 32(%arg3,%r11,1), \TMP3
667 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400668 movdqu 48(%arg3,%r11,1), \TMP3
669 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800670 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
671 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
672 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
673 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
674 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
675 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
676 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
677 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
678
679 pxor \TMP4, \TMP1
680 pxor \XMM8, \XMM5
681 pxor \TMP6, \TMP2
682 pxor \TMP1, \TMP2
683 pxor \XMM5, \TMP2
684 movdqa \TMP2, \TMP3
685 pslldq $8, \TMP3 # left shift TMP3 2 DWs
686 psrldq $8, \TMP2 # right shift TMP2 2 DWs
687 pxor \TMP3, \XMM5
688 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
689
690 # first phase of reduction
691
692 movdqa \XMM5, \TMP2
693 movdqa \XMM5, \TMP3
694 movdqa \XMM5, \TMP4
695# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
696 pslld $31, \TMP2 # packed right shift << 31
697 pslld $30, \TMP3 # packed right shift << 30
698 pslld $25, \TMP4 # packed right shift << 25
699 pxor \TMP3, \TMP2 # xor the shifted versions
700 pxor \TMP4, \TMP2
701 movdqa \TMP2, \TMP5
702 psrldq $4, \TMP5 # right shift T5 1 DW
703 pslldq $12, \TMP2 # left shift T2 3 DWs
704 pxor \TMP2, \XMM5
705
706 # second phase of reduction
707
708 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
709 movdqa \XMM5,\TMP3
710 movdqa \XMM5,\TMP4
711 psrld $1, \TMP2 # packed left shift >>1
712 psrld $2, \TMP3 # packed left shift >>2
713 psrld $7, \TMP4 # packed left shift >>7
714 pxor \TMP3,\TMP2 # xor the shifted versions
715 pxor \TMP4,\TMP2
716 pxor \TMP5, \TMP2
717 pxor \TMP2, \XMM5
718 pxor \TMP1, \XMM5 # result is in TMP1
719
720 pxor \XMM5, \XMM1
721.endm
722
723/*
724* decrypt 4 blocks at a time
725* ghash the 4 previously decrypted ciphertext blocks
726* arg1, %arg2, %arg3 are used as pointers only, not modified
727* %r11 is the data offset value
728*/
729.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
730TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
731
732 movdqa \XMM1, \XMM5
733 movdqa \XMM2, \XMM6
734 movdqa \XMM3, \XMM7
735 movdqa \XMM4, \XMM8
736
737 movdqa SHUF_MASK(%rip), %xmm15
738 # multiply TMP5 * HashKey using karatsuba
739
740 movdqa \XMM5, \TMP4
741 pshufd $78, \XMM5, \TMP6
742 pxor \XMM5, \TMP6
743 paddd ONE(%rip), \XMM0 # INCR CNT
744 movdqa HashKey_4(%rsp), \TMP5
745 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
746 movdqa \XMM0, \XMM1
747 paddd ONE(%rip), \XMM0 # INCR CNT
748 movdqa \XMM0, \XMM2
749 paddd ONE(%rip), \XMM0 # INCR CNT
750 movdqa \XMM0, \XMM3
751 paddd ONE(%rip), \XMM0 # INCR CNT
752 movdqa \XMM0, \XMM4
753 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
754 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
755 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
756 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
757 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
758
759 pxor (%arg1), \XMM1
760 pxor (%arg1), \XMM2
761 pxor (%arg1), \XMM3
762 pxor (%arg1), \XMM4
763 movdqa HashKey_4_k(%rsp), \TMP5
764 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
765 movaps 0x10(%arg1), \TMP1
766 AESENC \TMP1, \XMM1 # Round 1
767 AESENC \TMP1, \XMM2
768 AESENC \TMP1, \XMM3
769 AESENC \TMP1, \XMM4
770 movaps 0x20(%arg1), \TMP1
771 AESENC \TMP1, \XMM1 # Round 2
772 AESENC \TMP1, \XMM2
773 AESENC \TMP1, \XMM3
774 AESENC \TMP1, \XMM4
775 movdqa \XMM6, \TMP1
776 pshufd $78, \XMM6, \TMP2
777 pxor \XMM6, \TMP2
778 movdqa HashKey_3(%rsp), \TMP5
779 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
780 movaps 0x30(%arg1), \TMP3
781 AESENC \TMP3, \XMM1 # Round 3
782 AESENC \TMP3, \XMM2
783 AESENC \TMP3, \XMM3
784 AESENC \TMP3, \XMM4
785 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
786 movaps 0x40(%arg1), \TMP3
787 AESENC \TMP3, \XMM1 # Round 4
788 AESENC \TMP3, \XMM2
789 AESENC \TMP3, \XMM3
790 AESENC \TMP3, \XMM4
791 movdqa HashKey_3_k(%rsp), \TMP5
792 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
793 movaps 0x50(%arg1), \TMP3
794 AESENC \TMP3, \XMM1 # Round 5
795 AESENC \TMP3, \XMM2
796 AESENC \TMP3, \XMM3
797 AESENC \TMP3, \XMM4
798 pxor \TMP1, \TMP4
799# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
800 pxor \XMM6, \XMM5
801 pxor \TMP2, \TMP6
802 movdqa \XMM7, \TMP1
803 pshufd $78, \XMM7, \TMP2
804 pxor \XMM7, \TMP2
805 movdqa HashKey_2(%rsp ), \TMP5
806
807 # Multiply TMP5 * HashKey using karatsuba
808
809 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
810 movaps 0x60(%arg1), \TMP3
811 AESENC \TMP3, \XMM1 # Round 6
812 AESENC \TMP3, \XMM2
813 AESENC \TMP3, \XMM3
814 AESENC \TMP3, \XMM4
815 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
816 movaps 0x70(%arg1), \TMP3
817 AESENC \TMP3, \XMM1 # Round 7
818 AESENC \TMP3, \XMM2
819 AESENC \TMP3, \XMM3
820 AESENC \TMP3, \XMM4
821 movdqa HashKey_2_k(%rsp), \TMP5
822 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
823 movaps 0x80(%arg1), \TMP3
824 AESENC \TMP3, \XMM1 # Round 8
825 AESENC \TMP3, \XMM2
826 AESENC \TMP3, \XMM3
827 AESENC \TMP3, \XMM4
828 pxor \TMP1, \TMP4
829# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
830 pxor \XMM7, \XMM5
831 pxor \TMP2, \TMP6
832
833 # Multiply XMM8 * HashKey
834 # XMM8 and TMP5 hold the values for the two operands
835
836 movdqa \XMM8, \TMP1
837 pshufd $78, \XMM8, \TMP2
838 pxor \XMM8, \TMP2
839 movdqa HashKey(%rsp), \TMP5
840 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
841 movaps 0x90(%arg1), \TMP3
842 AESENC \TMP3, \XMM1 # Round 9
843 AESENC \TMP3, \XMM2
844 AESENC \TMP3, \XMM3
845 AESENC \TMP3, \XMM4
846 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500847 lea 0xa0(%arg1),%r10
848 mov keysize,%eax
849 shr $2,%eax # 128->4, 192->6, 256->8
850 sub $4,%eax # 128->0, 192->2, 256->4
851 jz aes_loop_par_dec_done
852
853aes_loop_par_dec:
854 MOVADQ (%r10),\TMP3
855.irpc index, 1234
856 AESENC \TMP3, %xmm\index
857.endr
858 add $16,%r10
859 sub $1,%eax
860 jnz aes_loop_par_dec
861
862aes_loop_par_dec_done:
863 MOVADQ (%r10), \TMP3
864 AESENCLAST \TMP3, \XMM1 # last round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800865 AESENCLAST \TMP3, \XMM2
866 AESENCLAST \TMP3, \XMM3
867 AESENCLAST \TMP3, \XMM4
868 movdqa HashKey_k(%rsp), \TMP5
869 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
870 movdqu (%arg3,%r11,1), \TMP3
871 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
872 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
873 movdqa \TMP3, \XMM1
874 movdqu 16(%arg3,%r11,1), \TMP3
875 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
876 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
877 movdqa \TMP3, \XMM2
878 movdqu 32(%arg3,%r11,1), \TMP3
879 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
880 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
881 movdqa \TMP3, \XMM3
882 movdqu 48(%arg3,%r11,1), \TMP3
883 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400884 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
885 movdqa \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800886 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
887 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
888 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
889 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400890
891 pxor \TMP4, \TMP1
892 pxor \XMM8, \XMM5
893 pxor \TMP6, \TMP2
894 pxor \TMP1, \TMP2
895 pxor \XMM5, \TMP2
896 movdqa \TMP2, \TMP3
897 pslldq $8, \TMP3 # left shift TMP3 2 DWs
898 psrldq $8, \TMP2 # right shift TMP2 2 DWs
899 pxor \TMP3, \XMM5
900 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
901
902 # first phase of reduction
903
904 movdqa \XMM5, \TMP2
905 movdqa \XMM5, \TMP3
906 movdqa \XMM5, \TMP4
907# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
908 pslld $31, \TMP2 # packed right shift << 31
909 pslld $30, \TMP3 # packed right shift << 30
910 pslld $25, \TMP4 # packed right shift << 25
911 pxor \TMP3, \TMP2 # xor the shifted versions
912 pxor \TMP4, \TMP2
913 movdqa \TMP2, \TMP5
914 psrldq $4, \TMP5 # right shift T5 1 DW
915 pslldq $12, \TMP2 # left shift T2 3 DWs
916 pxor \TMP2, \XMM5
917
918 # second phase of reduction
919
920 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
921 movdqa \XMM5,\TMP3
922 movdqa \XMM5,\TMP4
923 psrld $1, \TMP2 # packed left shift >>1
924 psrld $2, \TMP3 # packed left shift >>2
925 psrld $7, \TMP4 # packed left shift >>7
926 pxor \TMP3,\TMP2 # xor the shifted versions
927 pxor \TMP4,\TMP2
928 pxor \TMP5, \TMP2
929 pxor \TMP2, \XMM5
930 pxor \TMP1, \XMM5 # result is in TMP1
931
932 pxor \XMM5, \XMM1
933.endm
934
935/* GHASH the last 4 ciphertext blocks. */
936.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
937TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
938
939 # Multiply TMP6 * HashKey (using Karatsuba)
940
941 movdqa \XMM1, \TMP6
942 pshufd $78, \XMM1, \TMP2
943 pxor \XMM1, \TMP2
944 movdqa HashKey_4(%rsp), \TMP5
945 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
946 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
947 movdqa HashKey_4_k(%rsp), \TMP4
948 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
949 movdqa \XMM1, \XMMDst
950 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
951
952 # Multiply TMP1 * HashKey (using Karatsuba)
953
954 movdqa \XMM2, \TMP1
955 pshufd $78, \XMM2, \TMP2
956 pxor \XMM2, \TMP2
957 movdqa HashKey_3(%rsp), \TMP5
958 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
959 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
960 movdqa HashKey_3_k(%rsp), \TMP4
961 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
962 pxor \TMP1, \TMP6
963 pxor \XMM2, \XMMDst
964 pxor \TMP2, \XMM1
965# results accumulated in TMP6, XMMDst, XMM1
966
967 # Multiply TMP1 * HashKey (using Karatsuba)
968
969 movdqa \XMM3, \TMP1
970 pshufd $78, \XMM3, \TMP2
971 pxor \XMM3, \TMP2
972 movdqa HashKey_2(%rsp), \TMP5
973 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
974 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
975 movdqa HashKey_2_k(%rsp), \TMP4
976 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
977 pxor \TMP1, \TMP6
978 pxor \XMM3, \XMMDst
979 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
980
981 # Multiply TMP1 * HashKey (using Karatsuba)
982 movdqa \XMM4, \TMP1
983 pshufd $78, \XMM4, \TMP2
984 pxor \XMM4, \TMP2
985 movdqa HashKey(%rsp), \TMP5
986 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
987 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
988 movdqa HashKey_k(%rsp), \TMP4
989 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 pxor \TMP1, \TMP6
991 pxor \XMM4, \XMMDst
992 pxor \XMM1, \TMP2
993 pxor \TMP6, \TMP2
994 pxor \XMMDst, \TMP2
995 # middle section of the temp results combined as in karatsuba algorithm
996 movdqa \TMP2, \TMP4
997 pslldq $8, \TMP4 # left shift TMP4 2 DWs
998 psrldq $8, \TMP2 # right shift TMP2 2 DWs
999 pxor \TMP4, \XMMDst
1000 pxor \TMP2, \TMP6
1001# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1002 # first phase of the reduction
1003 movdqa \XMMDst, \TMP2
1004 movdqa \XMMDst, \TMP3
1005 movdqa \XMMDst, \TMP4
1006# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1007 pslld $31, \TMP2 # packed right shifting << 31
1008 pslld $30, \TMP3 # packed right shifting << 30
1009 pslld $25, \TMP4 # packed right shifting << 25
1010 pxor \TMP3, \TMP2 # xor the shifted versions
1011 pxor \TMP4, \TMP2
1012 movdqa \TMP2, \TMP7
1013 psrldq $4, \TMP7 # right shift TMP7 1 DW
1014 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1015 pxor \TMP2, \XMMDst
1016
1017 # second phase of the reduction
1018 movdqa \XMMDst, \TMP2
1019 # make 3 copies of XMMDst for doing 3 shift operations
1020 movdqa \XMMDst, \TMP3
1021 movdqa \XMMDst, \TMP4
1022 psrld $1, \TMP2 # packed left shift >> 1
1023 psrld $2, \TMP3 # packed left shift >> 2
1024 psrld $7, \TMP4 # packed left shift >> 7
1025 pxor \TMP3, \TMP2 # xor the shifted versions
1026 pxor \TMP4, \TMP2
1027 pxor \TMP7, \TMP2
1028 pxor \TMP2, \XMMDst
1029 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1030.endm
1031
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001032
1033/* Encryption of a single block
1034* uses eax & r10
1035*/
1036
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001037.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1038
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001039 pxor (%arg1), \XMM0
1040 mov keysize,%eax
1041 shr $2,%eax # 128->4, 192->6, 256->8
1042 add $5,%eax # 128->9, 192->11, 256->13
1043 lea 16(%arg1), %r10 # get first expanded key address
1044
1045_esb_loop_\@:
1046 MOVADQ (%r10),\TMP1
1047 AESENC \TMP1,\XMM0
1048 add $16,%r10
1049 sub $1,%eax
1050 jnz _esb_loop_\@
1051
1052 MOVADQ (%r10),\TMP1
1053 AESENCLAST \TMP1,\XMM0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001054.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001055/*****************************************************************************
1056* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1057* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1058* const u8 *in, // Ciphertext input
1059* u64 plaintext_len, // Length of data in bytes for decryption.
1060* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1061* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1062* // concatenated with 0x00000001. 16-byte aligned pointer.
1063* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1064* const u8 *aad, // Additional Authentication Data (AAD)
1065* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1066* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1067* // given authentication tag and only return the plaintext if they match.
1068* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1069* // (most likely), 12 or 8.
1070*
1071* Assumptions:
1072*
1073* keys:
1074* keys are pre-expanded and aligned to 16 bytes. we are using the first
1075* set of 11 keys in the data structure void *aes_ctx
1076*
1077* iv:
1078* 0 1 2 3
1079* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1080* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1081* | Salt (From the SA) |
1082* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1083* | Initialization Vector |
1084* | (This is the sequence number from IPSec header) |
1085* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1086* | 0x1 |
1087* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1088*
1089*
1090*
1091* AAD:
1092* AAD padded to 128 bits with 0
1093* for example, assume AAD is a u32 vector
1094*
1095* if AAD is 8 bytes:
1096* AAD[3] = {A0, A1};
1097* padded AAD in xmm register = {A1 A0 0 0}
1098*
1099* 0 1 2 3
1100* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1101* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1102* | SPI (A1) |
1103* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1104* | 32-bit Sequence Number (A0) |
1105* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1106* | 0x0 |
1107* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1108*
1109* AAD Format with 32-bit Sequence Number
1110*
1111* if AAD is 12 bytes:
1112* AAD[3] = {A0, A1, A2};
1113* padded AAD in xmm register = {A2 A1 A0 0}
1114*
1115* 0 1 2 3
1116* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1117* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1118* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1119* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1120* | SPI (A2) |
1121* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1122* | 64-bit Extended Sequence Number {A1,A0} |
1123* | |
1124* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1125* | 0x0 |
1126* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1127*
1128* AAD Format with 64-bit Extended Sequence Number
1129*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001130* poly = x^128 + x^127 + x^126 + x^121 + 1
1131*
1132*****************************************************************************/
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001133ENTRY(aesni_gcm_dec)
1134 push %r12
1135 push %r13
1136 push %r14
1137 mov %rsp, %r14
1138/*
1139* states of %xmm registers %xmm6:%xmm15 not saved
1140* all %xmm registers are clobbered
1141*/
1142 sub $VARIABLE_OFFSET, %rsp
1143 and $~63, %rsp # align rsp to 64 bytes
1144 mov %arg6, %r12
1145 movdqu (%r12), %xmm13 # %xmm13 = HashKey
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001146 movdqa SHUF_MASK(%rip), %xmm2
1147 PSHUFB_XMM %xmm2, %xmm13
1148
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001149
1150# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1151
1152 movdqa %xmm13, %xmm2
1153 psllq $1, %xmm13
1154 psrlq $63, %xmm2
1155 movdqa %xmm2, %xmm1
1156 pslldq $8, %xmm2
1157 psrldq $8, %xmm1
1158 por %xmm2, %xmm13
1159
1160 # Reduction
1161
1162 pshufd $0x24, %xmm1, %xmm2
1163 pcmpeqd TWOONE(%rip), %xmm2
1164 pand POLY(%rip), %xmm2
1165 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1166
1167
1168 # Decrypt first few blocks
1169
1170 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1171 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1172 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1173 mov %r13, %r12
1174 and $(3<<4), %r12
1175 jz _initial_num_blocks_is_0_decrypt
1176 cmp $(2<<4), %r12
1177 jb _initial_num_blocks_is_1_decrypt
1178 je _initial_num_blocks_is_2_decrypt
1179_initial_num_blocks_is_3_decrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001180 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001181%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1182 sub $48, %r13
1183 jmp _initial_blocks_decrypted
1184_initial_num_blocks_is_2_decrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001185 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001186%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1187 sub $32, %r13
1188 jmp _initial_blocks_decrypted
1189_initial_num_blocks_is_1_decrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001190 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001191%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1192 sub $16, %r13
1193 jmp _initial_blocks_decrypted
1194_initial_num_blocks_is_0_decrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001195 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001196%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1197_initial_blocks_decrypted:
1198 cmp $0, %r13
1199 je _zero_cipher_left_decrypt
1200 sub $64, %r13
1201 je _four_cipher_left_decrypt
1202_decrypt_by_4:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001203 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001204%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1205 add $64, %r11
1206 sub $64, %r13
1207 jne _decrypt_by_4
1208_four_cipher_left_decrypt:
1209 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1210%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1211_zero_cipher_left_decrypt:
1212 mov %arg4, %r13
1213 and $15, %r13 # %r13 = arg4 (mod 16)
1214 je _multiple_of_16_bytes_decrypt
1215
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001216 # Handle the last <16 byte block separately
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001217
1218 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001219 movdqa SHUF_MASK(%rip), %xmm10
1220 PSHUFB_XMM %xmm10, %xmm0
1221
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001222 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001223
Junaid Shahidb20209c2017-12-20 17:08:37 -08001224 lea (%arg3,%r11,1), %r10
1225 mov %r13, %r12
1226 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1227
1228 lea ALL_F+16(%rip), %r12
1229 sub %r13, %r12
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001230 movdqa %xmm1, %xmm2
1231 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
Junaid Shahidb20209c2017-12-20 17:08:37 -08001232 movdqu (%r12), %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001233 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1234 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1235 pand %xmm1, %xmm2
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001236 movdqa SHUF_MASK(%rip), %xmm10
1237 PSHUFB_XMM %xmm10 ,%xmm2
1238
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001239 pxor %xmm2, %xmm8
1240 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001241
1242 # output %r13 bytes
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001243 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001244 cmp $8, %r13
1245 jle _less_than_8_bytes_left_decrypt
1246 mov %rax, (%arg2 , %r11, 1)
1247 add $8, %r11
1248 psrldq $8, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001249 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001250 sub $8, %r13
1251_less_than_8_bytes_left_decrypt:
1252 mov %al, (%arg2, %r11, 1)
1253 add $1, %r11
1254 shr $8, %rax
1255 sub $1, %r13
1256 jne _less_than_8_bytes_left_decrypt
1257_multiple_of_16_bytes_decrypt:
1258 mov arg8, %r12 # %r13 = aadLen (number of bytes)
1259 shl $3, %r12 # convert into number of bits
1260 movd %r12d, %xmm15 # len(A) in %xmm15
1261 shl $3, %arg4 # len(C) in bits (*128)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001262 MOVQ_R64_XMM %arg4, %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001263 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1264 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1265 pxor %xmm15, %xmm8
1266 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1267 # final GHASH computation
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001268 movdqa SHUF_MASK(%rip), %xmm10
1269 PSHUFB_XMM %xmm10, %xmm8
1270
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001271 mov %arg5, %rax # %rax = *Y0
1272 movdqu (%rax), %xmm0 # %xmm0 = Y0
1273 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1274 pxor %xmm8, %xmm0
1275_return_T_decrypt:
1276 mov arg9, %r10 # %r10 = authTag
1277 mov arg10, %r11 # %r11 = auth_tag_len
1278 cmp $16, %r11
1279 je _T_16_decrypt
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001280 cmp $8, %r11
1281 jl _T_4_decrypt
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001282_T_8_decrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001283 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001284 mov %rax, (%r10)
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001285 add $8, %r10
1286 sub $8, %r11
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001287 psrldq $8, %xmm0
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001288 cmp $0, %r11
1289 je _return_T_done_decrypt
1290_T_4_decrypt:
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001291 movd %xmm0, %eax
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001292 mov %eax, (%r10)
1293 add $4, %r10
1294 sub $4, %r11
1295 psrldq $4, %xmm0
1296 cmp $0, %r11
1297 je _return_T_done_decrypt
1298_T_123_decrypt:
1299 movd %xmm0, %eax
1300 cmp $2, %r11
1301 jl _T_1_decrypt
1302 mov %ax, (%r10)
1303 cmp $2, %r11
1304 je _return_T_done_decrypt
1305 add $2, %r10
1306 sar $16, %eax
1307_T_1_decrypt:
1308 mov %al, (%r10)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001309 jmp _return_T_done_decrypt
1310_T_16_decrypt:
1311 movdqu %xmm0, (%r10)
1312_return_T_done_decrypt:
1313 mov %r14, %rsp
1314 pop %r14
1315 pop %r13
1316 pop %r12
1317 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001318ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001319
1320
1321/*****************************************************************************
1322* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1323* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1324* const u8 *in, // Plaintext input
1325* u64 plaintext_len, // Length of data in bytes for encryption.
1326* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1327* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1328* // concatenated with 0x00000001. 16-byte aligned pointer.
1329* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1330* const u8 *aad, // Additional Authentication Data (AAD)
1331* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1332* u8 *auth_tag, // Authenticated Tag output.
1333* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1334* // 12 or 8.
1335*
1336* Assumptions:
1337*
1338* keys:
1339* keys are pre-expanded and aligned to 16 bytes. we are using the
1340* first set of 11 keys in the data structure void *aes_ctx
1341*
1342*
1343* iv:
1344* 0 1 2 3
1345* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1346* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1347* | Salt (From the SA) |
1348* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1349* | Initialization Vector |
1350* | (This is the sequence number from IPSec header) |
1351* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1352* | 0x1 |
1353* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1354*
1355*
1356*
1357* AAD:
1358* AAD padded to 128 bits with 0
1359* for example, assume AAD is a u32 vector
1360*
1361* if AAD is 8 bytes:
1362* AAD[3] = {A0, A1};
1363* padded AAD in xmm register = {A1 A0 0 0}
1364*
1365* 0 1 2 3
1366* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1367* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1368* | SPI (A1) |
1369* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1370* | 32-bit Sequence Number (A0) |
1371* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1372* | 0x0 |
1373* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1374*
1375* AAD Format with 32-bit Sequence Number
1376*
1377* if AAD is 12 bytes:
1378* AAD[3] = {A0, A1, A2};
1379* padded AAD in xmm register = {A2 A1 A0 0}
1380*
1381* 0 1 2 3
1382* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1383* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1384* | SPI (A2) |
1385* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1386* | 64-bit Extended Sequence Number {A1,A0} |
1387* | |
1388* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1389* | 0x0 |
1390* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1391*
1392* AAD Format with 64-bit Extended Sequence Number
1393*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001394* poly = x^128 + x^127 + x^126 + x^121 + 1
1395***************************************************************************/
1396ENTRY(aesni_gcm_enc)
1397 push %r12
1398 push %r13
1399 push %r14
1400 mov %rsp, %r14
1401#
1402# states of %xmm registers %xmm6:%xmm15 not saved
1403# all %xmm registers are clobbered
1404#
1405 sub $VARIABLE_OFFSET, %rsp
1406 and $~63, %rsp
1407 mov %arg6, %r12
1408 movdqu (%r12), %xmm13
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001409 movdqa SHUF_MASK(%rip), %xmm2
1410 PSHUFB_XMM %xmm2, %xmm13
1411
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001412
1413# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1414
1415 movdqa %xmm13, %xmm2
1416 psllq $1, %xmm13
1417 psrlq $63, %xmm2
1418 movdqa %xmm2, %xmm1
1419 pslldq $8, %xmm2
1420 psrldq $8, %xmm1
1421 por %xmm2, %xmm13
1422
1423 # reduce HashKey<<1
1424
1425 pshufd $0x24, %xmm1, %xmm2
1426 pcmpeqd TWOONE(%rip), %xmm2
1427 pand POLY(%rip), %xmm2
1428 pxor %xmm2, %xmm13
1429 movdqa %xmm13, HashKey(%rsp)
1430 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
1431 and $-16, %r13
1432 mov %r13, %r12
1433
1434 # Encrypt first few blocks
1435
1436 and $(3<<4), %r12
1437 jz _initial_num_blocks_is_0_encrypt
1438 cmp $(2<<4), %r12
1439 jb _initial_num_blocks_is_1_encrypt
1440 je _initial_num_blocks_is_2_encrypt
1441_initial_num_blocks_is_3_encrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001442 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001443%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1444 sub $48, %r13
1445 jmp _initial_blocks_encrypted
1446_initial_num_blocks_is_2_encrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001447 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001448%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1449 sub $32, %r13
1450 jmp _initial_blocks_encrypted
1451_initial_num_blocks_is_1_encrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001452 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001453%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1454 sub $16, %r13
1455 jmp _initial_blocks_encrypted
1456_initial_num_blocks_is_0_encrypt:
Dave Watsone1fd3162018-02-14 09:38:12 -08001457 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001458%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1459_initial_blocks_encrypted:
1460
1461 # Main loop - Encrypt remaining blocks
1462
1463 cmp $0, %r13
1464 je _zero_cipher_left_encrypt
1465 sub $64, %r13
1466 je _four_cipher_left_encrypt
1467_encrypt_by_4_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001468 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001469%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1470 add $64, %r11
1471 sub $64, %r13
1472 jne _encrypt_by_4_encrypt
1473_four_cipher_left_encrypt:
1474 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1475%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1476_zero_cipher_left_encrypt:
1477 mov %arg4, %r13
1478 and $15, %r13 # %r13 = arg4 (mod 16)
1479 je _multiple_of_16_bytes_encrypt
1480
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001481 # Handle the last <16 Byte block separately
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001482 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001483 movdqa SHUF_MASK(%rip), %xmm10
1484 PSHUFB_XMM %xmm10, %xmm0
1485
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001486 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
Junaid Shahidb20209c2017-12-20 17:08:37 -08001487
1488 lea (%arg3,%r11,1), %r10
1489 mov %r13, %r12
1490 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1491
1492 lea ALL_F+16(%rip), %r12
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001493 sub %r13, %r12
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001494 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
Junaid Shahidb20209c2017-12-20 17:08:37 -08001495 movdqu (%r12), %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001496 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1497 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001498 movdqa SHUF_MASK(%rip), %xmm10
1499 PSHUFB_XMM %xmm10,%xmm0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001500
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001501 pxor %xmm0, %xmm8
1502 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1503 # GHASH computation for the last <16 byte block
Tadeusz Struk60af5202011-03-13 16:56:17 +08001504 movdqa SHUF_MASK(%rip), %xmm10
1505 PSHUFB_XMM %xmm10, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001506
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001507 # shuffle xmm0 back to output as ciphertext
1508
1509 # Output %r13 bytes
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001510 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001511 cmp $8, %r13
1512 jle _less_than_8_bytes_left_encrypt
1513 mov %rax, (%arg2 , %r11, 1)
1514 add $8, %r11
1515 psrldq $8, %xmm0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001516 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001517 sub $8, %r13
1518_less_than_8_bytes_left_encrypt:
1519 mov %al, (%arg2, %r11, 1)
1520 add $1, %r11
1521 shr $8, %rax
1522 sub $1, %r13
1523 jne _less_than_8_bytes_left_encrypt
1524_multiple_of_16_bytes_encrypt:
1525 mov arg8, %r12 # %r12 = addLen (number of bytes)
1526 shl $3, %r12
1527 movd %r12d, %xmm15 # len(A) in %xmm15
1528 shl $3, %arg4 # len(C) in bits (*128)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001529 MOVQ_R64_XMM %arg4, %xmm1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001530 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1531 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1532 pxor %xmm15, %xmm8
1533 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1534 # final GHASH computation
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001535 movdqa SHUF_MASK(%rip), %xmm10
1536 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001537
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001538 mov %arg5, %rax # %rax = *Y0
1539 movdqu (%rax), %xmm0 # %xmm0 = Y0
1540 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1541 pxor %xmm8, %xmm0
1542_return_T_encrypt:
1543 mov arg9, %r10 # %r10 = authTag
1544 mov arg10, %r11 # %r11 = auth_tag_len
1545 cmp $16, %r11
1546 je _T_16_encrypt
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001547 cmp $8, %r11
1548 jl _T_4_encrypt
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001549_T_8_encrypt:
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001550 MOVQ_R64_XMM %xmm0, %rax
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001551 mov %rax, (%r10)
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001552 add $8, %r10
1553 sub $8, %r11
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001554 psrldq $8, %xmm0
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001555 cmp $0, %r11
1556 je _return_T_done_encrypt
1557_T_4_encrypt:
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001558 movd %xmm0, %eax
Sabrina Dubroca38d9dee2017-04-28 18:11:57 +02001559 mov %eax, (%r10)
1560 add $4, %r10
1561 sub $4, %r11
1562 psrldq $4, %xmm0
1563 cmp $0, %r11
1564 je _return_T_done_encrypt
1565_T_123_encrypt:
1566 movd %xmm0, %eax
1567 cmp $2, %r11
1568 jl _T_1_encrypt
1569 mov %ax, (%r10)
1570 cmp $2, %r11
1571 je _return_T_done_encrypt
1572 add $2, %r10
1573 sar $16, %eax
1574_T_1_encrypt:
1575 mov %al, (%r10)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001576 jmp _return_T_done_encrypt
1577_T_16_encrypt:
1578 movdqu %xmm0, (%r10)
1579_return_T_done_encrypt:
1580 mov %r14, %rsp
1581 pop %r14
1582 pop %r13
1583 pop %r12
1584 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001585ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001586
Mathias Krause559ad0f2010-11-29 08:35:39 +08001587#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001588
1589
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001590.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001591_key_expansion_128:
1592_key_expansion_256a:
1593 pshufd $0b11111111, %xmm1, %xmm1
1594 shufps $0b00010000, %xmm0, %xmm4
1595 pxor %xmm4, %xmm0
1596 shufps $0b10001100, %xmm0, %xmm4
1597 pxor %xmm4, %xmm0
1598 pxor %xmm1, %xmm0
Mathias Krause0d258ef2010-11-27 16:34:46 +08001599 movaps %xmm0, (TKEYP)
1600 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001601 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001602ENDPROC(_key_expansion_128)
1603ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001604
Mathias Krause0d258ef2010-11-27 16:34:46 +08001605.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001606_key_expansion_192a:
1607 pshufd $0b01010101, %xmm1, %xmm1
1608 shufps $0b00010000, %xmm0, %xmm4
1609 pxor %xmm4, %xmm0
1610 shufps $0b10001100, %xmm0, %xmm4
1611 pxor %xmm4, %xmm0
1612 pxor %xmm1, %xmm0
1613
1614 movaps %xmm2, %xmm5
1615 movaps %xmm2, %xmm6
1616 pslldq $4, %xmm5
1617 pshufd $0b11111111, %xmm0, %xmm3
1618 pxor %xmm3, %xmm2
1619 pxor %xmm5, %xmm2
1620
1621 movaps %xmm0, %xmm1
1622 shufps $0b01000100, %xmm0, %xmm6
Mathias Krause0d258ef2010-11-27 16:34:46 +08001623 movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001624 shufps $0b01001110, %xmm2, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001625 movaps %xmm1, 0x10(TKEYP)
1626 add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001627 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001628ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001629
Mathias Krause0d258ef2010-11-27 16:34:46 +08001630.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001631_key_expansion_192b:
1632 pshufd $0b01010101, %xmm1, %xmm1
1633 shufps $0b00010000, %xmm0, %xmm4
1634 pxor %xmm4, %xmm0
1635 shufps $0b10001100, %xmm0, %xmm4
1636 pxor %xmm4, %xmm0
1637 pxor %xmm1, %xmm0
1638
1639 movaps %xmm2, %xmm5
1640 pslldq $4, %xmm5
1641 pshufd $0b11111111, %xmm0, %xmm3
1642 pxor %xmm3, %xmm2
1643 pxor %xmm5, %xmm2
1644
Mathias Krause0d258ef2010-11-27 16:34:46 +08001645 movaps %xmm0, (TKEYP)
1646 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001647 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001648ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001649
Mathias Krause0d258ef2010-11-27 16:34:46 +08001650.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001651_key_expansion_256b:
1652 pshufd $0b10101010, %xmm1, %xmm1
1653 shufps $0b00010000, %xmm2, %xmm4
1654 pxor %xmm4, %xmm2
1655 shufps $0b10001100, %xmm2, %xmm4
1656 pxor %xmm4, %xmm2
1657 pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001658 movaps %xmm2, (TKEYP)
1659 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001660 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001661ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001662
1663/*
1664 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1665 * unsigned int key_len)
1666 */
1667ENTRY(aesni_set_key)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001668 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001669#ifndef __x86_64__
1670 pushl KEYP
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001671 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1672 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1673 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
Mathias Krause0d258ef2010-11-27 16:34:46 +08001674#endif
1675 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1676 movaps %xmm0, (KEYP)
1677 lea 0x10(KEYP), TKEYP # key addr
1678 movl %edx, 480(KEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001679 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1680 cmp $24, %dl
1681 jb .Lenc_key128
1682 je .Lenc_key192
Mathias Krause0d258ef2010-11-27 16:34:46 +08001683 movups 0x10(UKEYP), %xmm2 # other user key
1684 movaps %xmm2, (TKEYP)
1685 add $0x10, TKEYP
Huang Yingb369e522009-11-23 19:54:06 +08001686 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001687 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001688 AESKEYGENASSIST 0x1 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001689 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001690 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001691 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001692 AESKEYGENASSIST 0x2 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001693 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001694 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001695 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001696 AESKEYGENASSIST 0x4 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001697 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001698 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001699 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001700 AESKEYGENASSIST 0x8 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001701 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001702 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001703 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001704 AESKEYGENASSIST 0x10 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001705 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001706 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001707 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001708 AESKEYGENASSIST 0x20 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001709 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001710 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001711 call _key_expansion_256a
1712 jmp .Ldec_key
1713.Lenc_key192:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001714 movq 0x10(UKEYP), %xmm2 # other user key
Huang Yingb369e522009-11-23 19:54:06 +08001715 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001716 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001717 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001718 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001719 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001720 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001721 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001722 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001723 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001724 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001725 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001726 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001727 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001728 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001729 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001730 call _key_expansion_192b
1731 jmp .Ldec_key
1732.Lenc_key128:
Huang Yingb369e522009-11-23 19:54:06 +08001733 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001734 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001735 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001736 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001737 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001738 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001739 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001740 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001741 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001742 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001743 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001744 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001745 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001746 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001747 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001748 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001749 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001750 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001751 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001752 call _key_expansion_128
1753.Ldec_key:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001754 sub $0x10, TKEYP
1755 movaps (KEYP), %xmm0
1756 movaps (TKEYP), %xmm1
1757 movaps %xmm0, 240(TKEYP)
1758 movaps %xmm1, 240(KEYP)
1759 add $0x10, KEYP
1760 lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001761.align 4
1762.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001763 movaps (KEYP), %xmm0
Huang Yingb369e522009-11-23 19:54:06 +08001764 AESIMC %xmm0 %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001765 movaps %xmm1, (UKEYP)
1766 add $0x10, KEYP
1767 sub $0x10, UKEYP
1768 cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001769 jb .Ldec_key_loop
Mathias Krause0d258ef2010-11-27 16:34:46 +08001770 xor AREG, AREG
1771#ifndef __x86_64__
1772 popl KEYP
1773#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001774 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001775 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001776ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001777
1778/*
1779 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1780 */
1781ENTRY(aesni_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001782 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001783#ifndef __x86_64__
1784 pushl KEYP
1785 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001786 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1787 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1788 movl (FRAME_OFFSET+20)(%esp), INP # src
Mathias Krause0d258ef2010-11-27 16:34:46 +08001789#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001790 movl 480(KEYP), KLEN # key length
1791 movups (INP), STATE # input
1792 call _aesni_enc1
1793 movups STATE, (OUTP) # output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001794#ifndef __x86_64__
1795 popl KLEN
1796 popl KEYP
1797#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001798 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001799 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001800ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001801
1802/*
1803 * _aesni_enc1: internal ABI
1804 * input:
1805 * KEYP: key struct pointer
1806 * KLEN: round count
1807 * STATE: initial state (input)
1808 * output:
1809 * STATE: finial state (output)
1810 * changed:
1811 * KEY
1812 * TKEYP (T1)
1813 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08001814.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001815_aesni_enc1:
1816 movaps (KEYP), KEY # key
1817 mov KEYP, TKEYP
1818 pxor KEY, STATE # round 0
1819 add $0x30, TKEYP
1820 cmp $24, KLEN
1821 jb .Lenc128
1822 lea 0x20(TKEYP), TKEYP
1823 je .Lenc192
1824 add $0x20, TKEYP
1825 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001826 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001827 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001828 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001829.align 4
1830.Lenc192:
1831 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001832 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001833 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001834 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001835.align 4
1836.Lenc128:
1837 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001838 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001839 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001840 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001841 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001842 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001843 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001844 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001845 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001846 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001847 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001848 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001849 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001850 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001851 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001852 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001853 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001854 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001855 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001856 AESENCLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001857 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001858ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001859
1860/*
1861 * _aesni_enc4: internal ABI
1862 * input:
1863 * KEYP: key struct pointer
1864 * KLEN: round count
1865 * STATE1: initial state (input)
1866 * STATE2
1867 * STATE3
1868 * STATE4
1869 * output:
1870 * STATE1: finial state (output)
1871 * STATE2
1872 * STATE3
1873 * STATE4
1874 * changed:
1875 * KEY
1876 * TKEYP (T1)
1877 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08001878.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001879_aesni_enc4:
1880 movaps (KEYP), KEY # key
1881 mov KEYP, TKEYP
1882 pxor KEY, STATE1 # round 0
1883 pxor KEY, STATE2
1884 pxor KEY, STATE3
1885 pxor KEY, STATE4
1886 add $0x30, TKEYP
1887 cmp $24, KLEN
1888 jb .L4enc128
1889 lea 0x20(TKEYP), TKEYP
1890 je .L4enc192
1891 add $0x20, TKEYP
1892 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001893 AESENC KEY STATE1
1894 AESENC KEY STATE2
1895 AESENC KEY STATE3
1896 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001897 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001898 AESENC KEY STATE1
1899 AESENC KEY STATE2
1900 AESENC KEY STATE3
1901 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001902#.align 4
1903.L4enc192:
1904 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001905 AESENC KEY STATE1
1906 AESENC KEY STATE2
1907 AESENC KEY STATE3
1908 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001909 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001910 AESENC KEY STATE1
1911 AESENC KEY STATE2
1912 AESENC KEY STATE3
1913 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001914#.align 4
1915.L4enc128:
1916 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001917 AESENC KEY STATE1
1918 AESENC KEY STATE2
1919 AESENC KEY STATE3
1920 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001921 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001922 AESENC KEY STATE1
1923 AESENC KEY STATE2
1924 AESENC KEY STATE3
1925 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001926 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001927 AESENC KEY STATE1
1928 AESENC KEY STATE2
1929 AESENC KEY STATE3
1930 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001931 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001932 AESENC KEY STATE1
1933 AESENC KEY STATE2
1934 AESENC KEY STATE3
1935 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001936 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001937 AESENC KEY STATE1
1938 AESENC KEY STATE2
1939 AESENC KEY STATE3
1940 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001941 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001942 AESENC KEY STATE1
1943 AESENC KEY STATE2
1944 AESENC KEY STATE3
1945 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001946 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001947 AESENC KEY STATE1
1948 AESENC KEY STATE2
1949 AESENC KEY STATE3
1950 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001951 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001952 AESENC KEY STATE1
1953 AESENC KEY STATE2
1954 AESENC KEY STATE3
1955 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001956 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001957 AESENC KEY STATE1
1958 AESENC KEY STATE2
1959 AESENC KEY STATE3
1960 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001961 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001962 AESENCLAST KEY STATE1 # last round
1963 AESENCLAST KEY STATE2
1964 AESENCLAST KEY STATE3
1965 AESENCLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001966 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001967ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001968
1969/*
1970 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1971 */
1972ENTRY(aesni_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001973 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001974#ifndef __x86_64__
1975 pushl KEYP
1976 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001977 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1978 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1979 movl (FRAME_OFFSET+20)(%esp), INP # src
Mathias Krause0d258ef2010-11-27 16:34:46 +08001980#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001981 mov 480(KEYP), KLEN # key length
1982 add $240, KEYP
1983 movups (INP), STATE # input
1984 call _aesni_dec1
1985 movups STATE, (OUTP) #output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001986#ifndef __x86_64__
1987 popl KLEN
1988 popl KEYP
1989#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001990 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001991 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001992ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001993
1994/*
1995 * _aesni_dec1: internal ABI
1996 * input:
1997 * KEYP: key struct pointer
1998 * KLEN: key length
1999 * STATE: initial state (input)
2000 * output:
2001 * STATE: finial state (output)
2002 * changed:
2003 * KEY
2004 * TKEYP (T1)
2005 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002006.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002007_aesni_dec1:
2008 movaps (KEYP), KEY # key
2009 mov KEYP, TKEYP
2010 pxor KEY, STATE # round 0
2011 add $0x30, TKEYP
2012 cmp $24, KLEN
2013 jb .Ldec128
2014 lea 0x20(TKEYP), TKEYP
2015 je .Ldec192
2016 add $0x20, TKEYP
2017 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002018 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002019 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002020 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002021.align 4
2022.Ldec192:
2023 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002024 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002025 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002026 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002027.align 4
2028.Ldec128:
2029 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002030 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002031 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002032 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002033 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002034 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002035 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002036 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002037 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002038 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002039 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002040 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002041 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002042 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002043 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002044 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002045 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002046 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002047 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002048 AESDECLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002049 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002050ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002051
2052/*
2053 * _aesni_dec4: internal ABI
2054 * input:
2055 * KEYP: key struct pointer
2056 * KLEN: key length
2057 * STATE1: initial state (input)
2058 * STATE2
2059 * STATE3
2060 * STATE4
2061 * output:
2062 * STATE1: finial state (output)
2063 * STATE2
2064 * STATE3
2065 * STATE4
2066 * changed:
2067 * KEY
2068 * TKEYP (T1)
2069 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002070.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002071_aesni_dec4:
2072 movaps (KEYP), KEY # key
2073 mov KEYP, TKEYP
2074 pxor KEY, STATE1 # round 0
2075 pxor KEY, STATE2
2076 pxor KEY, STATE3
2077 pxor KEY, STATE4
2078 add $0x30, TKEYP
2079 cmp $24, KLEN
2080 jb .L4dec128
2081 lea 0x20(TKEYP), TKEYP
2082 je .L4dec192
2083 add $0x20, TKEYP
2084 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002085 AESDEC KEY STATE1
2086 AESDEC KEY STATE2
2087 AESDEC KEY STATE3
2088 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002089 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002090 AESDEC KEY STATE1
2091 AESDEC KEY STATE2
2092 AESDEC KEY STATE3
2093 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002094.align 4
2095.L4dec192:
2096 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002097 AESDEC KEY STATE1
2098 AESDEC KEY STATE2
2099 AESDEC KEY STATE3
2100 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002101 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002102 AESDEC KEY STATE1
2103 AESDEC KEY STATE2
2104 AESDEC KEY STATE3
2105 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002106.align 4
2107.L4dec128:
2108 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002109 AESDEC KEY STATE1
2110 AESDEC KEY STATE2
2111 AESDEC KEY STATE3
2112 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002113 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002114 AESDEC KEY STATE1
2115 AESDEC KEY STATE2
2116 AESDEC KEY STATE3
2117 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002118 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002119 AESDEC KEY STATE1
2120 AESDEC KEY STATE2
2121 AESDEC KEY STATE3
2122 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002123 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002124 AESDEC KEY STATE1
2125 AESDEC KEY STATE2
2126 AESDEC KEY STATE3
2127 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002128 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002129 AESDEC KEY STATE1
2130 AESDEC KEY STATE2
2131 AESDEC KEY STATE3
2132 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002133 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002134 AESDEC KEY STATE1
2135 AESDEC KEY STATE2
2136 AESDEC KEY STATE3
2137 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002138 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002139 AESDEC KEY STATE1
2140 AESDEC KEY STATE2
2141 AESDEC KEY STATE3
2142 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002143 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002144 AESDEC KEY STATE1
2145 AESDEC KEY STATE2
2146 AESDEC KEY STATE3
2147 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002148 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002149 AESDEC KEY STATE1
2150 AESDEC KEY STATE2
2151 AESDEC KEY STATE3
2152 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002153 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002154 AESDECLAST KEY STATE1 # last round
2155 AESDECLAST KEY STATE2
2156 AESDECLAST KEY STATE3
2157 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002158 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002159ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002160
2161/*
2162 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2163 * size_t len)
2164 */
2165ENTRY(aesni_ecb_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002166 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002167#ifndef __x86_64__
2168 pushl LEN
2169 pushl KEYP
2170 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002171 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2172 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2173 movl (FRAME_OFFSET+24)(%esp), INP # src
2174 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002175#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002176 test LEN, LEN # check length
2177 jz .Lecb_enc_ret
2178 mov 480(KEYP), KLEN
2179 cmp $16, LEN
2180 jb .Lecb_enc_ret
2181 cmp $64, LEN
2182 jb .Lecb_enc_loop1
2183.align 4
2184.Lecb_enc_loop4:
2185 movups (INP), STATE1
2186 movups 0x10(INP), STATE2
2187 movups 0x20(INP), STATE3
2188 movups 0x30(INP), STATE4
2189 call _aesni_enc4
2190 movups STATE1, (OUTP)
2191 movups STATE2, 0x10(OUTP)
2192 movups STATE3, 0x20(OUTP)
2193 movups STATE4, 0x30(OUTP)
2194 sub $64, LEN
2195 add $64, INP
2196 add $64, OUTP
2197 cmp $64, LEN
2198 jge .Lecb_enc_loop4
2199 cmp $16, LEN
2200 jb .Lecb_enc_ret
2201.align 4
2202.Lecb_enc_loop1:
2203 movups (INP), STATE1
2204 call _aesni_enc1
2205 movups STATE1, (OUTP)
2206 sub $16, LEN
2207 add $16, INP
2208 add $16, OUTP
2209 cmp $16, LEN
2210 jge .Lecb_enc_loop1
2211.Lecb_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002212#ifndef __x86_64__
2213 popl KLEN
2214 popl KEYP
2215 popl LEN
2216#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002217 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002218 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002219ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002220
2221/*
2222 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2223 * size_t len);
2224 */
2225ENTRY(aesni_ecb_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002226 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002227#ifndef __x86_64__
2228 pushl LEN
2229 pushl KEYP
2230 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002231 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2232 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2233 movl (FRAME_OFFSET+24)(%esp), INP # src
2234 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002235#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002236 test LEN, LEN
2237 jz .Lecb_dec_ret
2238 mov 480(KEYP), KLEN
2239 add $240, KEYP
2240 cmp $16, LEN
2241 jb .Lecb_dec_ret
2242 cmp $64, LEN
2243 jb .Lecb_dec_loop1
2244.align 4
2245.Lecb_dec_loop4:
2246 movups (INP), STATE1
2247 movups 0x10(INP), STATE2
2248 movups 0x20(INP), STATE3
2249 movups 0x30(INP), STATE4
2250 call _aesni_dec4
2251 movups STATE1, (OUTP)
2252 movups STATE2, 0x10(OUTP)
2253 movups STATE3, 0x20(OUTP)
2254 movups STATE4, 0x30(OUTP)
2255 sub $64, LEN
2256 add $64, INP
2257 add $64, OUTP
2258 cmp $64, LEN
2259 jge .Lecb_dec_loop4
2260 cmp $16, LEN
2261 jb .Lecb_dec_ret
2262.align 4
2263.Lecb_dec_loop1:
2264 movups (INP), STATE1
2265 call _aesni_dec1
2266 movups STATE1, (OUTP)
2267 sub $16, LEN
2268 add $16, INP
2269 add $16, OUTP
2270 cmp $16, LEN
2271 jge .Lecb_dec_loop1
2272.Lecb_dec_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002273#ifndef __x86_64__
2274 popl KLEN
2275 popl KEYP
2276 popl LEN
2277#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002278 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002279 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002280ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002281
2282/*
2283 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2284 * size_t len, u8 *iv)
2285 */
2286ENTRY(aesni_cbc_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002287 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002288#ifndef __x86_64__
2289 pushl IVP
2290 pushl LEN
2291 pushl KEYP
2292 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002293 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2294 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2295 movl (FRAME_OFFSET+28)(%esp), INP # src
2296 movl (FRAME_OFFSET+32)(%esp), LEN # len
2297 movl (FRAME_OFFSET+36)(%esp), IVP # iv
Mathias Krause0d258ef2010-11-27 16:34:46 +08002298#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002299 cmp $16, LEN
2300 jb .Lcbc_enc_ret
2301 mov 480(KEYP), KLEN
2302 movups (IVP), STATE # load iv as initial state
2303.align 4
2304.Lcbc_enc_loop:
2305 movups (INP), IN # load input
2306 pxor IN, STATE
2307 call _aesni_enc1
2308 movups STATE, (OUTP) # store output
2309 sub $16, LEN
2310 add $16, INP
2311 add $16, OUTP
2312 cmp $16, LEN
2313 jge .Lcbc_enc_loop
2314 movups STATE, (IVP)
2315.Lcbc_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002316#ifndef __x86_64__
2317 popl KLEN
2318 popl KEYP
2319 popl LEN
2320 popl IVP
2321#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002322 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002323 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002324ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002325
2326/*
2327 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2328 * size_t len, u8 *iv)
2329 */
2330ENTRY(aesni_cbc_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002331 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002332#ifndef __x86_64__
2333 pushl IVP
2334 pushl LEN
2335 pushl KEYP
2336 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002337 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2338 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2339 movl (FRAME_OFFSET+28)(%esp), INP # src
2340 movl (FRAME_OFFSET+32)(%esp), LEN # len
2341 movl (FRAME_OFFSET+36)(%esp), IVP # iv
Mathias Krause0d258ef2010-11-27 16:34:46 +08002342#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002343 cmp $16, LEN
Huang Yinge6efaa02009-06-18 19:33:57 +08002344 jb .Lcbc_dec_just_ret
Huang Ying54b6a1b2009-01-18 16:28:34 +11002345 mov 480(KEYP), KLEN
2346 add $240, KEYP
2347 movups (IVP), IV
2348 cmp $64, LEN
2349 jb .Lcbc_dec_loop1
2350.align 4
2351.Lcbc_dec_loop4:
2352 movups (INP), IN1
2353 movaps IN1, STATE1
2354 movups 0x10(INP), IN2
2355 movaps IN2, STATE2
Mathias Krause0d258ef2010-11-27 16:34:46 +08002356#ifdef __x86_64__
Huang Ying54b6a1b2009-01-18 16:28:34 +11002357 movups 0x20(INP), IN3
2358 movaps IN3, STATE3
2359 movups 0x30(INP), IN4
2360 movaps IN4, STATE4
Mathias Krause0d258ef2010-11-27 16:34:46 +08002361#else
2362 movups 0x20(INP), IN1
2363 movaps IN1, STATE3
2364 movups 0x30(INP), IN2
2365 movaps IN2, STATE4
2366#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002367 call _aesni_dec4
2368 pxor IV, STATE1
Mathias Krause0d258ef2010-11-27 16:34:46 +08002369#ifdef __x86_64__
Huang Ying54b6a1b2009-01-18 16:28:34 +11002370 pxor IN1, STATE2
2371 pxor IN2, STATE3
2372 pxor IN3, STATE4
2373 movaps IN4, IV
Mathias Krause0d258ef2010-11-27 16:34:46 +08002374#else
Mathias Krause0d258ef2010-11-27 16:34:46 +08002375 pxor IN1, STATE4
2376 movaps IN2, IV
Mathias Krause7c8d5182012-05-30 01:43:08 +02002377 movups (INP), IN1
2378 pxor IN1, STATE2
2379 movups 0x10(INP), IN2
2380 pxor IN2, STATE3
Mathias Krause0d258ef2010-11-27 16:34:46 +08002381#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002382 movups STATE1, (OUTP)
2383 movups STATE2, 0x10(OUTP)
2384 movups STATE3, 0x20(OUTP)
2385 movups STATE4, 0x30(OUTP)
2386 sub $64, LEN
2387 add $64, INP
2388 add $64, OUTP
2389 cmp $64, LEN
2390 jge .Lcbc_dec_loop4
2391 cmp $16, LEN
2392 jb .Lcbc_dec_ret
2393.align 4
2394.Lcbc_dec_loop1:
2395 movups (INP), IN
2396 movaps IN, STATE
2397 call _aesni_dec1
2398 pxor IV, STATE
2399 movups STATE, (OUTP)
2400 movaps IN, IV
2401 sub $16, LEN
2402 add $16, INP
2403 add $16, OUTP
2404 cmp $16, LEN
2405 jge .Lcbc_dec_loop1
Huang Ying54b6a1b2009-01-18 16:28:34 +11002406.Lcbc_dec_ret:
Huang Yinge6efaa02009-06-18 19:33:57 +08002407 movups IV, (IVP)
2408.Lcbc_dec_just_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002409#ifndef __x86_64__
2410 popl KLEN
2411 popl KEYP
2412 popl LEN
2413 popl IVP
2414#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002415 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002416 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002417ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002418
Mathias Krause0d258ef2010-11-27 16:34:46 +08002419#ifdef __x86_64__
Josh Poimboeuf1253cab2016-01-21 16:49:15 -06002420.pushsection .rodata
Huang Ying12387a42010-03-10 18:28:55 +08002421.align 16
2422.Lbswap_mask:
2423 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
Josh Poimboeuf1253cab2016-01-21 16:49:15 -06002424.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002425
2426/*
2427 * _aesni_inc_init: internal ABI
2428 * setup registers used by _aesni_inc
2429 * input:
2430 * IV
2431 * output:
2432 * CTR: == IV, in little endian
2433 * TCTR_LOW: == lower qword of CTR
2434 * INC: == 1, in little endian
2435 * BSWAP_MASK == endian swapping mask
2436 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002437.align 4
Huang Ying12387a42010-03-10 18:28:55 +08002438_aesni_inc_init:
2439 movaps .Lbswap_mask, BSWAP_MASK
2440 movaps IV, CTR
2441 PSHUFB_XMM BSWAP_MASK CTR
2442 mov $1, TCTR_LOW
Huang Ying32cbd7d2010-03-13 16:28:42 +08002443 MOVQ_R64_XMM TCTR_LOW INC
2444 MOVQ_R64_XMM CTR TCTR_LOW
Huang Ying12387a42010-03-10 18:28:55 +08002445 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002446ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002447
2448/*
2449 * _aesni_inc: internal ABI
2450 * Increase IV by 1, IV is in big endian
2451 * input:
2452 * IV
2453 * CTR: == IV, in little endian
2454 * TCTR_LOW: == lower qword of CTR
2455 * INC: == 1, in little endian
2456 * BSWAP_MASK == endian swapping mask
2457 * output:
2458 * IV: Increase by 1
2459 * changed:
2460 * CTR: == output IV, in little endian
2461 * TCTR_LOW: == lower qword of CTR
2462 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002463.align 4
Huang Ying12387a42010-03-10 18:28:55 +08002464_aesni_inc:
2465 paddq INC, CTR
2466 add $1, TCTR_LOW
2467 jnc .Linc_low
2468 pslldq $8, INC
2469 paddq INC, CTR
2470 psrldq $8, INC
2471.Linc_low:
2472 movaps CTR, IV
2473 PSHUFB_XMM BSWAP_MASK IV
2474 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002475ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002476
2477/*
2478 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2479 * size_t len, u8 *iv)
2480 */
2481ENTRY(aesni_ctr_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002482 FRAME_BEGIN
Huang Ying12387a42010-03-10 18:28:55 +08002483 cmp $16, LEN
2484 jb .Lctr_enc_just_ret
2485 mov 480(KEYP), KLEN
2486 movups (IVP), IV
2487 call _aesni_inc_init
2488 cmp $64, LEN
2489 jb .Lctr_enc_loop1
2490.align 4
2491.Lctr_enc_loop4:
2492 movaps IV, STATE1
2493 call _aesni_inc
2494 movups (INP), IN1
2495 movaps IV, STATE2
2496 call _aesni_inc
2497 movups 0x10(INP), IN2
2498 movaps IV, STATE3
2499 call _aesni_inc
2500 movups 0x20(INP), IN3
2501 movaps IV, STATE4
2502 call _aesni_inc
2503 movups 0x30(INP), IN4
2504 call _aesni_enc4
2505 pxor IN1, STATE1
2506 movups STATE1, (OUTP)
2507 pxor IN2, STATE2
2508 movups STATE2, 0x10(OUTP)
2509 pxor IN3, STATE3
2510 movups STATE3, 0x20(OUTP)
2511 pxor IN4, STATE4
2512 movups STATE4, 0x30(OUTP)
2513 sub $64, LEN
2514 add $64, INP
2515 add $64, OUTP
2516 cmp $64, LEN
2517 jge .Lctr_enc_loop4
2518 cmp $16, LEN
2519 jb .Lctr_enc_ret
2520.align 4
2521.Lctr_enc_loop1:
2522 movaps IV, STATE
2523 call _aesni_inc
2524 movups (INP), IN
2525 call _aesni_enc1
2526 pxor IN, STATE
2527 movups STATE, (OUTP)
2528 sub $16, LEN
2529 add $16, INP
2530 add $16, OUTP
2531 cmp $16, LEN
2532 jge .Lctr_enc_loop1
2533.Lctr_enc_ret:
2534 movups IV, (IVP)
2535.Lctr_enc_just_ret:
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002536 FRAME_END
Huang Ying12387a42010-03-10 18:28:55 +08002537 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002538ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002539
2540/*
2541 * _aesni_gf128mul_x_ble: internal ABI
2542 * Multiply in GF(2^128) for XTS IVs
2543 * input:
2544 * IV: current IV
2545 * GF128MUL_MASK == mask with 0x87 and 0x01
2546 * output:
2547 * IV: next IV
2548 * changed:
2549 * CTR: == temporary value
2550 */
2551#define _aesni_gf128mul_x_ble() \
2552 pshufd $0x13, IV, CTR; \
2553 paddq IV, IV; \
2554 psrad $31, CTR; \
2555 pand GF128MUL_MASK, CTR; \
2556 pxor CTR, IV;
2557
2558/*
2559 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2560 * bool enc, u8 *iv)
2561 */
2562ENTRY(aesni_xts_crypt8)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002563 FRAME_BEGIN
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002564 cmpb $0, %cl
2565 movl $0, %ecx
2566 movl $240, %r10d
2567 leaq _aesni_enc4, %r11
2568 leaq _aesni_dec4, %rax
2569 cmovel %r10d, %ecx
2570 cmoveq %rax, %r11
2571
2572 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2573 movups (IVP), IV
2574
2575 mov 480(KEYP), KLEN
2576 addq %rcx, KEYP
2577
2578 movdqa IV, STATE1
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002579 movdqu 0x00(INP), INC
2580 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002581 movdqu IV, 0x00(OUTP)
2582
2583 _aesni_gf128mul_x_ble()
2584 movdqa IV, STATE2
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002585 movdqu 0x10(INP), INC
2586 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002587 movdqu IV, 0x10(OUTP)
2588
2589 _aesni_gf128mul_x_ble()
2590 movdqa IV, STATE3
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002591 movdqu 0x20(INP), INC
2592 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002593 movdqu IV, 0x20(OUTP)
2594
2595 _aesni_gf128mul_x_ble()
2596 movdqa IV, STATE4
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002597 movdqu 0x30(INP), INC
2598 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002599 movdqu IV, 0x30(OUTP)
2600
David Woodhouse9697fa32018-01-11 21:46:27 +00002601 CALL_NOSPEC %r11
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002602
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002603 movdqu 0x00(OUTP), INC
2604 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002605 movdqu STATE1, 0x00(OUTP)
2606
2607 _aesni_gf128mul_x_ble()
2608 movdqa IV, STATE1
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002609 movdqu 0x40(INP), INC
2610 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002611 movdqu IV, 0x40(OUTP)
2612
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002613 movdqu 0x10(OUTP), INC
2614 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002615 movdqu STATE2, 0x10(OUTP)
2616
2617 _aesni_gf128mul_x_ble()
2618 movdqa IV, STATE2
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002619 movdqu 0x50(INP), INC
2620 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002621 movdqu IV, 0x50(OUTP)
2622
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002623 movdqu 0x20(OUTP), INC
2624 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002625 movdqu STATE3, 0x20(OUTP)
2626
2627 _aesni_gf128mul_x_ble()
2628 movdqa IV, STATE3
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002629 movdqu 0x60(INP), INC
2630 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002631 movdqu IV, 0x60(OUTP)
2632
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002633 movdqu 0x30(OUTP), INC
2634 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002635 movdqu STATE4, 0x30(OUTP)
2636
2637 _aesni_gf128mul_x_ble()
2638 movdqa IV, STATE4
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002639 movdqu 0x70(INP), INC
2640 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002641 movdqu IV, 0x70(OUTP)
2642
2643 _aesni_gf128mul_x_ble()
2644 movups IV, (IVP)
2645
David Woodhouse9697fa32018-01-11 21:46:27 +00002646 CALL_NOSPEC %r11
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002647
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002648 movdqu 0x40(OUTP), INC
2649 pxor INC, STATE1
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002650 movdqu STATE1, 0x40(OUTP)
2651
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002652 movdqu 0x50(OUTP), INC
2653 pxor INC, STATE2
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002654 movdqu STATE2, 0x50(OUTP)
2655
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002656 movdqu 0x60(OUTP), INC
2657 pxor INC, STATE3
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002658 movdqu STATE3, 0x60(OUTP)
2659
Jussi Kivilinnafe6510b2013-06-11 22:25:22 +03002660 movdqu 0x70(OUTP), INC
2661 pxor INC, STATE4
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002662 movdqu STATE4, 0x70(OUTP)
2663
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002664 FRAME_END
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002665 ret
2666ENDPROC(aesni_xts_crypt8)
2667
Mathias Krause0d258ef2010-11-27 16:34:46 +08002668#endif