/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released; however,
 * movaps is a byte shorter, so that is the one we'll use for now (same for
 * the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
47
#ifdef __x86_64__

# Constants live in mergeable 16-byte-constant sections so the linker can
# reorder and merge identical entries across objects.
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
# Mask used by XTS gf128mul_x_ble (multiply by x in GF(2^128), LE order).
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
# GHASH reduction polynomial, bit-reflected representation.
POLY:   .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
# pshufb mask that byte-reverses a 16-byte lane (endianness swap for GHASH).
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
# Little-endian 128-bit constant 1, used to increment the counter block.
ONE:        .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
# NOTE(review): literal has 31 hex digits; .octa zero-extends the high nibble.
# Matches the historical value — confirm against users before touching.
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
# (code below indexes relative to these labels, e.g. ALL_F-SHIFT_MASK(%r12)
# and ALL_F+16, so they must stay adjacent and in this order)
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


# FUNC_SAVE pushes three callee-saved GPRs (r12-r14), so the caller's stack
# arguments start 8*3 bytes further up from %rsp.
#define	STACK_OFFSET    8*3

# Byte offsets of fields inside struct gcm_context_data (pointed to by arg2).
#define AadHash 16*0		// current GHASH accumulator
#define AadLen 16*1		// length of AAD in bytes
#define InLen (16*1)+8		// total plaintext/ciphertext length so far
#define PBlockEncKey 16*2	// E(K, Yn) for the pending partial block
#define OrigIV 16*3		// Y0, needed for the final tag computation
#define CurCount 16*4		// current counter block
#define PBlockLen 16*5		// bytes already consumed of a partial block
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

# SysV AMD64: first six integer args arrive in registers, the rest on the
# stack above the return address (plus the three pushes from FUNC_SAVE).
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
# Number of AES rounds is stored at this offset inside the key schedule
# structure pointed to by arg1.
#define keysize 2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400135
136
# Register roles for the plain AES (non-GCM) routines: four parallel cipher
# states, four input blocks, the round key and the IV.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

# CTR-mode helpers (64-bit only paths use the high XMM registers).
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

# XTS tweak mask; aliases BSWAP_MASK — the two are never live simultaneously.
#define GF128MUL_MASK %xmm10
# GPR roles, chosen per word size.  On x86_64 KEYP/INP/LEN/IVP line up with
# the SysV argument registers so the C prototypes need no shuffling; on
# 32-bit x86 the args are loaded from the stack into these registers.
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100181
# Prologue for the GCM entry points: preserve the callee-saved GPRs that the
# GCM macros below clobber (r12-r14).  Must be paired with FUNC_RESTORE.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


# Epilogue: pop the registers saved by FUNC_SAVE, in reverse order.
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400198
# Precompute hashkeys.
# Input: Hash subkey (pointer passed as stack arg7).
# Output: HashKey..HashKey_4 and the Karatsuba XOR halves stored in
# gcm_context_data (%arg2).  Only needs to be called once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	arg7, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3		# byte-reflect the hash subkey

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3		# shift each 64-bit half left by 1
	psrlq	$63, \TMP2		# capture the bits shifted out
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2		# carry from low half into high half
	psrldq	$8, \TMP1		# carry out of the high half (for reduction)
	por	\TMP2, \TMP3		# TMP3 = HashKey<<1 (unreduced)

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2	# all-ones mask iff a carry came out
	pand	POLY(%rip), \TMP2	# conditionally select the polynomial
	pxor	\TMP2, \TMP3		# TMP3 = HashKey<<1 mod poly
	movdqa	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1	# swap 64-bit halves
	pxor	\TMP3, \TMP1		# high64 XOR low64, for Karatsuba
	movdqa	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%arg2)
.endm
Dave Watson7af964c2018-02-14 09:38:45 -0800255
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# (arg2 = gcm_context_data, arg6 = IV pointer, arg9 = aad length; the hash
# subkey and AAD come in via stack args consumed by PRECOMPUTE/CALC_AAD_HASH)
.macro GCM_INIT
	mov	arg9, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11, %r11
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	%arg6, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0			# byte-reflect the counter block
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	# derive HashKey..HashKey_4 once per key, then hash the AAD
	PRECOMPUTE %xmm1 %xmm2 %xmm3 %xmm4 %xmm5 %xmm6 %xmm7
	movdqa	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
	%xmm5 %xmm6
.endm
279
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
# arg3 = out, arg4 = in, arg5 = length; \operation is "enc" or "dec" and
# selects both the assembled code paths (.ifc) and the macro variants used.
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8		# resume GHASH accumulator
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)		# running total for the final tag

	xor	%r11, %r11			# initialise the data pointer offset as zero
	# finish any partial block left over from a previous update call
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5			# sub partial block data used
	mov	%arg5, %r13			# save the number of bytes

	and	$-16, %r13			# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	# (block_count mod 4) decides how many single blocks are handled
	# before entering the 4-wide main loop
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	# NOTE(review): the trailing literal "enc" argument is passed even on
	# the dec path; the _\operation suffix selects the variant — confirm
	# the last parameter is unused before changing it.
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)		# persist GHASH state
	movdqu	%xmm0, CurCount(%arg2)		# persist counter

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)		# record trailing partial block

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)	# keystream for a later resume

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
.ifc \operation, dec
	movdqa	%xmm1, %xmm2			# keep raw ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0			# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8			# fold ciphertext into GHASH
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8			# fold ciphertext into GHASH
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)		# write low 8 bytes at once
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)		# then byte-by-byte
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
410
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# arg10 = tag output pointer, arg11 = requested tag length (1..16 bytes).
.macro GCM_COMPLETE
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	# fold the pending partial block into the hash before finalizing
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag = E(K,Y0) XOR GHASH
_return_T_\@:
	mov	arg10, %r10		# %r10 = authTag
	mov	arg11, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	# 1-3 bytes remain: emit a 16-bit chunk then a final byte as needed
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
483
Mathias Krause559ad0f2010-11-29 08:35:39 +0800484#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Uses one-level Karatsuba: three 64x64 carry-less multiplies instead of four.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2	# swap the 64-bit halves
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = middle Karatsuba term a0*b1+a1*b0
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the 256-bit result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
544
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
# Never reads past DPTR+DLEN-1, so it is safe at the end of a buffer.
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax		# >= 8 bytes: bulk-load the low qword
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	# accumulate the remaining 1-7 bytes from high address downward
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1		# place in the high qword
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	# < 8 bytes total: byte-by-byte into the low qword
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
575
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
# AAD pointer/length come in via stack args 8/9; the resulting GHASH value
# is stored into AadHash(%arg2).
.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg8, %r10		# %r10 = AAD
	mov	   arg9, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6		# running hash starts at zero

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm
614
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# DATA_OFFSET is advanced by the number of bytes consumed here, so the caller
# can continue processing from the block boundary.
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13	# reload: %r13 clobbered by the read

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9	# saved keystream E(K, Yn)
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3		# keep raw ciphertext for GHASH
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH	# fold ciphertext into running hash

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)	# block now complete
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still partial
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH	# fold ciphertext into running hash

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)	# block now complete
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still partial
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13	# input ran out before filling the block
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)	# low qword at once
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)		# then byte-wise
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
758
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400759/*
760* if a = number of total plaintext bytes
761* b = floor(a/16)
762* num_initial_blocks = b mod 4
763* encrypt the initial num_initial_blocks blocks and apply ghash on
764* the ciphertext
765* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
766* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800767* arg1, %arg2, %arg3 are used as a pointer only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400768*/
769
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400770
Dave Watsone1fd3162018-02-14 09:38:12 -0800771.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800772 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Dave Watson96604742018-02-14 09:39:45 -0800773 MOVADQ SHUF_MASK(%rip), %xmm14
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200774
Dave Watsonc594c542018-02-14 09:39:36 -0800775 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200776
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200777 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800778
Dave Watson96604742018-02-14 09:39:45 -0800779 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800780
781.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800782
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500783 MOVADQ ONE(%RIP),\TMP1
784 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800785.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500786 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800787.ifc \operation, dec
788 movdqa \XMM0, %xmm\index
789.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500790 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800791.endif
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500792 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
793 pxor \TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800794.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500795 lea 0x10(%arg1),%r10
796 mov keysize,%eax
797 shr $2,%eax # 128->4, 192->6, 256->8
798 add $5,%eax # 128->9, 192->11, 256->13
799
Dave Watsone1fd3162018-02-14 09:38:12 -0800800aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500801 MOVADQ (%r10),\TMP1
802.irpc index, \i_seq
803 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800804.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500805 add $16,%r10
806 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800807 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500808
809 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800810.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500811 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800812.endr
813.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800814 movdqu (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800815 pxor \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800816 movdqu %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800817 # write back plaintext/ciphertext for num_initial_blocks
818 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800819
820.ifc \operation, dec
821 movdqa \TMP1, %xmm\index
822.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800823 PSHUFB_XMM %xmm14, %xmm\index
824
825 # prepare plaintext/ciphertext for GHASH computation
826.endr
827.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200828
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800829 # apply GHASH on num_initial_blocks blocks
830
831.if \i == 5
832 pxor %xmm5, %xmm6
833 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
834 pxor %xmm6, %xmm7
835 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
836 pxor %xmm7, %xmm8
837 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
838.elseif \i == 6
839 pxor %xmm6, %xmm7
840 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
841 pxor %xmm7, %xmm8
842 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
843.elseif \i == 7
844 pxor %xmm7, %xmm8
845 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
846.endif
847 cmp $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800848 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800849 # no need for precomputed values
850/*
851*
852* Precomputations for HashKey parallel with encryption of first 4 blocks.
853* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
854*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500855 MOVADQ ONE(%RIP),\TMP1
856 paddd \TMP1, \XMM0 # INCR Y0
857 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800858 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
859
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500860 paddd \TMP1, \XMM0 # INCR Y0
861 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800862 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
863
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500864 paddd \TMP1, \XMM0 # INCR Y0
865 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800866 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
867
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500868 paddd \TMP1, \XMM0 # INCR Y0
869 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800870 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
871
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500872 MOVADQ 0(%arg1),\TMP1
873 pxor \TMP1, \XMM1
874 pxor \TMP1, \XMM2
875 pxor \TMP1, \XMM3
876 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800877.irpc index, 1234 # do 4 rounds
878 movaps 0x10*\index(%arg1), \TMP1
879 AESENC \TMP1, \XMM1
880 AESENC \TMP1, \XMM2
881 AESENC \TMP1, \XMM3
882 AESENC \TMP1, \XMM4
883.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800884.irpc index, 56789 # do next 5 rounds
885 movaps 0x10*\index(%arg1), \TMP1
886 AESENC \TMP1, \XMM1
887 AESENC \TMP1, \XMM2
888 AESENC \TMP1, \XMM3
889 AESENC \TMP1, \XMM4
890.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500891 lea 0xa0(%arg1),%r10
892 mov keysize,%eax
893 shr $2,%eax # 128->4, 192->6, 256->8
894 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800895 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500896
Dave Watsone1fd3162018-02-14 09:38:12 -0800897aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500898 MOVADQ (%r10),\TMP2
899.irpc index, 1234
900 AESENC \TMP2, %xmm\index
901.endr
902 add $16,%r10
903 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800904 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500905
Dave Watsone1fd3162018-02-14 09:38:12 -0800906aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500907 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800908 AESENCLAST \TMP2, \XMM1
909 AESENCLAST \TMP2, \XMM2
910 AESENCLAST \TMP2, \XMM3
911 AESENCLAST \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800912 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800913 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800914.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800915 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800916 movdqa \TMP1, \XMM1
917.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800918 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800919 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800920.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800921 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800922 movdqa \TMP1, \XMM2
923.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800924 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800925 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800926.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800927 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800928 movdqa \TMP1, \XMM3
929.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800930 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800931 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800932.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800933 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800934 movdqa \TMP1, \XMM4
935.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800936 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
938 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
939 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800940.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800941
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400942 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800943 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400944 pxor \XMMDst, \XMM1
945# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800946 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800947 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800948 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
949
Dave Watsone1fd3162018-02-14 09:38:12 -0800950_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800951
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400952.endm
953
954/*
955* encrypt 4 blocks at a time
956* ghash the 4 previously encrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800957* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400958* %r11 is the data offset value
959*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Encrypt 4 counter blocks and, in parallel, GHASH the 4 previously
# encrypted ciphertext blocks (kept in XMM1-XMM4 on entry).
# In:    %arg1 = key schedule, %arg2 = gcm_context_data,
#        %arg3 = dst, %arg4 = src, %r11 = data offset, XMM0 = counter.
# Out:   XMM1-XMM4 = next 4 byte-swapped ciphertext blocks, XMM1 also
#        accumulates the GHASH result; %r11 is NOT advanced here.
# Clobbers: %r10, %eax, %xmm15, TMP1-TMP6, XMM5-XMM8, flags.
# NOTE: loop labels carry the gas \@ expansion counter so the macro can
# be expanded more than once without duplicate-symbol errors.

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 (block 1) * HashKey^4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# whitening (round key 0)
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%arg2), \TMP5

	# Multiply XMM7 (block 3) * HashKey^2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1			# fold GHASH state into XMM1
.endm
1161
1162/*
1163* decrypt 4 blocks at a time
1164* ghash the 4 previously decrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001165* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001166* %r11 is the data offset value
1167*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Decrypt 4 blocks at a time and, in parallel, GHASH the 4 previously
# decrypted ciphertext blocks (kept in XMM1-XMM4 on entry).
# In:    %arg1 = key schedule, %arg2 = gcm_context_data,
#        %arg3 = dst, %arg4 = src, %r11 = data offset, XMM0 = counter.
# Out:   XMM1-XMM4 = next 4 byte-swapped ciphertext blocks, XMM1 also
#        accumulates the GHASH result; %r11 is NOT advanced here.
# Clobbers: %r10, %eax, %xmm15, TMP1-TMP6, XMM5-XMM8, flags.
# NOTE: loop labels carry the gas \@ expansion counter so the macro can
# be expanded more than once without duplicate-symbol errors.

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 (block 1) * HashKey^4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# whitening (round key 0)
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%arg2), \TMP5

	# Multiply XMM7 (block 3) * HashKey^2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1			# keep ciphertext for GHASH
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2			# keep ciphertext for GHASH
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3			# keep ciphertext for GHASH
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4			# keep ciphertext for GHASH
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1			# fold GHASH state into XMM1
.endm
1373
/* GHASH the last 4 ciphertext blocks.
 * In:  XMM1-XMM4 = byte-swapped ciphertext blocks,
 *      %arg2 = gcm_context_data (precomputed HashKey powers).
 * Out: XMMDst = reduced GHASH accumulator.
 * Each block is multiplied by the matching power of H (XMM1*H^4 ...
 * XMM4*H^1) via Karatsuba, the partial products are accumulated, and a
 * single GF(2^128) reduction is done at the end.
 * Clobbers: TMP1-TMP7, XMM1-XMM4, flags.
 */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey^4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey^3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey^2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2                # packed left shift << 31
	pslld	  $30, \TMP3                # packed left shift << 30
	pslld	  $25, \TMP4                # packed left shift << 25
	pxor	  \TMP3, \TMP2              # xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7                 # right shift TMP7 1 DW
	pslldq	  $12, \TMP2                # left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2                 # packed right shift >> 1
	psrld	  $2, \TMP3                 # packed right shift >> 2
	psrld	  $7, \TMP4                 # packed right shift >> 7
	pxor	  \TMP3, \TMP2              # xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst            # reduced result is in XMMDst
.endm
1470
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001471
/* Encrypt one 16-byte block in place.
 * In:  \STATE  = plaintext block, %arg1 = expanded AES key schedule,
 *      keysize = key length selector (read via %arg-relative symbol).
 * Out: \STATE  = ciphertext block.
 * Clobbers: \TMPKEY, %eax, %r10, flags.
 */

.macro ENCRYPT_SINGLE_BLOCK STATE TMPKEY

	mov	   keysize, %eax
	shr	   $2, %eax		# 128->4, 192->6, 256->8
	add	   $5, %eax		# 128->9, 192->11, 256->13 inner rounds
	pxor	   (%arg1), \STATE	# whitening: XOR in round key 0
	lea	   16(%arg1), %r10	# %r10 -> first expanded round key

_esb_loop_\@:
	MOVADQ	   (%r10), \TMPKEY
	AESENC	   \TMPKEY, \STATE
	add	   $16, %r10
	sub	   $1, %eax
	jnz	   _esb_loop_\@

	MOVADQ	   (%r10), \TMPKEY
	AESENCLAST \TMPKEY, \STATE	# final round
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001494/*****************************************************************************
1495* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001496* struct gcm_context_data *data
1497* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001498* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1499* const u8 *in, // Ciphertext input
1500* u64 plaintext_len, // Length of data in bytes for decryption.
1501* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1502* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1503* // concatenated with 0x00000001. 16-byte aligned pointer.
1504* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1505* const u8 *aad, // Additional Authentication Data (AAD)
1506* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1507* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1508* // given authentication tag and only return the plaintext if they match.
1509* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1510* // (most likely), 12 or 8.
1511*
1512* Assumptions:
1513*
1514* keys:
1515* keys are pre-expanded and aligned to 16 bytes. we are using the first
1516* set of 11 keys in the data structure void *aes_ctx
1517*
1518* iv:
1519* 0 1 2 3
1520* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1521* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1522* | Salt (From the SA) |
1523* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524* | Initialization Vector |
1525* | (This is the sequence number from IPSec header) |
1526* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1527* | 0x1 |
1528* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1529*
1530*
1531*
1532* AAD:
1533* AAD padded to 128 bits with 0
1534* for example, assume AAD is a u32 vector
1535*
1536* if AAD is 8 bytes:
1537* AAD[3] = {A0, A1};
1538* padded AAD in xmm register = {A1 A0 0 0}
1539*
1540* 0 1 2 3
1541* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543* | SPI (A1) |
1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545* | 32-bit Sequence Number (A0) |
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | 0x0 |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*
1550* AAD Format with 32-bit Sequence Number
1551*
1552* if AAD is 12 bytes:
1553* AAD[3] = {A0, A1, A2};
1554* padded AAD in xmm register = {A2 A1 A0 0}
1555*
1556* 0 1 2 3
1557* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1560* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561* | SPI (A2) |
1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563* | 64-bit Extended Sequence Number {A1,A0} |
1564* | |
1565* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566* | 0x0 |
1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568*
1569* AAD Format with 64-bit Extended Sequence Number
1570*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001571* poly = x^128 + x^127 + x^126 + x^121 + 1
1572*
1573*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE		# save callee-saved regs / set up frame

	GCM_INIT		# compute subkeys, hash AAD, prepare counter
	GCM_ENC_DEC dec		# bulk decrypt + GHASH (dec: ghash ciphertext)
	GCM_COMPLETE		# finalize GHASH, emit authentication tag
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001583
1584
1585/*****************************************************************************
1586* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001587* struct gcm_context_data *data
1588* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001589* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1590* const u8 *in, // Plaintext input
1591* u64 plaintext_len, // Length of data in bytes for encryption.
1592* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1593* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1594* // concatenated with 0x00000001. 16-byte aligned pointer.
1595* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1596* const u8 *aad, // Additional Authentication Data (AAD)
1597* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1598* u8 *auth_tag, // Authenticated Tag output.
1599* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1600* // 12 or 8.
1601*
1602* Assumptions:
1603*
1604* keys:
1605* keys are pre-expanded and aligned to 16 bytes. we are using the
1606* first set of 11 keys in the data structure void *aes_ctx
1607*
1608*
1609* iv:
1610* 0 1 2 3
1611* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1612* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1613* | Salt (From the SA) |
1614* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1615* | Initialization Vector |
1616* | (This is the sequence number from IPSec header) |
1617* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1618* | 0x1 |
1619* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1620*
1621*
1622*
1623* AAD:
1624* AAD padded to 128 bits with 0
1625* for example, assume AAD is a u32 vector
1626*
1627* if AAD is 8 bytes:
1628* AAD[3] = {A0, A1};
1629* padded AAD in xmm register = {A1 A0 0 0}
1630*
1631* 0 1 2 3
1632* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634* | SPI (A1) |
1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636* | 32-bit Sequence Number (A0) |
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | 0x0 |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*
1641* AAD Format with 32-bit Sequence Number
1642*
1643* if AAD is 12 bytes:
1644* AAD[3] = {A0, A1, A2};
1645* padded AAD in xmm register = {A2 A1 A0 0}
1646*
1647* 0 1 2 3
1648* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650* | SPI (A2) |
1651* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652* | 64-bit Extended Sequence Number {A1,A0} |
1653* | |
1654* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655* | 0x0 |
1656* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657*
1658* AAD Format with 64-bit Extended Sequence Number
1659*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001660* poly = x^128 + x^127 + x^126 + x^121 + 1
1661***************************************************************************/
/*
 * aesni_gcm_enc: AES-GCM encryption entry point (signature documented in
 * the comment block above).  Identical structure to aesni_gcm_dec, with
 * GCM_ENC_DEC parameterised as "enc".  The macros are defined earlier in
 * this file, outside this chunk.
 */
ENTRY(aesni_gcm_enc)
	FUNC_SAVE			# save registers the GCM macros clobber

	GCM_INIT
	GCM_ENC_DEC enc			# encrypt variant of the shared macro
	GCM_COMPLETE			# produce the authentication tag
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001671
Mathias Krause559ad0f2010-11-29 08:35:39 +08001672#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001673
1674
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * Derive the next round key from the previous one plus the
 * AESKEYGENASSIST result, and store it in the key schedule.
 * input:
 *	%xmm0: previous round key
 *	%xmm1: AESKEYGENASSIST result for this round
 *	%xmm4: assumed zero (cleared by the caller, aesni_set_key)
 *	TKEYP: pointer to the next key-schedule slot
 * output:
 *	%xmm0: new round key (also stored at (TKEYP))
 *	TKEYP: advanced by 0x10
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygen-assist dword
	shufps $0b00010000, %xmm0, %xmm4	# slide prior key words in via xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# mix in the keygen-assist word
	movaps %xmm0, (TKEYP)			# append new round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001689
/*
 * _key_expansion_192a: internal ABI
 * AES-192 key expansion step that emits 0x20 bytes (two 16-byte
 * key-schedule entries built from xmm0 and the 64-bit tail in xmm2).
 * input:
 *	%xmm0: previous round key
 *	%xmm1: AESKEYGENASSIST result for this round
 *	%xmm2: low 8 bytes hold the remaining 64 bits of key material
 *	%xmm4: assumed zero (cleared by the caller, aesni_set_key)
 *	TKEYP: pointer to the next key-schedule slot
 * output:
 *	%xmm0, %xmm2: updated key state for the next step
 *	TKEYP: advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen-assist dword
	shufps $0b00010000, %xmm0, %xmm4	# slide prior key words in via xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast last word of xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# pack first schedule entry
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1	# pack second schedule entry
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001714
/*
 * _key_expansion_192b: internal ABI
 * AES-192 key expansion step that emits a single 16-byte key-schedule
 * entry (companion to _key_expansion_192a, which emits two).
 * input:
 *	%xmm0: previous round key
 *	%xmm1: AESKEYGENASSIST result for this round
 *	%xmm2: low 8 bytes hold the remaining 64 bits of key material
 *	%xmm4: assumed zero (cleared by the caller, aesni_set_key)
 *	TKEYP: pointer to the next key-schedule slot
 * output:
 *	%xmm0, %xmm2: updated key state for the next step
 *	TKEYP: advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm5
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygen-assist dword
	shufps $0b00010000, %xmm0, %xmm4	# slide prior key words in via xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast last word of xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)			# append new round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001734
/*
 * _key_expansion_256b: internal ABI
 * AES-256 odd key expansion step: derives the next round key into
 * %xmm2 (the "second half" of the 256-bit key state), while
 * _key_expansion_256a handles the %xmm0 half.
 * input:
 *	%xmm2: previous round key (second half)
 *	%xmm1: AESKEYGENASSIST result for this round
 *	%xmm4: assumed zero (cleared by the caller, aesni_set_key)
 *	TKEYP: pointer to the next key-schedule slot
 * output:
 *	%xmm2: new round key (also stored at (TKEYP))
 *	TKEYP: advanced by 0x10
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygen-assist dword
	shufps $0b00010000, %xmm2, %xmm4	# slide prior key words in via xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2			# mix in the keygen-assist word
	movaps %xmm2, (TKEYP)			# append new round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001747
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 *
 * Expands in_key into the encryption key schedule at the start of ctx,
 * stores key_len at ctx offset 480, then derives the decryption key
 * schedule (AESIMC of each encryption round key, in reverse order)
 * starting at ctx offset 240.  Returns 0 in AREG.
 * The 128/192/256-bit variants dispatch on key_len (16/24/32 bytes);
 * round constants 0x1..0x80 (plus 0x1b, 0x36 for AES-128) are fed to
 * AESKEYGENASSIST and the _key_expansion_* helpers above.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# remember key length for enc/dec entry points
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl			# dispatch: <24 = AES-128, ==24 = AES-192
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key (AES-256: second 16 bytes)
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key (AES-192: 8 more bytes)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule: swap first/last round keys, then
	# fill the middle with InvMixColumns of the encryption round keys,
	# walking KEYP forward and UKEYP (reused as a cursor) backward.
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1		# InvMixColumns for equivalent inverse cipher
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001862
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt a single 16-byte block: dst = AES-encrypt(ctx, src).
 * On 32-bit, arguments come from the stack and KEYP/KLEN must be
 * preserved manually; on 64-bit they arrive in registers (aliases
 * defined earlier in this file).
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length (stored by aesni_set_key)
	movups (INP), STATE		# input (may be unaligned)
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001886
/*
 * _aesni_enc1: internal ABI
 *	Encrypt the single block held in STATE through the full round
 *	sequence; TKEYP is biased so that the common AES-128 tail uses
 *	the same offsets for all key sizes.
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length in bytes (16/24/32, from offset 480 of the struct)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0 (whitening)
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128			# AES-128: skip the extra rounds
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP		# AES-256: two rounds more than AES-192
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# final round (no MixColumns)
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001944
/*
 * _aesni_enc4: internal ABI
 *	Encrypt four independent blocks in parallel (same round structure
 *	as _aesni_enc1, unrolled 4-wide to hide AESENC latency).
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0 (whitening)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128			# AES-128: skip the extra rounds
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP		# AES-256
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002053
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt a single 16-byte block: dst = AES-decrypt(ctx, src).
 * The decryption key schedule lives at ctx + 240 (built by
 * aesni_set_key), hence the `add $240, KEYP`.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length (stored by aesni_set_key)
	add $240, KEYP			# switch to the decryption key schedule
	movups (INP), STATE		# input (may be unaligned)
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002078
/*
 * _aesni_dec1: internal ABI
 *	Decrypt the single block held in STATE (mirror of _aesni_enc1,
 *	using AESDEC against the inverse key schedule).
 * input:
 *	KEYP: key struct pointer (already offset to the decryption schedule)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0 (whitening)
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# AES-128: skip the extra rounds
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP		# AES-256
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# final round (no InvMixColumns)
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002136
/*
 * _aesni_dec4: internal ABI
 *	Decrypt four independent blocks in parallel (4-wide unroll of
 *	_aesni_dec1 to hide AESDEC latency).
 * input:
 *	KEYP: key struct pointer (already offset to the decryption schedule)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0 (whitening)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128			# AES-128: skip the extra rounds
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP		# AES-256
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002245
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-mode encryption.  Processes 4 blocks at a time via _aesni_enc4
 * while >= 64 bytes remain, then single blocks; any tail shorter than
 * 16 bytes is ignored.
 * (Note: the const belongs on src, not dst, as in the C prototype.)
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:			# 4-block fast path
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:			# single-block tail
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002305
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-mode decryption.  Same 4-block/1-block loop structure as
 * aesni_ecb_enc, using the decryption key schedule at ctx + 240.
 * (Note: the const belongs on src, not dst, as in the C prototype.)
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# switch to the decryption key schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:			# 4-block fast path
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:			# single-block tail
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002366
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-mode encryption.  Inherently serial (each block is chained into
 * the next), so only a single-block loop is used.  The final ciphertext
 * block is written back to *iv for chained calls.  Tails shorter than
 * 16 bytes are ignored.
 * (Note: the const belongs on src, not dst, as in the C prototype.)
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: xor plaintext with previous ciphertext/iv
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# save last ciphertext block as next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002410
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes (truncated to a multiple of 16) from src to dst.
 * The last ciphertext block is written back to *iv for chaining.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* i386: args arrive on the stack; save regs the macros will use */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# <1 block: leave *iv untouched
	mov 480(KEYP), KLEN
	add $240, KEYP		# decryption round keys start at ctx + 240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:		# main loop: 4 blocks per iteration
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* i386 has only 8 XMM regs: IN1/IN2 are reused, reloaded below */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# CBC: xor previous ciphertext into plaintext
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# block 3's ciphertext chains into next iter
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1	# reload ciphertext blocks 0/1 (regs were reused)
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:		# tail: one block at a time
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block is the next IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# write back IV so the caller can continue
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002503
Mathias Krause0d258ef2010-11-27 16:34:46 +08002504#ifdef __x86_64__
.pushsection .rodata
.align 16
/* PSHUFB shuffle mask reversing all 16 bytes: big <-> little endian */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002510
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# CTR = byte-swapped (little-endian) IV
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror low qword in a GPR (carry test)
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002532
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR		# 64-bit SIMD add exposes no carry flag...
	add $1, TCTR_LOW	# ...so mirror the add in a GPR to detect it
	jnc .Linc_low
	pslldq $8, INC		# carry out of low qword: bump high qword too
	paddq INC, CTR
	psrldq $8, INC		# restore INC == 1 in the low qword
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert back to big endian
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002561
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encrypt/decrypt: dst = src xor AES(counter), counter taken
 * from *iv (big endian) and written back advanced for chaining.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# <1 block: leave *iv untouched
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init	# set up CTR/TCTR_LOW/INC/BSWAP_MASK
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:		# main loop: 4 counter blocks per iteration
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4	# encrypt the four counter blocks
	pxor IN1, STATE1	# keystream xor input
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:		# tail: one block at a time
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# write back the advanced counter
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002624
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *
 *	Doubles IV in GF(2^128) with the XTS "ble" (blockwise little
 *	endian) convention: both qwords are shifted left one bit
 *	(paddq IV, IV), and the mask built from the shifted-out top
 *	bits (pshufd/psrad replicates each qword's sign bit) folds the
 *	reduction polynomial 0x87 back in via GF128MUL_MASK.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2642
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * XTS-process 8 consecutive 16-byte blocks (encrypt when enc is non-zero,
 * decrypt otherwise).  The per-block tweaks are temporarily stashed in the
 * destination buffer and xored back in after the block cipher runs; the
 * 9th tweak is written back to *iv for the next call.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl		# arg4 (enc) arrives in %cl (SysV AMD64)
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx	# decrypt: decryption keys live at ctx + 240
	cmoveq %rax, %r11	# decrypt: dispatch to _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP		# KEYP += 0 (enc) or 240 (dec)

	movdqa IV, STATE1
	movdqu 0x00(INP), INC	# INC reused as a plain data temporary here
	pxor INC, STATE1	# pre-whiten: plaintext xor tweak
	movdqu IV, 0x00(OUTP)	# stash tweak in dst; reloaded after the call

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11	# _aesni_enc4 or _aesni_dec4 (retpoline-safe)

	movdqu 0x00(OUTP), INC	# reload stashed tweak for block 0
	pxor INC, STATE1	# post-whiten: cipher output xor tweak
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1	# start loading blocks 4-7 between xors
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# write back 9th tweak for the next call

	CALL_NOSPEC %r11	# second batch: blocks 4-7

	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2752
Mathias Krause0d258ef2010-11-27 16:34:46 +08002753#endif