blob: d1436c37008b4ed6379c5b94a3d80d18a21b94ea [file] [log] [blame]
Thomas Gleixner2874c5f2019-05-27 08:55:01 +02001/* SPDX-License-Identifier: GPL-2.0-or-later */
Huang Ying54b6a1b2009-01-18 16:28:34 +11002/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
11 * Kahraman Akdemir
12 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040013 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
23 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080024 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 */
27
28#include <linux/linkage.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060029#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000030#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110031
Timothy McCaffreye31ac322015-01-13 13:16:43 -050032/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register. This can done for either FP or integer values, for FP use
35 * movaps (move aligned packed single) or integer use movdqa (move double quad
36 * aligned). It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released. However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
39 */
40#define MOVADQ movaps
41#define MOVUDQ movups
42
Mathias Krause559ad0f2010-11-29 08:35:39 +080043#ifdef __x86_64__
Timothy McCaffreye31ac322015-01-13 13:16:43 -050044
Denys Vlasenkoe1839142017-01-19 22:33:04 +010045# constants in mergeable sections, linker can reorder and merge
46.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +030047.align 16
48.Lgf128mul_x_ble_mask:
49 .octa 0x00000000000000010000000000000087
Denys Vlasenkoe1839142017-01-19 22:33:04 +010050.section .rodata.cst16.POLY, "aM", @progbits, 16
51.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040052POLY: .octa 0xC2000000000000000000000000000001
Denys Vlasenkoe1839142017-01-19 22:33:04 +010053.section .rodata.cst16.TWOONE, "aM", @progbits, 16
54.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040055TWOONE: .octa 0x00000001000000000000000000000001
56
Denys Vlasenkoe1839142017-01-19 22:33:04 +010057.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
58.align 16
59SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
60.section .rodata.cst16.MASK1, "aM", @progbits, 16
61.align 16
62MASK1: .octa 0x0000000000000000ffffffffffffffff
63.section .rodata.cst16.MASK2, "aM", @progbits, 16
64.align 16
65MASK2: .octa 0xffffffffffffffff0000000000000000
66.section .rodata.cst16.ONE, "aM", @progbits, 16
67.align 16
68ONE: .octa 0x00000000000000000000000000000001
69.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
70.align 16
71F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72.section .rodata.cst16.dec, "aM", @progbits, 16
73.align 16
74dec: .octa 0x1
75.section .rodata.cst16.enc, "aM", @progbits, 16
76.align 16
77enc: .octa 0x2
78
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040079# order of these constants should not change.
80# more specifically, ALL_F should follow SHIFT_MASK,
Denys Vlasenkoe1839142017-01-19 22:33:04 +010081# and zero should follow ALL_F
82.section .rodata, "a", @progbits
83.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040084SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85ALL_F: .octa 0xffffffffffffffffffffffffffffffff
Denys Vlasenkoe1839142017-01-19 22:33:04 +010086 .octa 0x00000000000000000000000000000000
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040087
Huang Ying54b6a1b2009-01-18 16:28:34 +110088.text
89
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040090
91#define STACK_OFFSET 8*3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040092
Dave Watson9ee4a5d2018-02-14 09:39:23 -080093#define AadHash 16*0
94#define AadLen 16*1
95#define InLen (16*1)+8
96#define PBlockEncKey 16*2
97#define OrigIV 16*3
98#define CurCount 16*4
99#define PBlockLen 16*5
Dave Watson1476db22018-02-14 09:40:10 -0800100#define HashKey 16*6 // store HashKey <<1 mod poly here
101#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
102#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
103#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
104#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
105 // bits of HashKey <<1 mod poly here
106 //(for Karatsuba purposes)
107#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^2 <<1 mod poly here
109 // (for Karatsuba purposes)
110#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^3 <<1 mod poly here
112 // (for Karatsuba purposes)
113#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
114 // bits of HashKey^4 <<1 mod poly here
115 // (for Karatsuba purposes)
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800116
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400117#define arg1 rdi
118#define arg2 rsi
119#define arg3 rdx
120#define arg4 rcx
121#define arg5 r8
122#define arg6 r9
Dave Watson1476db22018-02-14 09:40:10 -0800123#define arg7 STACK_OFFSET+8(%rsp)
124#define arg8 STACK_OFFSET+16(%rsp)
125#define arg9 STACK_OFFSET+24(%rsp)
126#define arg10 STACK_OFFSET+32(%rsp)
127#define arg11 STACK_OFFSET+40(%rsp)
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500128#define keysize 2*15*16(%arg1)
Mathias Krause559ad0f2010-11-29 08:35:39 +0800129#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400130
131
Huang Ying54b6a1b2009-01-18 16:28:34 +1100132#define STATE1 %xmm0
133#define STATE2 %xmm4
134#define STATE3 %xmm5
135#define STATE4 %xmm6
136#define STATE STATE1
137#define IN1 %xmm1
138#define IN2 %xmm7
139#define IN3 %xmm8
140#define IN4 %xmm9
141#define IN IN1
142#define KEY %xmm2
143#define IV %xmm3
Mathias Krause0d258ef2010-11-27 16:34:46 +0800144
Huang Ying12387a42010-03-10 18:28:55 +0800145#define BSWAP_MASK %xmm10
146#define CTR %xmm11
147#define INC %xmm12
Huang Ying54b6a1b2009-01-18 16:28:34 +1100148
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +0300149#define GF128MUL_MASK %xmm10
150
Mathias Krause0d258ef2010-11-27 16:34:46 +0800151#ifdef __x86_64__
152#define AREG %rax
Huang Ying54b6a1b2009-01-18 16:28:34 +1100153#define KEYP %rdi
154#define OUTP %rsi
Mathias Krause0d258ef2010-11-27 16:34:46 +0800155#define UKEYP OUTP
Huang Ying54b6a1b2009-01-18 16:28:34 +1100156#define INP %rdx
157#define LEN %rcx
158#define IVP %r8
159#define KLEN %r9d
160#define T1 %r10
161#define TKEYP T1
162#define T2 %r11
Huang Ying12387a42010-03-10 18:28:55 +0800163#define TCTR_LOW T2
Mathias Krause0d258ef2010-11-27 16:34:46 +0800164#else
165#define AREG %eax
166#define KEYP %edi
167#define OUTP AREG
168#define UKEYP OUTP
169#define INP %edx
170#define LEN %esi
171#define IVP %ebp
172#define KLEN %ebx
173#define T1 %ecx
174#define TKEYP T1
175#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100176
Dave Watson6c2c86b2018-02-14 09:38:35 -0800177.macro FUNC_SAVE
178 push %r12
179 push %r13
180 push %r14
Dave Watson6c2c86b2018-02-14 09:38:35 -0800181#
182# states of %xmm registers %xmm6:%xmm15 not saved
183# all %xmm registers are clobbered
184#
Dave Watson6c2c86b2018-02-14 09:38:35 -0800185.endm
186
187
188.macro FUNC_RESTORE
Dave Watson6c2c86b2018-02-14 09:38:35 -0800189 pop %r14
190 pop %r13
191 pop %r12
192.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400193
Dave Watson1476db22018-02-14 09:40:10 -0800194# Precompute hashkeys.
195# Input: Hash subkey.
196# Output: HashKeys stored in gcm_context_data. Only needs to be called
197# once per key.
198# clobbers r12, and tmp xmm registers.
Dave Watsonfb8986e2018-02-14 09:40:47 -0800199.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
200 mov \SUBKEY, %r12
Dave Watson1476db22018-02-14 09:40:10 -0800201 movdqu (%r12), \TMP3
202 movdqa SHUF_MASK(%rip), \TMP2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200203 pshufb \TMP2, \TMP3
Dave Watson1476db22018-02-14 09:40:10 -0800204
205 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
206
207 movdqa \TMP3, \TMP2
208 psllq $1, \TMP3
209 psrlq $63, \TMP2
210 movdqa \TMP2, \TMP1
211 pslldq $8, \TMP2
212 psrldq $8, \TMP1
213 por \TMP2, \TMP3
214
215 # reduce HashKey<<1
216
217 pshufd $0x24, \TMP1, \TMP2
218 pcmpeqd TWOONE(%rip), \TMP2
219 pand POLY(%rip), \TMP2
220 pxor \TMP2, \TMP3
Dave Watsone5b954e2018-08-15 10:29:42 -0700221 movdqu \TMP3, HashKey(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800222
223 movdqa \TMP3, \TMP5
224 pshufd $78, \TMP3, \TMP1
225 pxor \TMP3, \TMP1
Dave Watsone5b954e2018-08-15 10:29:42 -0700226 movdqu \TMP1, HashKey_k(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800227
228 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
229# TMP5 = HashKey^2<<1 (mod poly)
Dave Watsone5b954e2018-08-15 10:29:42 -0700230 movdqu \TMP5, HashKey_2(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800231# HashKey_2 = HashKey^2<<1 (mod poly)
232 pshufd $78, \TMP5, \TMP1
233 pxor \TMP5, \TMP1
Dave Watsone5b954e2018-08-15 10:29:42 -0700234 movdqu \TMP1, HashKey_2_k(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800235
236 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
237# TMP5 = HashKey^3<<1 (mod poly)
Dave Watsone5b954e2018-08-15 10:29:42 -0700238 movdqu \TMP5, HashKey_3(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800239 pshufd $78, \TMP5, \TMP1
240 pxor \TMP5, \TMP1
Dave Watsone5b954e2018-08-15 10:29:42 -0700241 movdqu \TMP1, HashKey_3_k(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800242
243 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
244# TMP5 = HashKey^3<<1 (mod poly)
Dave Watsone5b954e2018-08-15 10:29:42 -0700245 movdqu \TMP5, HashKey_4(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800246 pshufd $78, \TMP5, \TMP1
247 pxor \TMP5, \TMP1
Dave Watsone5b954e2018-08-15 10:29:42 -0700248 movdqu \TMP1, HashKey_4_k(%arg2)
Dave Watson1476db22018-02-14 09:40:10 -0800249.endm
Dave Watson7af964c2018-02-14 09:38:45 -0800250
251# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
252# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
Dave Watsonfb8986e2018-02-14 09:40:47 -0800253.macro GCM_INIT Iv SUBKEY AAD AADLEN
254 mov \AADLEN, %r11
Dave Watson96604742018-02-14 09:39:45 -0800255 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
Jan Beulicha7bea832018-07-02 04:31:54 -0600256 xor %r11d, %r11d
Dave Watson96604742018-02-14 09:39:45 -0800257 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
258 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
259 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
Dave Watsonfb8986e2018-02-14 09:40:47 -0800260 mov \Iv, %rax
Dave Watson96604742018-02-14 09:39:45 -0800261 movdqu (%rax), %xmm0
262 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
263
264 movdqa SHUF_MASK(%rip), %xmm2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200265 pshufb %xmm2, %xmm0
Dave Watson96604742018-02-14 09:39:45 -0800266 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
267
Sedat Dilek3347c8a2020-07-03 16:32:06 +0200268 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
Dave Watsone5b954e2018-08-15 10:29:42 -0700269 movdqu HashKey(%arg2), %xmm13
Dave Watsonc594c542018-02-14 09:39:36 -0800270
Dave Watsonfb8986e2018-02-14 09:40:47 -0800271 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
272 %xmm4, %xmm5, %xmm6
Dave Watson7af964c2018-02-14 09:38:45 -0800273.endm
274
Dave Watsonba458332018-02-14 09:39:10 -0800275# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
276# struct has been initialized by GCM_INIT.
277# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
278# Clobbers rax, r10-r13, and xmm0-xmm15
279.macro GCM_ENC_DEC operation
Dave Watson96604742018-02-14 09:39:45 -0800280 movdqu AadHash(%arg2), %xmm8
Dave Watson1476db22018-02-14 09:40:10 -0800281 movdqu HashKey(%arg2), %xmm13
Dave Watson96604742018-02-14 09:39:45 -0800282 add %arg5, InLen(%arg2)
Dave Watsonae952c52018-02-14 09:40:19 -0800283
Jan Beulicha7bea832018-07-02 04:31:54 -0600284 xor %r11d, %r11d # initialise the data pointer offset as zero
Dave Watsonae952c52018-02-14 09:40:19 -0800285 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
286
287 sub %r11, %arg5 # sub partial block data used
Dave Watson96604742018-02-14 09:39:45 -0800288 mov %arg5, %r13 # save the number of bytes
Dave Watsonae952c52018-02-14 09:40:19 -0800289
Dave Watson96604742018-02-14 09:39:45 -0800290 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
291 mov %r13, %r12
Dave Watsonba458332018-02-14 09:39:10 -0800292 # Encrypt/Decrypt first few blocks
293
294 and $(3<<4), %r12
295 jz _initial_num_blocks_is_0_\@
296 cmp $(2<<4), %r12
297 jb _initial_num_blocks_is_1_\@
298 je _initial_num_blocks_is_2_\@
299_initial_num_blocks_is_3_\@:
300 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
301%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
302 sub $48, %r13
303 jmp _initial_blocks_\@
304_initial_num_blocks_is_2_\@:
305 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
307 sub $32, %r13
308 jmp _initial_blocks_\@
309_initial_num_blocks_is_1_\@:
310 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
312 sub $16, %r13
313 jmp _initial_blocks_\@
314_initial_num_blocks_is_0_\@:
315 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
317_initial_blocks_\@:
318
319 # Main loop - Encrypt/Decrypt remaining blocks
320
Uros Bizjak032d0492020-11-27 10:44:52 +0100321 test %r13, %r13
Dave Watsonba458332018-02-14 09:39:10 -0800322 je _zero_cipher_left_\@
323 sub $64, %r13
324 je _four_cipher_left_\@
325_crypt_by_4_\@:
326 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
327 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
328 %xmm7, %xmm8, enc
329 add $64, %r11
330 sub $64, %r13
331 jne _crypt_by_4_\@
332_four_cipher_left_\@:
333 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
334%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
335_zero_cipher_left_\@:
Dave Watson96604742018-02-14 09:39:45 -0800336 movdqu %xmm8, AadHash(%arg2)
337 movdqu %xmm0, CurCount(%arg2)
338
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800339 mov %arg5, %r13
340 and $15, %r13 # %r13 = arg5 (mod 16)
Dave Watsonba458332018-02-14 09:39:10 -0800341 je _multiple_of_16_bytes_\@
342
Dave Watson96604742018-02-14 09:39:45 -0800343 mov %r13, PBlockLen(%arg2)
344
Dave Watsonba458332018-02-14 09:39:10 -0800345 # Handle the last <16 Byte block separately
346 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
Dave Watson96604742018-02-14 09:39:45 -0800347 movdqu %xmm0, CurCount(%arg2)
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800348 movdqa SHUF_MASK(%rip), %xmm10
Uros Bizjakd7866e52020-07-09 17:08:57 +0200349 pshufb %xmm10, %xmm0
Dave Watsonba458332018-02-14 09:39:10 -0800350
351 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
Dave Watson96604742018-02-14 09:39:45 -0800352 movdqu %xmm0, PBlockEncKey(%arg2)
Dave Watsonba458332018-02-14 09:39:10 -0800353
Dave Watson933d6ae2018-02-14 09:40:31 -0800354 cmp $16, %arg5
355 jge _large_enough_update_\@
356
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800357 lea (%arg4,%r11,1), %r10
Dave Watsonba458332018-02-14 09:39:10 -0800358 mov %r13, %r12
359 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
Dave Watson933d6ae2018-02-14 09:40:31 -0800360 jmp _data_read_\@
Dave Watsonba458332018-02-14 09:39:10 -0800361
Dave Watson933d6ae2018-02-14 09:40:31 -0800362_large_enough_update_\@:
363 sub $16, %r11
364 add %r13, %r11
365
366 # receive the last <16 Byte block
367 movdqu (%arg4, %r11, 1), %xmm1
368
369 sub %r13, %r11
370 add $16, %r11
371
372 lea SHIFT_MASK+16(%rip), %r12
373 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
374 # (r13 is the number of bytes in plaintext mod 16)
375 sub %r13, %r12
376 # get the appropriate shuffle mask
377 movdqu (%r12), %xmm2
378 # shift right 16-r13 bytes
Uros Bizjakd7866e52020-07-09 17:08:57 +0200379 pshufb %xmm2, %xmm1
Dave Watson933d6ae2018-02-14 09:40:31 -0800380
381_data_read_\@:
Dave Watsonba458332018-02-14 09:39:10 -0800382 lea ALL_F+16(%rip), %r12
383 sub %r13, %r12
Dave Watson933d6ae2018-02-14 09:40:31 -0800384
Dave Watsonba458332018-02-14 09:39:10 -0800385.ifc \operation, dec
386 movdqa %xmm1, %xmm2
387.endif
388 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
389 movdqu (%r12), %xmm1
390 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
391 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
392.ifc \operation, dec
393 pand %xmm1, %xmm2
394 movdqa SHUF_MASK(%rip), %xmm10
Uros Bizjakd7866e52020-07-09 17:08:57 +0200395 pshufb %xmm10 ,%xmm2
Dave Watsonba458332018-02-14 09:39:10 -0800396
397 pxor %xmm2, %xmm8
398.else
399 movdqa SHUF_MASK(%rip), %xmm10
Uros Bizjakd7866e52020-07-09 17:08:57 +0200400 pshufb %xmm10,%xmm0
Dave Watsonba458332018-02-14 09:39:10 -0800401
402 pxor %xmm0, %xmm8
403.endif
404
Dave Watson96604742018-02-14 09:39:45 -0800405 movdqu %xmm8, AadHash(%arg2)
Dave Watsonba458332018-02-14 09:39:10 -0800406.ifc \operation, enc
407 # GHASH computation for the last <16 byte block
408 movdqa SHUF_MASK(%rip), %xmm10
409 # shuffle xmm0 back to output as ciphertext
Uros Bizjakd7866e52020-07-09 17:08:57 +0200410 pshufb %xmm10, %xmm0
Dave Watsonba458332018-02-14 09:39:10 -0800411.endif
412
413 # Output %r13 bytes
Uros Bizjakd7866e52020-07-09 17:08:57 +0200414 movq %xmm0, %rax
Dave Watsonba458332018-02-14 09:39:10 -0800415 cmp $8, %r13
416 jle _less_than_8_bytes_left_\@
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800417 mov %rax, (%arg3 , %r11, 1)
Dave Watsonba458332018-02-14 09:39:10 -0800418 add $8, %r11
419 psrldq $8, %xmm0
Uros Bizjakd7866e52020-07-09 17:08:57 +0200420 movq %xmm0, %rax
Dave Watsonba458332018-02-14 09:39:10 -0800421 sub $8, %r13
422_less_than_8_bytes_left_\@:
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800423 mov %al, (%arg3, %r11, 1)
Dave Watsonba458332018-02-14 09:39:10 -0800424 add $1, %r11
425 shr $8, %rax
426 sub $1, %r13
427 jne _less_than_8_bytes_left_\@
428_multiple_of_16_bytes_\@:
429.endm
430
Dave Watsonadcadab2018-02-14 09:38:57 -0800431# GCM_COMPLETE Finishes update of tag of last partial block
432# Output: Authorization Tag (AUTH_TAG)
433# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
Dave Watsonfb8986e2018-02-14 09:40:47 -0800434.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
Dave Watson96604742018-02-14 09:39:45 -0800435 movdqu AadHash(%arg2), %xmm8
Dave Watson1476db22018-02-14 09:40:10 -0800436 movdqu HashKey(%arg2), %xmm13
Dave Watsone2e34b02018-02-14 09:39:55 -0800437
438 mov PBlockLen(%arg2), %r12
439
Uros Bizjak032d0492020-11-27 10:44:52 +0100440 test %r12, %r12
Dave Watsone2e34b02018-02-14 09:39:55 -0800441 je _partial_done\@
442
443 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
444
445_partial_done\@:
Dave Watson96604742018-02-14 09:39:45 -0800446 mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
Dave Watsonadcadab2018-02-14 09:38:57 -0800447 shl $3, %r12 # convert into number of bits
448 movd %r12d, %xmm15 # len(A) in %xmm15
Dave Watson96604742018-02-14 09:39:45 -0800449 mov InLen(%arg2), %r12
450 shl $3, %r12 # len(C) in bits (*128)
Uros Bizjakd7866e52020-07-09 17:08:57 +0200451 movq %r12, %xmm1
Dave Watson96604742018-02-14 09:39:45 -0800452
Dave Watsonadcadab2018-02-14 09:38:57 -0800453 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
454 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
455 pxor %xmm15, %xmm8
456 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
457 # final GHASH computation
458 movdqa SHUF_MASK(%rip), %xmm10
Uros Bizjakd7866e52020-07-09 17:08:57 +0200459 pshufb %xmm10, %xmm8
Dave Watsonadcadab2018-02-14 09:38:57 -0800460
Dave Watson96604742018-02-14 09:39:45 -0800461 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
Dave Watsonadcadab2018-02-14 09:38:57 -0800462 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
463 pxor %xmm8, %xmm0
464_return_T_\@:
Dave Watsonfb8986e2018-02-14 09:40:47 -0800465 mov \AUTHTAG, %r10 # %r10 = authTag
466 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
Dave Watsonadcadab2018-02-14 09:38:57 -0800467 cmp $16, %r11
468 je _T_16_\@
469 cmp $8, %r11
470 jl _T_4_\@
471_T_8_\@:
Uros Bizjakd7866e52020-07-09 17:08:57 +0200472 movq %xmm0, %rax
Dave Watsonadcadab2018-02-14 09:38:57 -0800473 mov %rax, (%r10)
474 add $8, %r10
475 sub $8, %r11
476 psrldq $8, %xmm0
Uros Bizjak032d0492020-11-27 10:44:52 +0100477 test %r11, %r11
Dave Watsonadcadab2018-02-14 09:38:57 -0800478 je _return_T_done_\@
479_T_4_\@:
480 movd %xmm0, %eax
481 mov %eax, (%r10)
482 add $4, %r10
483 sub $4, %r11
484 psrldq $4, %xmm0
Uros Bizjak032d0492020-11-27 10:44:52 +0100485 test %r11, %r11
Dave Watsonadcadab2018-02-14 09:38:57 -0800486 je _return_T_done_\@
487_T_123_\@:
488 movd %xmm0, %eax
489 cmp $2, %r11
490 jl _T_1_\@
491 mov %ax, (%r10)
492 cmp $2, %r11
493 je _return_T_done_\@
494 add $2, %r10
495 sar $16, %eax
496_T_1_\@:
497 mov %al, (%r10)
498 jmp _return_T_done_\@
499_T_16_\@:
500 movdqu %xmm0, (%r10)
501_return_T_done_\@:
502.endm
503
Mathias Krause559ad0f2010-11-29 08:35:39 +0800504#ifdef __x86_64__
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400505/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
506*
507*
508* Input: A and B (128-bits each, bit-reflected)
509* Output: C = A*B*x mod poly, (i.e. >>1 )
510* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
511* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
512*
513*/
514.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
515 movdqa \GH, \TMP1
516 pshufd $78, \GH, \TMP2
517 pshufd $78, \HK, \TMP3
518 pxor \GH, \TMP2 # TMP2 = a1+a0
519 pxor \HK, \TMP3 # TMP3 = b1+b0
Uros Bizjakd7866e52020-07-09 17:08:57 +0200520 pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
521 pclmulqdq $0x00, \HK, \GH # GH = a0*b0
522 pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400523 pxor \GH, \TMP2
524 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
525 movdqa \TMP2, \TMP3
526 pslldq $8, \TMP3 # left shift TMP3 2 DWs
527 psrldq $8, \TMP2 # right shift TMP2 2 DWs
528 pxor \TMP3, \GH
529 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
530
531 # first phase of the reduction
532
533 movdqa \GH, \TMP2
534 movdqa \GH, \TMP3
535 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
536 # in in order to perform
537 # independent shifts
538 pslld $31, \TMP2 # packed right shift <<31
539 pslld $30, \TMP3 # packed right shift <<30
540 pslld $25, \TMP4 # packed right shift <<25
541 pxor \TMP3, \TMP2 # xor the shifted versions
542 pxor \TMP4, \TMP2
543 movdqa \TMP2, \TMP5
544 psrldq $4, \TMP5 # right shift TMP5 1 DW
545 pslldq $12, \TMP2 # left shift TMP2 3 DWs
546 pxor \TMP2, \GH
547
548 # second phase of the reduction
549
550 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
551 # in in order to perform
552 # independent shifts
553 movdqa \GH,\TMP3
554 movdqa \GH,\TMP4
555 psrld $1,\TMP2 # packed left shift >>1
556 psrld $2,\TMP3 # packed left shift >>2
557 psrld $7,\TMP4 # packed left shift >>7
558 pxor \TMP3,\TMP2 # xor the shifted versions
559 pxor \TMP4,\TMP2
560 pxor \TMP5, \TMP2
561 pxor \TMP2, \GH
562 pxor \TMP1, \GH # result is in TMP1
563.endm
564
Junaid Shahidb20209c2017-12-20 17:08:37 -0800565# Reads DLEN bytes starting at DPTR and stores in XMMDst
566# where 0 < DLEN < 16
567# Clobbers %rax, DLEN and XMM1
568.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
569 cmp $8, \DLEN
570 jl _read_lt8_\@
571 mov (\DPTR), %rax
Uros Bizjakd7866e52020-07-09 17:08:57 +0200572 movq %rax, \XMMDst
Junaid Shahidb20209c2017-12-20 17:08:37 -0800573 sub $8, \DLEN
574 jz _done_read_partial_block_\@
575 xor %eax, %eax
576_read_next_byte_\@:
577 shl $8, %rax
578 mov 7(\DPTR, \DLEN, 1), %al
579 dec \DLEN
580 jnz _read_next_byte_\@
Uros Bizjakd7866e52020-07-09 17:08:57 +0200581 movq %rax, \XMM1
Junaid Shahidb20209c2017-12-20 17:08:37 -0800582 pslldq $8, \XMM1
583 por \XMM1, \XMMDst
584 jmp _done_read_partial_block_\@
585_read_lt8_\@:
586 xor %eax, %eax
587_read_next_byte_lt8_\@:
588 shl $8, %rax
589 mov -1(\DPTR, \DLEN, 1), %al
590 dec \DLEN
591 jnz _read_next_byte_lt8_\@
Uros Bizjakd7866e52020-07-09 17:08:57 +0200592 movq %rax, \XMMDst
Junaid Shahidb20209c2017-12-20 17:08:37 -0800593_done_read_partial_block_\@:
594.endm
595
Dave Watsonc594c542018-02-14 09:39:36 -0800596# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
597# clobbers r10-11, xmm14
Dave Watsonfb8986e2018-02-14 09:40:47 -0800598.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
Dave Watsonc594c542018-02-14 09:39:36 -0800599 TMP6 TMP7
600 MOVADQ SHUF_MASK(%rip), %xmm14
Dave Watsonfb8986e2018-02-14 09:40:47 -0800601 mov \AAD, %r10 # %r10 = AAD
602 mov \AADLEN, %r11 # %r11 = aadLen
Dave Watsonc594c542018-02-14 09:39:36 -0800603 pxor \TMP7, \TMP7
604 pxor \TMP6, \TMP6
605
606 cmp $16, %r11
607 jl _get_AAD_rest\@
608_get_AAD_blocks\@:
609 movdqu (%r10), \TMP7
Uros Bizjakd7866e52020-07-09 17:08:57 +0200610 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
Dave Watsonc594c542018-02-14 09:39:36 -0800611 pxor \TMP7, \TMP6
612 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
613 add $16, %r10
614 sub $16, %r11
615 cmp $16, %r11
616 jge _get_AAD_blocks\@
617
618 movdqu \TMP6, \TMP7
619
620 /* read the last <16B of AAD */
621_get_AAD_rest\@:
Uros Bizjak032d0492020-11-27 10:44:52 +0100622 test %r11, %r11
Dave Watsonc594c542018-02-14 09:39:36 -0800623 je _get_AAD_done\@
624
625 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
Uros Bizjakd7866e52020-07-09 17:08:57 +0200626 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
Dave Watsonc594c542018-02-14 09:39:36 -0800627 pxor \TMP6, \TMP7
628 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
629 movdqu \TMP7, \TMP6
630
631_get_AAD_done\@:
632 movdqu \TMP6, AadHash(%arg2)
633.endm
634
Dave Watsonae952c52018-02-14 09:40:19 -0800635# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
636# between update calls.
637# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
638# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
639# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
640.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
641 AAD_HASH operation
642 mov PBlockLen(%arg2), %r13
Uros Bizjak032d0492020-11-27 10:44:52 +0100643 test %r13, %r13
Dave Watsonae952c52018-02-14 09:40:19 -0800644 je _partial_block_done_\@ # Leave Macro if no partial blocks
645 # Read in input data without over reading
646 cmp $16, \PLAIN_CYPH_LEN
647 jl _fewer_than_16_bytes_\@
648 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
649 jmp _data_read_\@
650
651_fewer_than_16_bytes_\@:
652 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
653 mov \PLAIN_CYPH_LEN, %r12
654 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
655
656 mov PBlockLen(%arg2), %r13
657
658_data_read_\@: # Finished reading in data
659
660 movdqu PBlockEncKey(%arg2), %xmm9
661 movdqu HashKey(%arg2), %xmm13
662
663 lea SHIFT_MASK(%rip), %r12
664
665 # adjust the shuffle mask pointer to be able to shift r13 bytes
666 # r16-r13 is the number of bytes in plaintext mod 16)
667 add %r13, %r12
668 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
Uros Bizjakd7866e52020-07-09 17:08:57 +0200669 pshufb %xmm2, %xmm9 # shift right r13 bytes
Dave Watsonae952c52018-02-14 09:40:19 -0800670
671.ifc \operation, dec
672 movdqa %xmm1, %xmm3
673 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
674
675 mov \PLAIN_CYPH_LEN, %r10
676 add %r13, %r10
677 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
678 sub $16, %r10
679 # Determine if if partial block is not being filled and
680 # shift mask accordingly
681 jge _no_extra_mask_1_\@
682 sub %r10, %r12
683_no_extra_mask_1_\@:
684
685 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
686 # get the appropriate mask to mask out bottom r13 bytes of xmm9
687 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
688
689 pand %xmm1, %xmm3
690 movdqa SHUF_MASK(%rip), %xmm10
Uros Bizjakd7866e52020-07-09 17:08:57 +0200691 pshufb %xmm10, %xmm3
692 pshufb %xmm2, %xmm3
Dave Watsonae952c52018-02-14 09:40:19 -0800693 pxor %xmm3, \AAD_HASH
694
Uros Bizjak032d0492020-11-27 10:44:52 +0100695 test %r10, %r10
Dave Watsonae952c52018-02-14 09:40:19 -0800696 jl _partial_incomplete_1_\@
697
698 # GHASH computation for the last <16 Byte block
699 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
Jan Beulicha7bea832018-07-02 04:31:54 -0600700 xor %eax, %eax
Dave Watsonae952c52018-02-14 09:40:19 -0800701
702 mov %rax, PBlockLen(%arg2)
703 jmp _dec_done_\@
704_partial_incomplete_1_\@:
705 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
706_dec_done_\@:
707 movdqu \AAD_HASH, AadHash(%arg2)
708.else
709 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
710
711 mov \PLAIN_CYPH_LEN, %r10
712 add %r13, %r10
713 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
714 sub $16, %r10
715 # Determine if if partial block is not being filled and
716 # shift mask accordingly
717 jge _no_extra_mask_2_\@
718 sub %r10, %r12
719_no_extra_mask_2_\@:
720
721 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
722 # get the appropriate mask to mask out bottom r13 bytes of xmm9
723 pand %xmm1, %xmm9
724
725 movdqa SHUF_MASK(%rip), %xmm1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200726 pshufb %xmm1, %xmm9
727 pshufb %xmm2, %xmm9
Dave Watsonae952c52018-02-14 09:40:19 -0800728 pxor %xmm9, \AAD_HASH
729
Uros Bizjak032d0492020-11-27 10:44:52 +0100730 test %r10, %r10
Dave Watsonae952c52018-02-14 09:40:19 -0800731 jl _partial_incomplete_2_\@
732
733 # GHASH computation for the last <16 Byte block
734 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
Jan Beulicha7bea832018-07-02 04:31:54 -0600735 xor %eax, %eax
Dave Watsonae952c52018-02-14 09:40:19 -0800736
737 mov %rax, PBlockLen(%arg2)
738 jmp _encode_done_\@
739_partial_incomplete_2_\@:
740 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
741_encode_done_\@:
742 movdqu \AAD_HASH, AadHash(%arg2)
743
744 movdqa SHUF_MASK(%rip), %xmm10
745 # shuffle xmm9 back to output as ciphertext
Uros Bizjakd7866e52020-07-09 17:08:57 +0200746 pshufb %xmm10, %xmm9
747 pshufb %xmm2, %xmm9
Dave Watsonae952c52018-02-14 09:40:19 -0800748.endif
749 # output encrypted Bytes
Uros Bizjak032d0492020-11-27 10:44:52 +0100750 test %r10, %r10
Dave Watsonae952c52018-02-14 09:40:19 -0800751 jl _partial_fill_\@
752 mov %r13, %r12
753 mov $16, %r13
754 # Set r13 to be the number of bytes to write out
755 sub %r12, %r13
756 jmp _count_set_\@
757_partial_fill_\@:
758 mov \PLAIN_CYPH_LEN, %r13
759_count_set_\@:
760 movdqa %xmm9, %xmm0
Uros Bizjakd7866e52020-07-09 17:08:57 +0200761 movq %xmm0, %rax
Dave Watsonae952c52018-02-14 09:40:19 -0800762 cmp $8, %r13
763 jle _less_than_8_bytes_left_\@
764
765 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
766 add $8, \DATA_OFFSET
767 psrldq $8, %xmm0
Uros Bizjakd7866e52020-07-09 17:08:57 +0200768 movq %xmm0, %rax
Dave Watsonae952c52018-02-14 09:40:19 -0800769 sub $8, %r13
770_less_than_8_bytes_left_\@:
771 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
772 add $1, \DATA_OFFSET
773 shr $8, %rax
774 sub $1, %r13
775 jne _less_than_8_bytes_left_\@
776_partial_block_done_\@:
777.endm # PARTIAL_BLOCK
778
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400779/*
780* if a = number of total plaintext bytes
781* b = floor(a/16)
782* num_initial_blocks = b mod 4
783* encrypt the initial num_initial_blocks blocks and apply ghash on
784* the ciphertext
785* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
786* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800787* arg1, %arg2, %arg3 are used as a pointer only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400788*/
789
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400790
Dave Watsone1fd3162018-02-14 09:38:12 -0800791.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800792 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Dave Watson96604742018-02-14 09:39:45 -0800793 MOVADQ SHUF_MASK(%rip), %xmm14 # %xmm14 = byte-swap mask, live for the whole macro
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200794
Dave Watsonc594c542018-02-14 09:39:36 -0800795 movdqu AadHash(%arg2), %xmm\i # %xmm\i = current GHASH accumulator (AadHash)
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200796
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200797 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800798
Dave Watson96604742018-02-14 09:39:45 -0800799 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800800
801.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800802
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500803 MOVADQ ONE(%RIP),\TMP1 # constant 1, used to step the counter
804 MOVADQ 0(%arg1),\TMP2 # round-0 key
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800805.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500806 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800807.ifc \operation, dec
808 movdqa \XMM0, %xmm\index
809.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500810 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800811.endif
Uros Bizjakd7866e52020-07-09 17:08:57 +0200812 pshufb %xmm14, %xmm\index # perform a 16 byte swap
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500813 pxor \TMP2, %xmm\index # AddRoundKey (round 0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800814.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500815 lea 0x10(%arg1),%r10 # %r10 -> round-1 key of the schedule
816 mov keysize,%eax
817 shr $2,%eax # 128->4, 192->6, 256->8
818 add $5,%eax # 128->9, 192->11, 256->13
819
Dave Watsone1fd3162018-02-14 09:38:12 -0800820aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500821 MOVADQ (%r10),\TMP1
822.irpc index, \i_seq
Uros Bizjakd7866e52020-07-09 17:08:57 +0200823 aesenc \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800824.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500825 add $16,%r10
826 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800827 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500828
829 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800830.irpc index, \i_seq
Uros Bizjakd7866e52020-07-09 17:08:57 +0200831 aesenclast \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800832.endr
833.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800834 movdqu (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800835 pxor \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800836 movdqu %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800837 # write back plaintext/ciphertext for num_initial_blocks
838 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800839
840.ifc \operation, dec
841 movdqa \TMP1, %xmm\index # dec: GHASH input is the ciphertext just read
842.endif
Uros Bizjakd7866e52020-07-09 17:08:57 +0200843 pshufb %xmm14, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800844
845 # prepare plaintext/ciphertext for GHASH computation
846.endr
847.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200848
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800849 # apply GHASH on num_initial_blocks blocks
850
851.if \i == 5
852 pxor %xmm5, %xmm6
853 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854 pxor %xmm6, %xmm7
855 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856 pxor %xmm7, %xmm8
857 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858.elseif \i == 6
859 pxor %xmm6, %xmm7
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 pxor %xmm7, %xmm8
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863.elseif \i == 7
864 pxor %xmm7, %xmm8
865 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866.endif
867 cmp $64, %r13 # fewer than 4 full blocks left?
Dave Watsone1fd3162018-02-14 09:38:12 -0800868 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800869 # no need for precomputed values
870/*
871*
872* Precomputations for HashKey parallel with encryption of first 4 blocks.
873* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
874*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500875 MOVADQ ONE(%RIP),\TMP1
876 paddd \TMP1, \XMM0 # INCR Y0
877 MOVADQ \XMM0, \XMM1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200878 pshufb %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800879
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500880 paddd \TMP1, \XMM0 # INCR Y0
881 MOVADQ \XMM0, \XMM2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200882 pshufb %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800883
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500884 paddd \TMP1, \XMM0 # INCR Y0
885 MOVADQ \XMM0, \XMM3
Uros Bizjakd7866e52020-07-09 17:08:57 +0200886 pshufb %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800887
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500888 paddd \TMP1, \XMM0 # INCR Y0
889 MOVADQ \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +0200890 pshufb %xmm14, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800891
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500892 MOVADQ 0(%arg1),\TMP1 # round-0 key for all 4 counter blocks
893 pxor \TMP1, \XMM1
894 pxor \TMP1, \XMM2
895 pxor \TMP1, \XMM3
896 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800897.irpc index, 1234 # do 4 rounds
898 movaps 0x10*\index(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200899 aesenc \TMP1, \XMM1
900 aesenc \TMP1, \XMM2
901 aesenc \TMP1, \XMM3
902 aesenc \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800903.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800904.irpc index, 56789 # do next 5 rounds
905 movaps 0x10*\index(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200906 aesenc \TMP1, \XMM1
907 aesenc \TMP1, \XMM2
908 aesenc \TMP1, \XMM3
909 aesenc \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800910.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500911 lea 0xa0(%arg1),%r10 # %r10 -> key beyond the common 10 rounds
912 mov keysize,%eax
913 shr $2,%eax # 128->4, 192->6, 256->8
914 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800915 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500916
Dave Watsone1fd3162018-02-14 09:38:12 -0800917aes_loop_pre_\@: # extra rounds for AES-192/256
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500918 MOVADQ (%r10),\TMP2
919.irpc index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +0200920 aesenc \TMP2, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500921.endr
922 add $16,%r10
923 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800924 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500925
Dave Watsone1fd3162018-02-14 09:38:12 -0800926aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500927 MOVADQ (%r10), \TMP2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200928 aesenclast \TMP2, \XMM1
929 aesenclast \TMP2, \XMM2
930 aesenclast \TMP2, \XMM3
931 aesenclast \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800932 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800933 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800934.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800935 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800936 movdqa \TMP1, \XMM1 # dec: keep ciphertext for GHASH
937.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800938 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800939 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800940.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800941 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800942 movdqa \TMP1, \XMM2 # dec: keep ciphertext for GHASH
943.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800944 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800945 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800946.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800947 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800948 movdqa \TMP1, \XMM3 # dec: keep ciphertext for GHASH
949.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800950 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800951 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800952.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800953 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800954 movdqa \TMP1, \XMM4 # dec: keep ciphertext for GHASH
955.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800956 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
957 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
958 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
959 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800960.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800961
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400962 add $64, %r11
Uros Bizjakd7866e52020-07-09 17:08:57 +0200963 pshufb %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400964 pxor \XMMDst, \XMM1
965# combine GHASHed value with the corresponding ciphertext
Uros Bizjakd7866e52020-07-09 17:08:57 +0200966 pshufb %xmm14, \XMM2 # perform a 16 byte swap
967 pshufb %xmm14, \XMM3 # perform a 16 byte swap
968 pshufb %xmm14, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800969
Dave Watsone1fd3162018-02-14 09:38:12 -0800970_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800971
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400972.endm
973
974/*
975* encrypt 4 blocks at a time
976* ghash the 4 previously encrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800977* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400978* %r11 is the data offset value
979*/
Sedat Dilek3347c8a2020-07-03 16:32:06 +0200980.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400981TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
982
983 movdqa \XMM1, \XMM5 # XMM5-8 = previous 4 blocks, to be GHASHed
984 movdqa \XMM2, \XMM6
985 movdqa \XMM3, \XMM7
986 movdqa \XMM4, \XMM8
987
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800988 movdqa SHUF_MASK(%rip), %xmm15
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400989 # multiply TMP5 * HashKey using karatsuba
990
991 movdqa \XMM5, \TMP4
992 pshufd $78, \XMM5, \TMP6 # swap 64-bit halves for Karatsuba middle term
993 pxor \XMM5, \TMP6
994 paddd ONE(%rip), \XMM0 # INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -0700995 movdqu HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +0200996 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400997 movdqa \XMM0, \XMM1
998 paddd ONE(%rip), \XMM0 # INCR CNT
999 movdqa \XMM0, \XMM2
1000 paddd ONE(%rip), \XMM0 # INCR CNT
1001 movdqa \XMM0, \XMM3
1002 paddd ONE(%rip), \XMM0 # INCR CNT
1003 movdqa \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001004 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1005 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1006 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1007 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1008 pshufb %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001009
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001010 pxor (%arg1), \XMM1 # AddRoundKey (round 0) on all 4 blocks
1011 pxor (%arg1), \XMM2
1012 pxor (%arg1), \XMM3
1013 pxor (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001014 movdqu HashKey_4_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001015 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001016 movaps 0x10(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001017 aesenc \TMP1, \XMM1 # Round 1
1018 aesenc \TMP1, \XMM2
1019 aesenc \TMP1, \XMM3
1020 aesenc \TMP1, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001021 movaps 0x20(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001022 aesenc \TMP1, \XMM1 # Round 2
1023 aesenc \TMP1, \XMM2
1024 aesenc \TMP1, \XMM3
1025 aesenc \TMP1, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001026 movdqa \XMM6, \TMP1
1027 pshufd $78, \XMM6, \TMP2
1028 pxor \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001029 movdqu HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001030 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001031 movaps 0x30(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001032 aesenc \TMP3, \XMM1 # Round 3
1033 aesenc \TMP3, \XMM2
1034 aesenc \TMP3, \XMM3
1035 aesenc \TMP3, \XMM4
1036 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001037 movaps 0x40(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001038 aesenc \TMP3, \XMM1 # Round 4
1039 aesenc \TMP3, \XMM2
1040 aesenc \TMP3, \XMM3
1041 aesenc \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001042 movdqu HashKey_3_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001043 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001044 movaps 0x50(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001045 aesenc \TMP3, \XMM1 # Round 5
1046 aesenc \TMP3, \XMM2
1047 aesenc \TMP3, \XMM3
1048 aesenc \TMP3, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001049 pxor \TMP1, \TMP4
1050# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051 pxor \XMM6, \XMM5
1052 pxor \TMP2, \TMP6
1053 movdqa \XMM7, \TMP1
1054 pshufd $78, \XMM7, \TMP2
1055 pxor \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001056 movdqu HashKey_2(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001057
1058 # Multiply TMP5 * HashKey using karatsuba
1059
Uros Bizjakd7866e52020-07-09 17:08:57 +02001060 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001061 movaps 0x60(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001062 aesenc \TMP3, \XMM1 # Round 6
1063 aesenc \TMP3, \XMM2
1064 aesenc \TMP3, \XMM3
1065 aesenc \TMP3, \XMM4
1066 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001067 movaps 0x70(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001068 aesenc \TMP3, \XMM1 # Round 7
1069 aesenc \TMP3, \XMM2
1070 aesenc \TMP3, \XMM3
1071 aesenc \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001072 movdqu HashKey_2_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001073 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001074 movaps 0x80(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001075 aesenc \TMP3, \XMM1 # Round 8
1076 aesenc \TMP3, \XMM2
1077 aesenc \TMP3, \XMM3
1078 aesenc \TMP3, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001079 pxor \TMP1, \TMP4
1080# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081 pxor \XMM7, \XMM5
1082 pxor \TMP2, \TMP6
1083
1084 # Multiply XMM8 * HashKey
1085 # XMM8 and TMP5 hold the values for the two operands
1086
1087 movdqa \XMM8, \TMP1
1088 pshufd $78, \XMM8, \TMP2
1089 pxor \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001090 movdqu HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001091 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001092 movaps 0x90(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001093 aesenc \TMP3, \XMM1 # Round 9
1094 aesenc \TMP3, \XMM2
1095 aesenc \TMP3, \XMM3
1096 aesenc \TMP3, \XMM4
1097 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001098 lea 0xa0(%arg1),%r10 # %r10 -> key beyond the common 10 rounds
1099 mov keysize,%eax
1100 shr $2,%eax # 128->4, 192->6, 256->8
1101 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001102 jz aes_loop_par_enc_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001103
Dave Watsonfb8986e2018-02-14 09:40:47 -08001104aes_loop_par_enc\@: # extra rounds for AES-192/256
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001105 MOVADQ (%r10),\TMP3
1106.irpc index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +02001107 aesenc \TMP3, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001108.endr
1109 add $16,%r10
1110 sub $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001111 jnz aes_loop_par_enc\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001112
Dave Watsonfb8986e2018-02-14 09:40:47 -08001113aes_loop_par_enc_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001114 MOVADQ (%r10), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001115 aesenclast \TMP3, \XMM1 # Round 10
1116 aesenclast \TMP3, \XMM2
1117 aesenclast \TMP3, \XMM3
1118 aesenclast \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001119 movdqu HashKey_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001120 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001121 movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001122 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001123 movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001124 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001125 movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001126 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001127 movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001128 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001129 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1130 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
Uros Bizjakd7866e52020-07-09 17:08:57 +02001133 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1134 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1135 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1136 pshufb %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001137
1138 pxor \TMP4, \TMP1
1139 pxor \XMM8, \XMM5
1140 pxor \TMP6, \TMP2
1141 pxor \TMP1, \TMP2
1142 pxor \XMM5, \TMP2
1143 movdqa \TMP2, \TMP3
1144 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1145 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1146 pxor \TMP3, \XMM5
1147 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1148
1149 # first phase of reduction
1150
1151 movdqa \XMM5, \TMP2
1152 movdqa \XMM5, \TMP3
1153 movdqa \XMM5, \TMP4
1154# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155 pslld $31, \TMP2 # packed left shift << 31
1156 pslld $30, \TMP3 # packed left shift << 30
1157 pslld $25, \TMP4 # packed left shift << 25
1158 pxor \TMP3, \TMP2 # xor the shifted versions
1159 pxor \TMP4, \TMP2
1160 movdqa \TMP2, \TMP5
1161 psrldq $4, \TMP5 # right shift T5 1 DW
1162 pslldq $12, \TMP2 # left shift T2 3 DWs
1163 pxor \TMP2, \XMM5
1164
1165 # second phase of reduction
1166
1167 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168 movdqa \XMM5,\TMP3
1169 movdqa \XMM5,\TMP4
1170 psrld $1, \TMP2 # packed right shift >> 1
1171 psrld $2, \TMP3 # packed right shift >> 2
1172 psrld $7, \TMP4 # packed right shift >> 7
1173 pxor \TMP3,\TMP2 # xor the shifted versions
1174 pxor \TMP4,\TMP2
1175 pxor \TMP5, \TMP2
1176 pxor \TMP2, \XMM5
1177 pxor \TMP1, \XMM5 # reduced GHASH result accumulated in XMM5
1178
1179 pxor \XMM5, \XMM1 # fold result into XMM1 for the next iteration
1180.endm
1181
1182/*
1183* decrypt 4 blocks at a time
1184* ghash the 4 previously decrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001185* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001186* %r11 is the data offset value
1187*/
Sedat Dilek3347c8a2020-07-03 16:32:06 +02001188.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001189TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190
1191 movdqa \XMM1, \XMM5 # XMM5-8 = previous 4 blocks, to be GHASHed
1192 movdqa \XMM2, \XMM6
1193 movdqa \XMM3, \XMM7
1194 movdqa \XMM4, \XMM8
1195
1196 movdqa SHUF_MASK(%rip), %xmm15
1197 # multiply TMP5 * HashKey using karatsuba
1198
1199 movdqa \XMM5, \TMP4
1200 pshufd $78, \XMM5, \TMP6 # swap 64-bit halves for Karatsuba middle term
1201 pxor \XMM5, \TMP6
1202 paddd ONE(%rip), \XMM0 # INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -07001203 movdqu HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001204 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001205 movdqa \XMM0, \XMM1
1206 paddd ONE(%rip), \XMM0 # INCR CNT
1207 movdqa \XMM0, \XMM2
1208 paddd ONE(%rip), \XMM0 # INCR CNT
1209 movdqa \XMM0, \XMM3
1210 paddd ONE(%rip), \XMM0 # INCR CNT
1211 movdqa \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001212 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1213 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1214 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1215 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1216 pshufb %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001217
1218 pxor (%arg1), \XMM1 # AddRoundKey (round 0) on all 4 blocks
1219 pxor (%arg1), \XMM2
1220 pxor (%arg1), \XMM3
1221 pxor (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001222 movdqu HashKey_4_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001223 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001224 movaps 0x10(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001225 aesenc \TMP1, \XMM1 # Round 1
1226 aesenc \TMP1, \XMM2
1227 aesenc \TMP1, \XMM3
1228 aesenc \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001229 movaps 0x20(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001230 aesenc \TMP1, \XMM1 # Round 2
1231 aesenc \TMP1, \XMM2
1232 aesenc \TMP1, \XMM3
1233 aesenc \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001234 movdqa \XMM6, \TMP1
1235 pshufd $78, \XMM6, \TMP2
1236 pxor \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001237 movdqu HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001238 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001239 movaps 0x30(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001240 aesenc \TMP3, \XMM1 # Round 3
1241 aesenc \TMP3, \XMM2
1242 aesenc \TMP3, \XMM3
1243 aesenc \TMP3, \XMM4
1244 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001245 movaps 0x40(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001246 aesenc \TMP3, \XMM1 # Round 4
1247 aesenc \TMP3, \XMM2
1248 aesenc \TMP3, \XMM3
1249 aesenc \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001250 movdqu HashKey_3_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001251 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001252 movaps 0x50(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001253 aesenc \TMP3, \XMM1 # Round 5
1254 aesenc \TMP3, \XMM2
1255 aesenc \TMP3, \XMM3
1256 aesenc \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001257 pxor \TMP1, \TMP4
1258# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259 pxor \XMM6, \XMM5
1260 pxor \TMP2, \TMP6
1261 movdqa \XMM7, \TMP1
1262 pshufd $78, \XMM7, \TMP2
1263 pxor \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001264 movdqu HashKey_2(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001265
1266 # Multiply TMP5 * HashKey using karatsuba
1267
Uros Bizjakd7866e52020-07-09 17:08:57 +02001268 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001269 movaps 0x60(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001270 aesenc \TMP3, \XMM1 # Round 6
1271 aesenc \TMP3, \XMM2
1272 aesenc \TMP3, \XMM3
1273 aesenc \TMP3, \XMM4
1274 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001275 movaps 0x70(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001276 aesenc \TMP3, \XMM1 # Round 7
1277 aesenc \TMP3, \XMM2
1278 aesenc \TMP3, \XMM3
1279 aesenc \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001280 movdqu HashKey_2_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001281 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001282 movaps 0x80(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001283 aesenc \TMP3, \XMM1 # Round 8
1284 aesenc \TMP3, \XMM2
1285 aesenc \TMP3, \XMM3
1286 aesenc \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001287 pxor \TMP1, \TMP4
1288# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289 pxor \XMM7, \XMM5
1290 pxor \TMP2, \TMP6
1291
1292 # Multiply XMM8 * HashKey
1293 # XMM8 and TMP5 hold the values for the two operands
1294
1295 movdqa \XMM8, \TMP1
1296 pshufd $78, \XMM8, \TMP2
1297 pxor \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001298 movdqu HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001299 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001300 movaps 0x90(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001301 aesenc \TMP3, \XMM1 # Round 9
1302 aesenc \TMP3, \XMM2
1303 aesenc \TMP3, \XMM3
1304 aesenc \TMP3, \XMM4
1305 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001306 lea 0xa0(%arg1),%r10 # %r10 -> key beyond the common 10 rounds
1307 mov keysize,%eax
1308 shr $2,%eax # 128->4, 192->6, 256->8
1309 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001310 jz aes_loop_par_dec_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001311
Dave Watsonfb8986e2018-02-14 09:40:47 -08001312aes_loop_par_dec\@: # extra rounds for AES-192/256
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001313 MOVADQ (%r10),\TMP3
1314.irpc index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +02001315 aesenc \TMP3, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001316.endr
1317 add $16,%r10
1318 sub $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001319 jnz aes_loop_par_dec\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001320
Dave Watsonfb8986e2018-02-14 09:40:47 -08001321aes_loop_par_dec_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001322 MOVADQ (%r10), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001323 aesenclast \TMP3, \XMM1 # last round
1324 aesenclast \TMP3, \XMM2
1325 aesenclast \TMP3, \XMM3
1326 aesenclast \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001327 movdqu HashKey_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001328 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001329 movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001330 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001331 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001332 movdqa \TMP3, \XMM1 # keep ciphertext as next GHASH input
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001333 movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001334 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001335 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001336 movdqa \TMP3, \XMM2 # keep ciphertext as next GHASH input
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001337 movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001338 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001339 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001340 movdqa \TMP3, \XMM3 # keep ciphertext as next GHASH input
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001341 movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001342 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001343 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001344 movdqa \TMP3, \XMM4 # keep ciphertext as next GHASH input
Uros Bizjakd7866e52020-07-09 17:08:57 +02001345 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1346 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1347 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1348 pshufb %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001349
1350 pxor \TMP4, \TMP1
1351 pxor \XMM8, \XMM5
1352 pxor \TMP6, \TMP2
1353 pxor \TMP1, \TMP2
1354 pxor \XMM5, \TMP2
1355 movdqa \TMP2, \TMP3
1356 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1357 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1358 pxor \TMP3, \XMM5
1359 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1360
1361 # first phase of reduction
1362
1363 movdqa \XMM5, \TMP2
1364 movdqa \XMM5, \TMP3
1365 movdqa \XMM5, \TMP4
1366# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367 pslld $31, \TMP2 # packed left shift << 31
1368 pslld $30, \TMP3 # packed left shift << 30
1369 pslld $25, \TMP4 # packed left shift << 25
1370 pxor \TMP3, \TMP2 # xor the shifted versions
1371 pxor \TMP4, \TMP2
1372 movdqa \TMP2, \TMP5
1373 psrldq $4, \TMP5 # right shift T5 1 DW
1374 pslldq $12, \TMP2 # left shift T2 3 DWs
1375 pxor \TMP2, \XMM5
1376
1377 # second phase of reduction
1378
1379 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380 movdqa \XMM5,\TMP3
1381 movdqa \XMM5,\TMP4
1382 psrld $1, \TMP2 # packed right shift >> 1
1383 psrld $2, \TMP3 # packed right shift >> 2
1384 psrld $7, \TMP4 # packed right shift >> 7
1385 pxor \TMP3,\TMP2 # xor the shifted versions
1386 pxor \TMP4,\TMP2
1387 pxor \TMP5, \TMP2
1388 pxor \TMP2, \XMM5
1389 pxor \TMP1, \XMM5 # reduced GHASH result accumulated in XMM5
1390
1391 pxor \XMM5, \XMM1 # fold result into XMM1 for the next iteration
1392.endm
1393
1394/* GHASH the last 4 ciphertext blocks. */
# GHASH_LAST_4: fold the four running hash-state blocks XMM1..XMM4 into one
# 128-bit GHASH value in XMMDst.  Block i is multiplied by HashKey^(5-i) in
# GF(2^128) using the Karatsuba trick (precomputed HashKey_N and
# HashKey_N_k values are read from the gcm_context_data at %arg2); the
# partial products accumulate in TMP6 (high), XMMDst (low) and XMM1
# (middle), then a two-phase reduction modulo the GCM polynomial
# x^128 + x^127 + x^126 + x^121 + 1 leaves the result in XMMDst.
# TMP1..TMP7 are scratch; XMM1..XMM4 are clobbered.
1395.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397
1398	# Karatsuba multiply: XMM1 * HashKey^4
1399
1400	movdqa	  \XMM1, \TMP6
1401	pshufd	  $78, \XMM1, \TMP2	# swap the two qwords of XMM1
1402	pxor	  \XMM1, \TMP2		# TMP2 = a1 ^ a0 (Karatsuba middle input)
Dave Watsone5b954e2018-08-15 10:29:42 -07001403	movdqu	  HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001404	pclmulqdq $0x11, \TMP5, \TMP6	# TMP6 = a1*b1
1405	pclmulqdq $0x00, \TMP5, \XMM1	# XMM1 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001406	movdqu	  HashKey_4_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001407	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001408	movdqa	  \XMM1, \XMMDst
1409	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1
1410
1411	# Karatsuba multiply: XMM2 * HashKey^3, accumulated into the above
1412
1413	movdqa	  \XMM2, \TMP1
1414	pshufd	  $78, \XMM2, \TMP2
1415	pxor	  \XMM2, \TMP2		# TMP2 = a1 ^ a0
Dave Watsone5b954e2018-08-15 10:29:42 -07001416	movdqu	  HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001417	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
1418	pclmulqdq $0x00, \TMP5, \XMM2	# XMM2 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001419	movdqu	  HashKey_3_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001420	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001421	pxor	  \TMP1, \TMP6
1422	pxor	  \XMM2, \XMMDst
1423	pxor	  \TMP2, \XMM1
1424# results accumulated in TMP6, XMMDst, XMM1
1425
1426	# Karatsuba multiply: XMM3 * HashKey^2, accumulated into the above
1427
1428	movdqa	  \XMM3, \TMP1
1429	pshufd	  $78, \XMM3, \TMP2
1430	pxor	  \XMM3, \TMP2		# TMP2 = a1 ^ a0
Dave Watsone5b954e2018-08-15 10:29:42 -07001431	movdqu	  HashKey_2(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001432	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
1433	pclmulqdq $0x00, \TMP5, \XMM3	# XMM3 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001434	movdqu	  HashKey_2_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001435	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001436	pxor	  \TMP1, \TMP6
1437	pxor	  \XMM3, \XMMDst
1438	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1
1439
1440	# Karatsuba multiply: XMM4 * HashKey, accumulated into the above
1441	movdqa	  \XMM4, \TMP1
1442	pshufd	  $78, \XMM4, \TMP2
1443	pxor	  \XMM4, \TMP2		# TMP2 = a1 ^ a0
Dave Watsone5b954e2018-08-15 10:29:42 -07001444	movdqu	  HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001445	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
1446	pclmulqdq $0x00, \TMP5, \XMM4	# XMM4 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001447	movdqu	  HashKey_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001448	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001449	pxor	  \TMP1, \TMP6
1450	pxor	  \XMM4, \XMMDst
1451	pxor	  \XMM1, \TMP2		# TMP2 = aggregate middle terms ...
1452	pxor	  \TMP6, \TMP2		# ... xor high halves ...
1453	pxor	  \XMMDst, \TMP2	# ... xor low halves (Karatsuba combine)
1454	# middle section of the temp results combined as in karatsuba algorithm
1455	movdqa	  \TMP2, \TMP4
1456	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
1457	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
1458	pxor	  \TMP4, \XMMDst
1459	pxor	  \TMP2, \TMP6
1460# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461	# first phase of the reduction
1462	movdqa    \XMMDst, \TMP2
1463	movdqa    \XMMDst, \TMP3
1464	movdqa    \XMMDst, \TMP4
1465# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466	pslld     $31, \TMP2		# packed left shift << 31
1467	pslld     $30, \TMP3		# packed left shift << 30
1468	pslld     $25, \TMP4		# packed left shift << 25
1469	pxor      \TMP3, \TMP2		# xor the shifted versions
1470	pxor      \TMP4, \TMP2
1471	movdqa    \TMP2, \TMP7
1472	psrldq    $4, \TMP7		# right shift TMP7 1 DW
1473	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
1474	pxor      \TMP2, \XMMDst
1475
1476	# second phase of the reduction
1477	movdqa    \XMMDst, \TMP2
1478	# make 3 copies of XMMDst for doing 3 shift operations
1479	movdqa    \XMMDst, \TMP3
1480	movdqa    \XMMDst, \TMP4
1481	psrld     $1, \TMP2		# packed right shift >> 1
1482	psrld     $2, \TMP3		# packed right shift >> 2
1483	psrld     $7, \TMP4		# packed right shift >> 7
1484	pxor      \TMP3, \TMP2		# xor the shifted versions
1485	pxor      \TMP4, \TMP2
1486	pxor      \TMP7, \TMP2
1487	pxor      \TMP2, \XMMDst
1488	pxor      \TMP6, \XMMDst	# reduced result is in XMMDst
1489.endm
1490
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001491
1492/* Encryption of a single block: AES-encrypt \XMM0 in place using the
 * expanded key schedule at %arg1.  Handles 128/192/256-bit keys by
 * deriving the round count from the stored keysize.
 * uses eax & r10; \TMP1 is scratch.
1494*/
1495
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001496.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001498	pxor	   (%arg1), \XMM0	# round 0: AddRoundKey
1499	mov	   keysize,%eax
1500	shr	   $2,%eax		# 128->4, 192->6, 256->8
1501	add	   $5,%eax		# 128->9, 192->11, 256->13 (= rounds-1)
1502	lea	   16(%arg1), %r10	# get first expanded key address
1503
1504_esb_loop_\@:
1505	MOVADQ	   (%r10),\TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001506	aesenc	   \TMP1,\XMM0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001507	add	   $16,%r10
1508	sub	   $1,%eax
1509	jnz	   _esb_loop_\@
1510
1511	MOVADQ	   (%r10),\TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001512	aesenclast	   \TMP1,\XMM0	# final round
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001513.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001514/*****************************************************************************
1515* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001516* struct gcm_context_data *data
1517* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001518* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1519* const u8 *in, // Ciphertext input
1520* u64 plaintext_len, // Length of data in bytes for decryption.
1521* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1522* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523* // concatenated with 0x00000001. 16-byte aligned pointer.
1524* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525* const u8 *aad, // Additional Authentication Data (AAD)
1526* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1528* // given authentication tag and only return the plaintext if they match.
1529* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530* // (most likely), 12 or 8.
1531*
1532* Assumptions:
1533*
1534* keys:
1535* keys are pre-expanded and aligned to 16 bytes. we are using the first
1536* set of 11 keys in the data structure void *aes_ctx
1537*
1538* iv:
1539* 0 1 2 3
1540* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542* | Salt (From the SA) |
1543* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544* | Initialization Vector |
1545* | (This is the sequence number from IPSec header) |
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | 0x1 |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*
1550*
1551*
1552* AAD:
1553* AAD padded to 128 bits with 0
1554* for example, assume AAD is a u32 vector
1555*
1556* if AAD is 8 bytes:
1557* AAD[3] = {A0, A1};
1558* padded AAD in xmm register = {A1 A0 0 0}
1559*
1560* 0 1 2 3
1561* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563* | SPI (A1) |
1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565* | 32-bit Sequence Number (A0) |
1566* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567* | 0x0 |
1568* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569*
1570* AAD Format with 32-bit Sequence Number
1571*
1572* if AAD is 12 bytes:
1573* AAD[3] = {A0, A1, A2};
1574* padded AAD in xmm register = {A2 A1 A0 0}
1575*
1576* 0 1 2 3
1577* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1580* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581* | SPI (A2) |
1582* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583* | 64-bit Extended Sequence Number {A1,A0} |
1584* | |
1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | 0x0 |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*
1589* AAD Format with 64-bit Extended Sequence Number
1590*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001591* poly = x^128 + x^127 + x^126 + x^121 + 1
1592*
1593*****************************************************************************/
# One-shot AES-GCM decryption: initialize GCM state from the IV/AAD,
# decrypt + GHASH the whole ciphertext, then compute the authentication
# tag (parameters documented in the header comment above).
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001594SYM_FUNC_START(aesni_gcm_dec)
Dave Watson6c2c86b2018-02-14 09:38:35 -08001595	FUNC_SAVE
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001596
Dave Watsonfb8986e2018-02-14 09:40:47 -08001597	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
Dave Watsonba458332018-02-14 09:39:10 -08001598	GCM_ENC_DEC dec
Dave Watsonfb8986e2018-02-14 09:40:47 -08001599	GCM_COMPLETE arg10, arg11	# auth_tag, auth_tag_len
Dave Watson6c2c86b2018-02-14 09:38:35 -08001600	FUNC_RESTORE
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001601	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001602SYM_FUNC_END(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001603
1604
1605/*****************************************************************************
1606* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001607* struct gcm_context_data *data
1608* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001609* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1610* const u8 *in, // Plaintext input
1611* u64 plaintext_len, // Length of data in bytes for encryption.
1612* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1613* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614* // concatenated with 0x00000001. 16-byte aligned pointer.
1615* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616* const u8 *aad, // Additional Authentication Data (AAD)
1617* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618* u8 *auth_tag, // Authenticated Tag output.
1619* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620* // 12 or 8.
1621*
1622* Assumptions:
1623*
1624* keys:
1625* keys are pre-expanded and aligned to 16 bytes. we are using the
1626* first set of 11 keys in the data structure void *aes_ctx
1627*
1628*
1629* iv:
1630* 0 1 2 3
1631* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633* | Salt (From the SA) |
1634* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635* | Initialization Vector |
1636* | (This is the sequence number from IPSec header) |
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | 0x1 |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*
1641*
1642*
1643* AAD:
1644* AAD padded to 128 bits with 0
1645* for example, assume AAD is a u32 vector
1646*
1647* if AAD is 8 bytes:
1648* AAD[3] = {A0, A1};
1649* padded AAD in xmm register = {A1 A0 0 0}
1650*
1651* 0 1 2 3
1652* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654* | SPI (A1) |
1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656* | 32-bit Sequence Number (A0) |
1657* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658* | 0x0 |
1659* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660*
1661* AAD Format with 32-bit Sequence Number
1662*
1663* if AAD is 12 bytes:
1664* AAD[3] = {A0, A1, A2};
1665* padded AAD in xmm register = {A2 A1 A0 0}
1666*
1667* 0 1 2 3
1668* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670* | SPI (A2) |
1671* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672* | 64-bit Extended Sequence Number {A1,A0} |
1673* | |
1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675* | 0x0 |
1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*
1678* AAD Format with 64-bit Extended Sequence Number
1679*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001680* poly = x^128 + x^127 + x^126 + x^121 + 1
1681***************************************************************************/
# One-shot AES-GCM encryption: initialize GCM state from the IV/AAD,
# encrypt + GHASH the whole plaintext, then emit the authentication tag
# (parameters documented in the header comment above).
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001682SYM_FUNC_START(aesni_gcm_enc)
Dave Watson6c2c86b2018-02-14 09:38:35 -08001683	FUNC_SAVE
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001684
Dave Watsonfb8986e2018-02-14 09:40:47 -08001685	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
Dave Watsonba458332018-02-14 09:39:10 -08001686	GCM_ENC_DEC enc
Dave Watsonfb8986e2018-02-14 09:40:47 -08001687
1688	GCM_COMPLETE arg10, arg11	# auth_tag, auth_tag_len
Dave Watson6c2c86b2018-02-14 09:38:35 -08001689	FUNC_RESTORE
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001690	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001691SYM_FUNC_END(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001692
Dave Watsonfb8986e2018-02-14 09:40:47 -08001693/*****************************************************************************
1694* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1695* struct gcm_context_data *data,
1696* // context data
1697* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1698* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699* // concatenated with 0x00000001. 16-byte aligned pointer.
1700* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701* const u8 *aad, // Additional Authentication Data (AAD)
1702* u64 aad_len) // Length of AAD in bytes.
1703*/
# Streaming interface: set up the GCM state (counter block from iv, AAD
# hash, hash-subkey material) in gcm_context_data for subsequent
# aesni_gcm_enc_update/dec_update/finalize calls.
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001704SYM_FUNC_START(aesni_gcm_init)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001705	FUNC_SAVE
1706	GCM_INIT %arg3, %arg4,%arg5, %arg6	# iv, hash_subkey, aad, aad_len
1707	FUNC_RESTORE
1708	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001709SYM_FUNC_END(aesni_gcm_init)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001710
1711/*****************************************************************************
1712* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1713* struct gcm_context_data *data,
1714* // context data
1715* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1716* const u8 *in, // Plaintext input
1717* u64 plaintext_len, // Length of data in bytes for encryption.
1718*/
# Streaming interface: encrypt + GHASH one chunk of plaintext, carrying
# state in gcm_context_data; callable repeatedly between aesni_gcm_init()
# and aesni_gcm_finalize().
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001719SYM_FUNC_START(aesni_gcm_enc_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001720	FUNC_SAVE
1721	GCM_ENC_DEC enc
1722	FUNC_RESTORE
1723	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001724SYM_FUNC_END(aesni_gcm_enc_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001725
1726/*****************************************************************************
1727* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1728* struct gcm_context_data *data,
1729* // context data
1730* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1731* const u8 *in, // Plaintext input
1732* u64 plaintext_len, // Length of data in bytes for encryption.
1733*/
# Streaming interface: decrypt + GHASH one chunk of ciphertext, carrying
# state in gcm_context_data; callable repeatedly between aesni_gcm_init()
# and aesni_gcm_finalize().
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001734SYM_FUNC_START(aesni_gcm_dec_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001735	FUNC_SAVE
1736	GCM_ENC_DEC dec
1737	FUNC_RESTORE
1738	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001739SYM_FUNC_END(aesni_gcm_dec_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001740
1741/*****************************************************************************
1742* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1743* struct gcm_context_data *data,
1744* // context data
1745* u8 *auth_tag, // Authenticated Tag output.
1746* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747* // 12 or 8.
1748*/
# Streaming interface: complete the GCM computation and write the
# authentication tag (auth_tag, auth_tag_len in %arg3/%arg4).
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001749SYM_FUNC_START(aesni_gcm_finalize)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001750	FUNC_SAVE
1751	GCM_COMPLETE %arg3 %arg4	# auth_tag, auth_tag_len
1752	FUNC_RESTORE
1753	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001754SYM_FUNC_END(aesni_gcm_finalize)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001755
Mathias Krause559ad0f2010-11-29 08:35:39 +08001756#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001757
1758
# _key_expansion_128 / _key_expansion_256a: derive the next round key from
# %xmm0.  The same code serves every AES-128 step and (as _key_expansion_256a)
# the even steps of AES-256.
# In:  %xmm0 = previous round key, %xmm1 = aeskeygenassist result,
#      %xmm4 = 0 (caller guarantees), TKEYP = store pointer.
# Out: %xmm0 = new round key, stored at (TKEYP); TKEYP advanced by 0x10.
Jiri Slabye9b9d022019-10-11 13:50:49 +02001759SYM_FUNC_START_LOCAL(_key_expansion_128)
Jiri Slaby74d8b902019-10-11 13:50:46 +02001760SYM_FUNC_START_LOCAL(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001761	pshufd $0b11111111, %xmm1, %xmm1	# broadcast keygenassist word
1762	shufps $0b00010000, %xmm0, %xmm4	# shift prior words into place...
1763	pxor %xmm4, %xmm0
1764	shufps $0b10001100, %xmm0, %xmm4	# ...cascading w ^= w<<32 terms
1765	pxor %xmm4, %xmm0
1766	pxor %xmm1, %xmm0			# fold in RotWord/SubWord/Rcon
Mathias Krause0d258ef2010-11-27 16:34:46 +08001767	movaps %xmm0, (TKEYP)
1768	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001769	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001770SYM_FUNC_END(_key_expansion_256a)
Jiri Slabye9b9d022019-10-11 13:50:49 +02001771SYM_FUNC_END_ALIAS(_key_expansion_128)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001772
# _key_expansion_192a: one AES-192 expansion step producing six new words;
# they are repacked (together with the previous tail in %xmm2) into TWO
# 16-byte round-key slots, since AES-192 round keys straddle the stores.
# In:  %xmm0/%xmm2 = current schedule state, %xmm1 = aeskeygenassist
#      result, %xmm4 = 0, TKEYP = store pointer.
# Out: two slots written; TKEYP += 0x20; %xmm0/%xmm2 updated for the next step.
Jiri Slaby74d8b902019-10-11 13:50:46 +02001773SYM_FUNC_START_LOCAL(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001774	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
1775	shufps $0b00010000, %xmm0, %xmm4
1776	pxor %xmm4, %xmm0
1777	shufps $0b10001100, %xmm0, %xmm4
1778	pxor %xmm4, %xmm0			# first four new words in %xmm0
1779	pxor %xmm1, %xmm0
1780
1781	movaps %xmm2, %xmm5
1782	movaps %xmm2, %xmm6
1783	pslldq $4, %xmm5
1784	pshufd $0b11111111, %xmm0, %xmm3
1785	pxor %xmm3, %xmm2
1786	pxor %xmm5, %xmm2			# remaining two new words in %xmm2
1787
1788	movaps %xmm0, %xmm1
1789	shufps $0b01000100, %xmm0, %xmm6	# pack old tail + new head
Mathias Krause0d258ef2010-11-27 16:34:46 +08001790	movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001791	shufps $0b01001110, %xmm2, %xmm1	# pack the rest of the new words
Mathias Krause0d258ef2010-11-27 16:34:46 +08001792	movaps %xmm1, 0x10(TKEYP)
1793	add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001794	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001795SYM_FUNC_END(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001796
# _key_expansion_192b: companion AES-192 step for the phases whose new
# words align with a single 16-byte slot — only %xmm0 is stored here; the
# extra words remain in %xmm2 for the following _key_expansion_192a.
# Register contract matches _key_expansion_192a (TKEYP += 0x10 only).
Jiri Slaby74d8b902019-10-11 13:50:46 +02001797SYM_FUNC_START_LOCAL(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001798	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist word
1799	shufps $0b00010000, %xmm0, %xmm4
1800	pxor %xmm4, %xmm0
1801	shufps $0b10001100, %xmm0, %xmm4
1802	pxor %xmm4, %xmm0			# first four new words in %xmm0
1803	pxor %xmm1, %xmm0
1804
1805	movaps %xmm2, %xmm5
1806	pslldq $4, %xmm5
1807	pshufd $0b11111111, %xmm0, %xmm3
1808	pxor %xmm3, %xmm2
1809	pxor %xmm5, %xmm2			# remaining words kept in %xmm2
1810
Mathias Krause0d258ef2010-11-27 16:34:46 +08001811	movaps %xmm0, (TKEYP)
1812	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001813	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001814SYM_FUNC_END(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001815
# _key_expansion_256b: odd-round step of AES-256 expansion — derives the
# next round key from %xmm2 using the keygenassist result in %xmm1
# (SubWord only, no rotation, hence the $0b10101010 broadcast).
# %xmm4 must be zero on entry; stores at (TKEYP), TKEYP += 0x10.
Jiri Slaby74d8b902019-10-11 13:50:46 +02001816SYM_FUNC_START_LOCAL(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001817	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygenassist word
1818	shufps $0b00010000, %xmm2, %xmm4	# cascade w ^= w<<32 terms
1819	pxor %xmm4, %xmm2
1820	shufps $0b10001100, %xmm2, %xmm4
1821	pxor %xmm4, %xmm2
1822	pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001823	movaps %xmm2, (TKEYP)
1824	add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001825	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001826SYM_FUNC_END(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001827
1828/*
1829 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830 * unsigned int key_len)
1831 */
# Expand the user key into the encryption schedule at (KEYP) and the
# decryption schedule at 240(KEYP); key length is stashed at 480(KEYP).
# Returns 0 in AREG.  Branches on key_len: 16 -> AES-128, 24 -> AES-192,
# 32 -> AES-256 (fall-through path).
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001832SYM_FUNC_START(aesni_set_key)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001833	FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001834#ifndef __x86_64__
1835	pushl KEYP
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001836	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1837	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1838	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
Mathias Krause0d258ef2010-11-27 16:34:46 +08001839#endif
1840	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1841	movaps %xmm0, (KEYP)		# round-0 key is the raw user key
1842	lea 0x10(KEYP), TKEYP	# key addr
1843	movl %edx, 480(KEYP)		# stash key_len for enc/dec entry points
Huang Ying54b6a1b2009-01-18 16:28:34 +11001844	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1845	cmp $24, %dl			# key_len bytes: <24=128, =24=192, >24=256
1846	jb .Lenc_key128
1847	je .Lenc_key192
	# fall through: 256-bit key
Mathias Krause0d258ef2010-11-27 16:34:46 +08001848	movups 0x10(UKEYP), %xmm2	# other user key
1849	movaps %xmm2, (TKEYP)
1850	add $0x10, TKEYP
Uros Bizjakd7866e52020-07-09 17:08:57 +02001851	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001852	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001853	aeskeygenassist $0x1, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001854	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001855	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001856	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001857	aeskeygenassist $0x2, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001858	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001859	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001860	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001861	aeskeygenassist $0x4, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001862	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001863	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001864	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001865	aeskeygenassist $0x8, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001866	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001867	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001868	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001869	aeskeygenassist $0x10, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001870	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001871	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001872	call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001873	aeskeygenassist $0x20, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001874	call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001875	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001876	call _key_expansion_256a
1877	jmp .Ldec_key
1878.Lenc_key192:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001879	movq 0x10(UKEYP), %xmm2		# other user key
Uros Bizjakd7866e52020-07-09 17:08:57 +02001880	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001881	call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001882	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001883	call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001884	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001885	call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001886	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001887	call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001888	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001889	call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001890	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001891	call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001892	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001893	call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001894	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001895	call _key_expansion_192b
1896	jmp .Ldec_key
1897.Lenc_key128:
Uros Bizjakd7866e52020-07-09 17:08:57 +02001898	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001899	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001900	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001901	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001902	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001903	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001904	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001905	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001906	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001907	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001908	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001909	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001910	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001911	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001912	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001913	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001914	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001915	call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001916	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001917	call _key_expansion_128
1918.Ldec_key:
	# Build the decryption schedule: encryption round keys in reverse
	# order, the inner ones transformed with AESIMC (Equivalent Inverse
	# Cipher); the two end keys are copied untransformed.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001919	sub $0x10, TKEYP
1920	movaps (KEYP), %xmm0
1921	movaps (TKEYP), %xmm1
1922	movaps %xmm0, 240(TKEYP)
1923	movaps %xmm1, 240(KEYP)
1924	add $0x10, KEYP
1925	lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001926.align 4
1927.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001928	movaps (KEYP), %xmm0
Uros Bizjakd7866e52020-07-09 17:08:57 +02001929	aesimc %xmm0, %xmm1		# InvMixColumns of the round key
Mathias Krause0d258ef2010-11-27 16:34:46 +08001930	movaps %xmm1, (UKEYP)
1931	add $0x10, KEYP
1932	sub $0x10, UKEYP
1933	cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001934	jb .Ldec_key_loop
Mathias Krause0d258ef2010-11-27 16:34:46 +08001935	xor AREG, AREG			# return 0 (success)
1936#ifndef __x86_64__
1937	popl KEYP
1938#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001939	FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001940	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001941SYM_FUNC_END(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001942
1943/*
Kees Cook9c1e8832019-11-26 22:08:02 -08001944 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001945 */
# Single-block AES encryption: *dst = AES_enc(*src) under the expanded key
# schedule in ctx.  Thin wrapper: load block, call _aesni_enc1, store.
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001946SYM_FUNC_START(aesni_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001947	FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001948#ifndef __x86_64__
1949	pushl KEYP
1950	pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001951	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1952	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1953	movl (FRAME_OFFSET+20)(%esp), INP	# src
Mathias Krause0d258ef2010-11-27 16:34:46 +08001954#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001955	movl 480(KEYP), KLEN		# key length
1956	movups (INP), STATE		# input (may be unaligned)
1957	call _aesni_enc1
1958	movups STATE, (OUTP)	# output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001959#ifndef __x86_64__
1960	popl KLEN
1961	popl KEYP
1962#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001963	FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001964	ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001965SYM_FUNC_END(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001966
1967/*
1968 * _aesni_enc1: internal ABI
1969 * input:
1970 * KEYP: key struct pointer
1971 *	KLEN:		key length in bytes (16/24/32, as stored at 480(KEYP))
1972 * STATE: initial state (input)
1973 * output:
1974 *	STATE:		final state (output)
1975 * changed:
1976 * KEY
1977 * TKEYP (T1)
1978 */
# _aesni_enc1 (contract in the header comment above): encrypt STATE with
# the schedule at KEYP.  Longer keys branch to run extra leading rounds,
# then every key size falls through the common nine-round + last tail.
Jiri Slaby74d8b902019-10-11 13:50:46 +02001979SYM_FUNC_START_LOCAL(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001980	movaps (KEYP), KEY		# key
1981	mov KEYP, TKEYP
1982	pxor KEY, STATE		# round 0
1983	add $0x30, TKEYP
1984	cmp $24, KLEN			# key length: <24 -> 128, =24 -> 192
1985	jb .Lenc128
1986	lea 0x20(TKEYP), TKEYP
1987	je .Lenc192
	# AES-256: two extra rounds before the 192-bit path
1988	add $0x20, TKEYP
1989	movaps -0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001990	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001991	movaps -0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001992	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001993.align 4
1994.Lenc192:
	# AES-192: two extra rounds before the 128-bit path
1995	movaps -0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001996	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001997	movaps -0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001998	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001999.align 4
2000.Lenc128:
	# common tail: nine aesenc rounds plus the aesenclast final round
2001	movaps -0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002002	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002003	movaps -0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002004	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002005	movaps (TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002006	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002007	movaps 0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002008	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002009	movaps 0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002010	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002011	movaps 0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002012	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002013	movaps 0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002014	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002015	movaps 0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002016	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002017	movaps 0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002018	aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002019	movaps 0x70(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002020	aesenclast KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002021	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02002022SYM_FUNC_END(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002023
2024/*
2025 * _aesni_enc4: internal ABI
2026 * input:
2027 * KEYP: key struct pointer
2028 *	KLEN:		key length in bytes (16/24/32, as stored at 480(KEYP))
2029 * STATE1: initial state (input)
2030 * STATE2
2031 * STATE3
2032 * STATE4
2033 * output:
2034 *	STATE1:		final state (output)
2035 * STATE2
2036 * STATE3
2037 * STATE4
2038 * changed:
2039 * KEY
2040 * TKEYP (T1)
2041 */
# _aesni_enc4 (contract in the header comment above): encrypt four
# independent blocks STATE1..STATE4 with one key schedule.  Same round
# structure as _aesni_enc1, unrolled four-wide so the four encryptions
# proceed together on each round key.
Jiri Slaby74d8b902019-10-11 13:50:46 +02002042SYM_FUNC_START_LOCAL(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002043	movaps (KEYP), KEY		# key
2044	mov KEYP, TKEYP
2045	pxor KEY, STATE1		# round 0
2046	pxor KEY, STATE2
2047	pxor KEY, STATE3
2048	pxor KEY, STATE4
2049	add $0x30, TKEYP
2050	cmp $24, KLEN			# key length: <24 -> 128, =24 -> 192
2051	jb .L4enc128
2052	lea 0x20(TKEYP), TKEYP
2053	je .L4enc192
	# AES-256: two extra rounds before the 192-bit path
2054	add $0x20, TKEYP
2055	movaps -0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002056	aesenc KEY, STATE1
2057	aesenc KEY, STATE2
2058	aesenc KEY, STATE3
2059	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002060	movaps -0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002061	aesenc KEY, STATE1
2062	aesenc KEY, STATE2
2063	aesenc KEY, STATE3
2064	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002065#.align 4
2066.L4enc192:
	# AES-192: two extra rounds before the 128-bit path
2067	movaps -0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002068	aesenc KEY, STATE1
2069	aesenc KEY, STATE2
2070	aesenc KEY, STATE3
2071	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002072	movaps -0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002073	aesenc KEY, STATE1
2074	aesenc KEY, STATE2
2075	aesenc KEY, STATE3
2076	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002077#.align 4
2078.L4enc128:
	# common tail: nine aesenc rounds plus the aesenclast final round
2079	movaps -0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002080	aesenc KEY, STATE1
2081	aesenc KEY, STATE2
2082	aesenc KEY, STATE3
2083	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002084	movaps -0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002085	aesenc KEY, STATE1
2086	aesenc KEY, STATE2
2087	aesenc KEY, STATE3
2088	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002089	movaps (TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002090	aesenc KEY, STATE1
2091	aesenc KEY, STATE2
2092	aesenc KEY, STATE3
2093	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002094	movaps 0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002095	aesenc KEY, STATE1
2096	aesenc KEY, STATE2
2097	aesenc KEY, STATE3
2098	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002099	movaps 0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002100	aesenc KEY, STATE1
2101	aesenc KEY, STATE2
2102	aesenc KEY, STATE3
2103	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002104	movaps 0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002105	aesenc KEY, STATE1
2106	aesenc KEY, STATE2
2107	aesenc KEY, STATE3
2108	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002109	movaps 0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002110	aesenc KEY, STATE1
2111	aesenc KEY, STATE2
2112	aesenc KEY, STATE3
2113	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002114	movaps 0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002115	aesenc KEY, STATE1
2116	aesenc KEY, STATE2
2117	aesenc KEY, STATE3
2118	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002119	movaps 0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002120	aesenc KEY, STATE1
2121	aesenc KEY, STATE2
2122	aesenc KEY, STATE3
2123	aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002124	movaps 0x70(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002125	aesenclast KEY, STATE1	# last round
2126	aesenclast KEY, STATE2
2127	aesenclast KEY, STATE3
2128	aesenclast KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002129	ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02002130SYM_FUNC_END(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002131
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt a single 16-byte block.  On x86-64 the args arrive in the
 * usual SysV registers (bound to KEYP/OUTP/INP elsewhere in this file);
 * on 32-bit they are fetched from the stack after saving the regs we
 * reuse as KEYP/KLEN.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN	# key length (offset fixed by struct crypto_aes_ctx)
	add $240, KEYP		# advance to the decryption key schedule
	movups (INP), STATE	# input block (may be unaligned)
	call _aesni_dec1
	movups STATE, (OUTP)	# output block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002156
/*
 * _aesni_dec1:	internal ABI
 *	Run the full AES decryption round sequence on one block.
 * input:
 *	KEYP:	key struct pointer (already advanced to the dec schedule)
 *	KLEN:	key length (16/24/32 bytes)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so the same -0x20..0x70 round-key offsets serve all
 * three key sizes; the entry point below selects how many extra rounds
 * run before falling through to the common .Ldec192/.Ldec128 tails.
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0: AddRoundKey
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# KLEN < 24  -> AES-128 (10 rounds)
	lea 0x20(TKEYP), TKEYP
	je .Ldec192			# KLEN == 24 -> AES-192 (12 rounds)
	add $0x20, TKEYP		# else AES-256 (14 rounds)
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round omits InvMixColumns
	ret
SYM_FUNC_END(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002213
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks in parallel; interleaving four
 *	aesdec chains hides the instruction latency.
 * input:
 *	KEYP:	key struct pointer (already advanced to the dec schedule)
 *	KLEN:	key length (16/24/32 bytes)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0: AddRoundKey on all four blocks
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128			# AES-128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192			# AES-192
	add $0x20, TKEYP		# AES-256: two extra rounds first
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002321
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB encrypt: 4 blocks at a time while len >= 64, then one at a time.
 * Any tail of len smaller than 16 bytes is ignored.
 * (Note: dst is written and src is read; the historical prototype
 * comment had the const on the wrong pointer.)
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# nothing to do for len == 0
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:			# 4-block (64-byte) fast path
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:			# single-block tail loop
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002381
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB decrypt: mirrors aesni_ecb_enc, but first advances KEYP by 240
 * bytes to the decryption key schedule expected by _aesni_dec1/_aesni_dec4.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# switch to decryption key schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:			# 4-block (64-byte) fast path
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:			# single-block tail loop
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002442
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC encrypt.  Inherently serial (each block chains on the previous
 * ciphertext), so only a single-block loop is possible.  The final
 * ciphertext block is written back through IVP as the next IV.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: state ^= plaintext
	call _aesni_enc1
	movups STATE, (OUTP)	# store ciphertext
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)	# save last ciphertext block as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002486
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC decrypt.  Unlike encryption this parallelizes: four blocks are
 * decrypted at once and then XORed with the preceding ciphertext
 * blocks.  On 32-bit x86 there are not enough XMM registers to keep
 * IN1..IN4 live, so IN1/IN2 are reused and the first two ciphertext
 * blocks are re-loaded from INP after _aesni_dec4 returns.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# switch to decryption key schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:			# 4-block fast path
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1	# 32-bit: reuse IN1/IN2, re-load later
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# first block chains on the incoming IV
#ifdef __x86_64__
	pxor IN1, STATE2	# blocks 2..4 chain on prior ciphertext
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext block becomes next IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1	# re-load ciphertext blocks 0 and 1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:			# single-block tail loop
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# hand the next IV back to the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002579
Mathias Krause0d258ef2010-11-27 16:34:46 +08002580#ifdef __x86_64__
.pushsection .rodata
.align 16
/*
 * pshufb control mask that reverses the byte order of a 128-bit value
 * (index i selects source byte 15-i); used by _aesni_inc_init and
 * _aesni_inc to convert the CTR counter between big and little endian.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002586
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR	# big-endian IV -> little-endian counter
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC	# INC = 1 in the low qword
	movq CTR, TCTR_LOW	# shadow low qword in a GPR for carry checks
	ret
SYM_FUNC_END(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002607
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq only adds within the low qword; the GPR shadow TCTR_LOW is
 * incremented in parallel so its carry flag tells us when the add
 * overflowed and the high qword must be bumped too.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR		# low qword += 1 (no cross-qword carry)
	add $1, TCTR_LOW
	jnc .Linc_low		# no carry -> low-qword add was enough
	pslldq $8, INC		# move the 1 into the high qword
	paddq INC, CTR		# propagate carry into the high qword
	psrldq $8, INC		# restore INC = 1 in the low qword
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV	# back to big-endian for the cipher input
	ret
SYM_FUNC_END(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002635
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: encrypt successive counter values and XOR them into the
 * data.  Counter blocks are independent, so four are processed per
 * iteration on the fast path.  Note CTR uses the *encryption* schedule
 * for both directions, hence no +240 KEYP adjustment here.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init	# set up CTR/INC/BSWAP_MASK helpers
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:			# 4 counter blocks per iteration
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1	# keystream ^ plaintext
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:			# single-block tail loop
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)	# return the next counter value
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002698
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs ("ble" = bit-reversed little
 *	endian convention).  Doubles IV: shifts each qword left by one
 *	(paddq IV,IV), then folds the shifted-out top bits back in.
 *	pshufd $0x13 + psrad $31 broadcasts the sign bits of the dwords
 *	so that, ANDed with GF128MUL_MASK (0x87 / 0x01 lanes), the XOR
 *	applies the reduction polynomial 0x87 on carry-out and carries
 *	bit 63 into the high qword.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2716
/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			 const u8 *src, bool enc, le128 *iv)
 *
 * Process eight XTS blocks.  The enc flag (low byte of the 4th arg,
 * tested via %cl) selects between encryption and decryption with cmovs:
 * for decryption ecx becomes 240 (advance KEYP to the dec schedule) and
 * r11 points at _aesni_dec4 instead of _aesni_enc4.  Only four STATE
 * registers exist, so the eight blocks are handled as two batches of
 * four; the per-block tweaks are parked in dst and re-loaded after the
 * cipher call to apply the second tweak XOR.
 */
SYM_FUNC_START(aesni_xts_crypt8)
	FRAME_BEGIN
	testb %cl, %cl		# ZF set <=> enc == false (decrypt)
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx	# decrypt: key-schedule offset 240
	cmoveq %rax, %r11	# decrypt: use _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV	# initial tweak

	mov 480(KEYP), KLEN
	addq %rcx, KEYP		# +0 for enc, +240 for dec

	/* batch 1: blocks 0-3 — XOR tweak in, stash tweak in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC r11		# _aesni_enc4 or _aesni_dec4

	/* finish blocks 0-3 and set up blocks 4-7 interleaved */
	movdqu 0x00(OUTP), INC	# re-load stashed tweak
	pxor INC, STATE1	# second tweak XOR -> final output
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)	# return tweak for the next 8-block call

	CALL_NOSPEC r11		# cipher batch 2: blocks 4-7

	/* finish blocks 4-7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
SYM_FUNC_END(aesni_xts_crypt8)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002826
Mathias Krause0d258ef2010-11-27 16:34:46 +08002827#endif