/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 * Author: Huang Ying <ying.huang@intel.com>
 *         Vinodh Gopal <vinodh.gopal@intel.com>
 *         Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *          Aidan O'Mahony (aidan.o.mahony@intel.com)
 *          Adrian Hoban <adrian.hoban@intel.com>
 *          James Guilford (james.guilford@intel.com)
 *          Gabriele Paoloni <gabriele.paoloni@intel.com>
 *          Tadeusz Struk (tadeusz.struk@intel.com)
 *          Wajdi Feghali (wajdi.k.feghali@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 * Author: Mathias Krause <minipli@googlemail.com>
 */
27
28#include <linux/linkage.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060029#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000030#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110031
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values, for FP
 * use movaps (move aligned packed single) or integer use movdqa (move double
 * quad aligned). It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released. However, the movaps
 * is a byte shorter, so that is the one we'll use for now. (same for
 * unaligned).
 */
40#define MOVADQ movaps
41#define MOVUDQ movups
42
Mathias Krause559ad0f2010-11-29 08:35:39 +080043#ifdef __x86_64__
Timothy McCaffreye31ac322015-01-13 13:16:43 -050044
Denys Vlasenkoe1839142017-01-19 22:33:04 +010045# constants in mergeable sections, linker can reorder and merge
Denys Vlasenkoe1839142017-01-19 22:33:04 +010046.section .rodata.cst16.POLY, "aM", @progbits, 16
47.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040048POLY: .octa 0xC2000000000000000000000000000001
Denys Vlasenkoe1839142017-01-19 22:33:04 +010049.section .rodata.cst16.TWOONE, "aM", @progbits, 16
50.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040051TWOONE: .octa 0x00000001000000000000000000000001
52
Denys Vlasenkoe1839142017-01-19 22:33:04 +010053.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
54.align 16
55SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
56.section .rodata.cst16.MASK1, "aM", @progbits, 16
57.align 16
58MASK1: .octa 0x0000000000000000ffffffffffffffff
59.section .rodata.cst16.MASK2, "aM", @progbits, 16
60.align 16
61MASK2: .octa 0xffffffffffffffff0000000000000000
62.section .rodata.cst16.ONE, "aM", @progbits, 16
63.align 16
64ONE: .octa 0x00000000000000000000000000000001
65.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
66.align 16
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68.section .rodata.cst16.dec, "aM", @progbits, 16
69.align 16
70dec: .octa 0x1
71.section .rodata.cst16.enc, "aM", @progbits, 16
72.align 16
73enc: .octa 0x2
74
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040075# order of these constants should not change.
76# more specifically, ALL_F should follow SHIFT_MASK,
Denys Vlasenkoe1839142017-01-19 22:33:04 +010077# and zero should follow ALL_F
78.section .rodata, "a", @progbits
79.align 16
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040080SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81ALL_F: .octa 0xffffffffffffffffffffffffffffffff
Denys Vlasenkoe1839142017-01-19 22:33:04 +010082 .octa 0x00000000000000000000000000000000
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040083
Huang Ying54b6a1b2009-01-18 16:28:34 +110084.text
85
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040086
87#define STACK_OFFSET 8*3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040088
Dave Watson9ee4a5d2018-02-14 09:39:23 -080089#define AadHash 16*0
90#define AadLen 16*1
91#define InLen (16*1)+8
92#define PBlockEncKey 16*2
93#define OrigIV 16*3
94#define CurCount 16*4
95#define PBlockLen 16*5
Dave Watson1476db22018-02-14 09:40:10 -080096#define HashKey 16*6 // store HashKey <<1 mod poly here
97#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
98#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
99#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
100#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
101 // bits of HashKey <<1 mod poly here
102 //(for Karatsuba purposes)
103#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
104 // bits of HashKey^2 <<1 mod poly here
105 // (for Karatsuba purposes)
106#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
107 // bits of HashKey^3 <<1 mod poly here
108 // (for Karatsuba purposes)
109#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
110 // bits of HashKey^4 <<1 mod poly here
111 // (for Karatsuba purposes)
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800112
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400113#define arg1 rdi
114#define arg2 rsi
115#define arg3 rdx
116#define arg4 rcx
117#define arg5 r8
118#define arg6 r9
Dave Watson1476db22018-02-14 09:40:10 -0800119#define arg7 STACK_OFFSET+8(%rsp)
120#define arg8 STACK_OFFSET+16(%rsp)
121#define arg9 STACK_OFFSET+24(%rsp)
122#define arg10 STACK_OFFSET+32(%rsp)
123#define arg11 STACK_OFFSET+40(%rsp)
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500124#define keysize 2*15*16(%arg1)
Mathias Krause559ad0f2010-11-29 08:35:39 +0800125#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400126
127
Huang Ying54b6a1b2009-01-18 16:28:34 +1100128#define STATE1 %xmm0
129#define STATE2 %xmm4
130#define STATE3 %xmm5
131#define STATE4 %xmm6
132#define STATE STATE1
133#define IN1 %xmm1
134#define IN2 %xmm7
135#define IN3 %xmm8
136#define IN4 %xmm9
137#define IN IN1
138#define KEY %xmm2
139#define IV %xmm3
Mathias Krause0d258ef2010-11-27 16:34:46 +0800140
Huang Ying12387a42010-03-10 18:28:55 +0800141#define BSWAP_MASK %xmm10
142#define CTR %xmm11
143#define INC %xmm12
Huang Ying54b6a1b2009-01-18 16:28:34 +1100144
Ard Biesheuvel24811042020-12-31 17:41:55 +0100145#define GF128MUL_MASK %xmm7
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +0300146
Mathias Krause0d258ef2010-11-27 16:34:46 +0800147#ifdef __x86_64__
148#define AREG %rax
Huang Ying54b6a1b2009-01-18 16:28:34 +1100149#define KEYP %rdi
150#define OUTP %rsi
Mathias Krause0d258ef2010-11-27 16:34:46 +0800151#define UKEYP OUTP
Huang Ying54b6a1b2009-01-18 16:28:34 +1100152#define INP %rdx
153#define LEN %rcx
154#define IVP %r8
155#define KLEN %r9d
156#define T1 %r10
157#define TKEYP T1
158#define T2 %r11
Huang Ying12387a42010-03-10 18:28:55 +0800159#define TCTR_LOW T2
Mathias Krause0d258ef2010-11-27 16:34:46 +0800160#else
161#define AREG %eax
162#define KEYP %edi
163#define OUTP AREG
164#define UKEYP OUTP
165#define INP %edx
166#define LEN %esi
167#define IVP %ebp
168#define KLEN %ebx
169#define T1 %ecx
170#define TKEYP T1
171#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100172
# FUNC_SAVE: spill the callee-saved GPRs (%r12, %r13, %r14) that the GCM
# code below clobbers.  Must be paired with FUNC_RESTORE, which pops in
# exactly the reverse order.  No frame is built and %rsp is only moved by
# the three pushes; the arg7..arg11 stack-offset macros above account for
# this via STACK_OFFSET (8*3).
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm
182
183
# FUNC_RESTORE: undo FUNC_SAVE — restore %r14, %r13, %r12 in the reverse
# of the order they were pushed.
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400189
Dave Watson1476db22018-02-14 09:40:10 -0800190# Precompute hashkeys.
191# Input: Hash subkey.
192# Output: HashKeys stored in gcm_context_data. Only needs to be called
193# once per key.
194# clobbers r12, and tmp xmm registers.
# PRECOMPUTE: derive the GHASH key schedule from the raw hash subkey.
# Computes HashKey<<1 mod poly and its powers ^2..^4 (plus the Karatsuba
# "high XOR low" halves of each) and stores them in the gcm_context_data
# structure pointed to by %arg2.  Only needs to run once per key.
# In:      \SUBKEY = pointer to the 16-byte raw hash subkey H = E(K, 0^128)
# Out:     HashKey..HashKey_4 and HashKey_k..HashKey_4_k slots in (%arg2)
# Clobbers: %r12 and the tmp xmm registers passed in (TMP1..TMP7)
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3			# byte-reflect H for GHASH

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
	# (a 128-bit left shift done as two 64-bit shifts + cross-lane carry)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2			# TMP2 = bits shifted out of each qword
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2			# carry from low qword into high qword
	psrldq	$8, \TMP1			# TMP1 = bit shifted out of the top (MSB)
	por	\TMP2, \TMP3

	# reduce HashKey<<1: if the MSB fell out, fold the GCM polynomial back in

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2		# all-ones mask iff carry bit was set
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3			# TMP3 = H<<1 mod poly
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP5, \TMP5			# NOTE(review): see next three lines —
	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1		# swap 64-bit halves
	pxor	\TMP3, \TMP1			# TMP1 = hi64 ^ lo64 (Karatsuba term)
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm
Dave Watson7af964c2018-02-14 09:38:45 -0800246
247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# GCM_INIT: initialize a gcm_context_data struct (pointed to by %arg2) to
# prepare for encoding/decoding: zero the running lengths, save the original
# IV, byte-reflect it into the current counter, precompute the GHASH key
# powers, and hash the AAD.
# In:      \Iv = IV pointer, \SUBKEY = hash subkey pointer,
#          \AAD = AAD pointer, \AADLEN = AAD length in bytes
# Clobbers: rax, r10-r13 and xmm0-xmm6, %xmm13 (see comment above macro)
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)	# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d		# 32-bit xor zeroes all of %r11
	mov	%r11, InLen(%arg2)	# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0		# byte-reflect the IV
	movdqu	%xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm
270
Dave Watsonba458332018-02-14 09:39:10 -0800271# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
272# struct has been initialized by GCM_INIT.
273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
274# Clobbers rax, r10-r13, and xmm0-xmm15
# GCM_ENC_DEC: encrypt/decrypt the bulk data for one update call.  Assumes
# the gcm_context_data struct (%arg2) has been initialized by GCM_INIT.
# Consumes any pending partial block first (PARTIAL_BLOCK), then processes
# 1-3 leading blocks to align to a multiple of 4, then 4 blocks at a time,
# and finally handles a trailing <16-byte block whose state is saved for
# the next update call.
# \operation is 'enc' or 'dec' (selects the .ifc branches below).
# Requires the input be at least 1 byte long (READ_PARTIAL_BLOCK).
# Clobbers rax, r10-r13, and xmm0-xmm15.
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8	# xmm8 = running GHASH state
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)	# total length fed in so far

	xor	%r11d, %r11d		# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks (num_blocks mod 4 of them)

	and	$(3<<4), %r12		# r12 = (byte count / 16) mod 4, scaled by 16
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks, 4 at a time

	test	%r13, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)	# persist GHASH state for next call
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13		# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)	# remember trailing partial-block length

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)	# keystream saved for next call

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	# total input < 16 bytes: read it byte-safely, never past the buffer
	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	# >= 16 bytes total: safe to read the last 16 bytes unaligned and
	# shift the tail into place
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb	%xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2		# keep a ciphertext copy for GHASH
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10 ,%xmm2

	pxor	%xmm2, %xmm8		# fold masked ciphertext into GHASH
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10,%xmm0

	pxor	%xmm0, %xmm8		# fold masked ciphertext into GHASH
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)	# emit remaining bytes one at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
426
Dave Watsonadcadab2018-02-14 09:38:57 -0800427# GCM_COMPLETE Finishes update of tag of last partial block
428# Output: Authorization Tag (AUTH_TAG)
429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# GCM_COMPLETE: finish the GHASH over any pending partial block, fold in
# the length block len(A)||len(C), encrypt Y0 and XOR to produce the
# authentication tag, then copy min(\AUTHTAGLEN, 16) tag bytes out in
# 8/4/2/1-byte pieces.
# In:      \AUTHTAG = output buffer pointer, \AUTHTAGLEN = tag length (bytes)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15.
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	test	%r12, %r12
	je	_partial_done\@

	# a partial block's ciphertext was already XORed into xmm8 by
	# GCM_ENC_DEC/PARTIAL_BLOCK; multiply it through now
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8		# un-reflect the hash for output

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag = E(K, Y0) ^ GHASH
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
499
Mathias Krause559ad0f2010-11-29 08:35:39 +0800500#ifdef __x86_64__
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
502*
503*
504* Input: A and B (128-bits each, bit-reflected)
505* Output: C = A*B*x mod poly, (i.e. >>1 )
506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
508*
509*/
# GHASH_MUL: GF(2^128) multiply \GH = \GH * \HK * x mod (128,127,126,121,0),
# i.e. Data * HashKey mod the GCM polynomial, on bit-reflected operands
# (see the block comment above this macro).  Uses one-level Karatsuba:
# three PCLMULQDQs instead of four, then a two-phase shift-based reduction.
# In:      \GH, \HK (128 bits each, bit-reflected; \HK = HashKey<<1 mod poly)
# Out:     \GH = product; \TMP1-\TMP5 are scratch and clobbered
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2	# swap 64-bit halves of GH
	pshufd	  $78, \HK, \TMP3	# swap 64-bit halves of HK
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = middle Karatsuba term
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the 256-bit product GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
560
Junaid Shahidb20209c2017-12-20 17:08:37 -0800561# Reads DLEN bytes starting at DPTR and stores in XMMDst
562# where 0 < DLEN < 16
563# Clobbers %rax, DLEN and XMM1
# READ_PARTIAL_BLOCK: read \DLEN bytes starting at \DPTR into \XMMDst
# without ever reading past \DPTR + \DLEN (0 < \DLEN < 16).  Reads one
# aligned 8-byte chunk when possible, then accumulates the remaining
# bytes one at a time from the tail, highest address first.
# Clobbers %rax, \DLEN (counted down to 0) and \XMM1 (scratch).
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax		# first 8 bytes in one load
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax		# accumulator for the tail bytes
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al	# walk the tail from its last byte down
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1		# tail goes in the high 8 bytes
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm
591
Dave Watsonc594c542018-02-14 09:39:36 -0800592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
593# clobbers r10-11, xmm14
# CALC_AAD_HASH: GHASH the additional authenticated data (which is never
# encrypted) block by block, with a safe partial read of the final <16-byte
# chunk, and store the result in AadHash(%arg2).
# In:      \HASHKEY = HashKey<<1 mod poly, \AAD = AAD pointer,
#          \AADLEN = AAD length in bytes
# Clobbers r10-r11, xmm14 and the TMP xmm registers passed in.
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6		# TMP6 = running AAD hash

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	test	   %r11, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm
630
Dave Watsonae952c52018-02-14 09:40:19 -0800631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
632# between update calls.
633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
634# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# PARTIAL_BLOCK: handle encryption/decryption and tag accumulation for the
# partial block carried over between update calls.  If PBlockLen(%arg2) is
# non-zero, this consumes up to (16 - PBlockLen) bytes of new input to try
# to complete that block, using the saved keystream in PBlockEncKey, emits
# the corresponding output bytes and updates AadHash/PBlockLen in the
# context.  \DATA_OFFSET is advanced by the number of bytes consumed.
# Requires the input be at least 1 byte long (READ_PARTIAL_BLOCK).
# \operation is 'enc' or 'dec'.
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13.
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	test	%r13, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13	# reload: %r13 clobbered above? NOTE(review):
					# READ_PARTIAL_BLOCK does not touch %r13; this
					# reload looks redundant but is kept as-is

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9	# saved E(K, Yn) keystream
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3		# keep ciphertext copy for the tag
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH	# fold masked ciphertext into the tag

	test	%r10, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)	# block completed: clear partial length
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still partial: extend
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	pshufb	%xmm1, %xmm9
	pshufb	%xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH	# fold masked ciphertext into the tag

	test	%r10, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)	# block completed: clear partial length
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still partial: extend
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	pshufb	%xmm10, %xmm9
	pshufb	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	test	%r10, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
774
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400775/*
776* if a = number of total plaintext bytes
777* b = floor(a/16)
778* num_initial_blocks = b mod 4
779* encrypt the initial num_initial_blocks blocks and apply ghash on
780* the ciphertext
781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
782* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800783* arg1, %arg2, %arg3 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400784*/
785
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400786
Dave Watsone1fd3162018-02-14 09:38:12 -0800787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800788	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
# \i selects how many single initial blocks to process (5 -> 3 blocks,
# 6 -> 2 blocks, 7 -> 1 block); \i_seq lists the xmm register indices used
# for those blocks; \operation is enc or dec.  After the initial blocks the
# macro pre-encrypts the next 4 counter blocks (XMM1-XMM4) when at least
# 64 bytes remain (%r13).  %r11 is the running data offset.
Dave Watson96604742018-02-14 09:39:45 -0800789	MOVADQ	   SHUF_MASK(%rip), %xmm14
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200790
Dave Watsonc594c542018-02-14 09:39:36 -0800791	movdqu AadHash(%arg2), %xmm\i	    # load current GHASH accumulator
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200792
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200793	# start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800794
Dave Watson96604742018-02-14 09:39:45 -0800795	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800796
3c097b82010-12-13 19:51:15 +0800797.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800798
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500799	MOVADQ		ONE(%RIP),\TMP1
e31ac322015-01-13 13:16:43 -0500800	MOVADQ		0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800801.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500802	paddd		\TMP1, \XMM0                 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800803.ifc \operation, dec
e1fd3162018-02-14 09:38:12 -0800804        movdqa     \XMM0, %xmm\index
e1fd3162018-02-14 09:38:12 -0800805.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500806	MOVADQ		\XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800807.endif
Uros Bizjakd7866e52020-07-09 17:08:57 +0200808	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500809	pxor		\TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800810.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500811	lea	0x10(%arg1),%r10		# %r10 -> second round key
e31ac322015-01-13 13:16:43 -0500812	mov	keysize,%eax
e31ac322015-01-13 13:16:43 -0500813	shr	$2,%eax				# 128->4, 192->6, 256->8
e31ac322015-01-13 13:16:43 -0500814	add	$5,%eax			      # 128->9, 192->11, 256->13
e31ac322015-01-13 13:16:43 -0500815
Dave Watsone1fd3162018-02-14 09:38:12 -0800816aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500817	MOVADQ	(%r10),\TMP1
e31ac322015-01-13 13:16:43 -0500818.irpc	index, \i_seq
Uros Bizjakd7866e52020-07-09 17:08:57 +0200819	aesenc	\TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800820.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500821	add	$16,%r10
e31ac322015-01-13 13:16:43 -0500822	sub	$1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800823	jnz	aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500824
e31ac322015-01-13 13:16:43 -0500825	MOVADQ	(%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800826.irpc index, \i_seq
Uros Bizjakd7866e52020-07-09 17:08:57 +0200827	aesenclast \TMP1, %xmm\index         # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800828.endr
3c097b82010-12-13 19:51:15 +0800829.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800830	movdqu	   (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800831	pxor	   \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800832	movdqu	   %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800833	# write back plaintext/ciphertext for num_initial_blocks
3c097b82010-12-13 19:51:15 +0800834	add	   $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800835
e1fd3162018-02-14 09:38:12 -0800836.ifc \operation, dec
e1fd3162018-02-14 09:38:12 -0800837	movdqa     \TMP1, %xmm\index
e1fd3162018-02-14 09:38:12 -0800838.endif
Uros Bizjakd7866e52020-07-09 17:08:57 +0200839	pshufb	   %xmm14, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800840
3c097b82010-12-13 19:51:15 +0800841		# prepare plaintext/ciphertext for GHASH computation
3c097b82010-12-13 19:51:15 +0800842.endr
3c097b82010-12-13 19:51:15 +0800843.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200844
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800845        # apply GHASH on num_initial_blocks blocks
3c097b82010-12-13 19:51:15 +0800846
3c097b82010-12-13 19:51:15 +0800847.if \i == 5
3c097b82010-12-13 19:51:15 +0800848        pxor       %xmm5, %xmm6
3c097b82010-12-13 19:51:15 +0800849	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800850        pxor       %xmm6, %xmm7
3c097b82010-12-13 19:51:15 +0800851	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800852        pxor       %xmm7, %xmm8
3c097b82010-12-13 19:51:15 +0800853	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800854.elseif \i == 6
3c097b82010-12-13 19:51:15 +0800855        pxor       %xmm6, %xmm7
3c097b82010-12-13 19:51:15 +0800856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800857        pxor       %xmm7, %xmm8
3c097b82010-12-13 19:51:15 +0800858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800859.elseif \i == 7
3c097b82010-12-13 19:51:15 +0800860        pxor       %xmm7, %xmm8
3c097b82010-12-13 19:51:15 +0800861	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
3c097b82010-12-13 19:51:15 +0800862.endif
3c097b82010-12-13 19:51:15 +0800863	cmp	   $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800864	jl	_initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800865	# no need for precomputed values
3c097b82010-12-13 19:51:15 +0800866/*
3c097b82010-12-13 19:51:15 +0800867*
3c097b82010-12-13 19:51:15 +0800868* Precomputations for HashKey parallel with encryption of first 4 blocks.
3c097b82010-12-13 19:51:15 +0800869* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
3c097b82010-12-13 19:51:15 +0800870*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500871	MOVADQ	   ONE(%RIP),\TMP1
e31ac322015-01-13 13:16:43 -0500872	paddd	   \TMP1, \XMM0              # INCR Y0
e31ac322015-01-13 13:16:43 -0500873	MOVADQ	   \XMM0, \XMM1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200874	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800875
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500876	paddd	   \TMP1, \XMM0              # INCR Y0
e31ac322015-01-13 13:16:43 -0500877	MOVADQ	   \XMM0, \XMM2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200878	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800879
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500880	paddd	   \TMP1, \XMM0              # INCR Y0
e31ac322015-01-13 13:16:43 -0500881	MOVADQ	   \XMM0, \XMM3
Uros Bizjakd7866e52020-07-09 17:08:57 +0200882	pshufb %xmm14, \XMM3        # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800883
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500884	paddd	   \TMP1, \XMM0              # INCR Y0
e31ac322015-01-13 13:16:43 -0500885	MOVADQ	   \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +0200886	pshufb %xmm14, \XMM4        # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800887
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500888	MOVADQ	   0(%arg1),\TMP1
e31ac322015-01-13 13:16:43 -0500889	pxor	   \TMP1, \XMM1
e31ac322015-01-13 13:16:43 -0500890	pxor	   \TMP1, \XMM2
e31ac322015-01-13 13:16:43 -0500891	pxor	   \TMP1, \XMM3
e31ac322015-01-13 13:16:43 -0500892	pxor	   \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800893.irpc index, 1234 # do 4 rounds
3c097b82010-12-13 19:51:15 +0800894	movaps 0x10*\index(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200895	aesenc	   \TMP1, \XMM1
d7866e52020-07-09 17:08:57 +0200896	aesenc	   \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +0200897	aesenc	   \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +0200898	aesenc	   \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800899.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800900.irpc index, 56789 # do next 5 rounds
3c097b82010-12-13 19:51:15 +0800901	movaps 0x10*\index(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +0200902	aesenc	   \TMP1, \XMM1
d7866e52020-07-09 17:08:57 +0200903	aesenc	   \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +0200904	aesenc	   \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +0200905	aesenc	   \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800906.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500907	lea	   0xa0(%arg1),%r10
e31ac322015-01-13 13:16:43 -0500908	mov	   keysize,%eax
e31ac322015-01-13 13:16:43 -0500909	shr	   $2,%eax			# 128->4, 192->6, 256->8
e31ac322015-01-13 13:16:43 -0500910	sub	   $4,%eax			# 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800911	jz	   aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500912
Dave Watsone1fd3162018-02-14 09:38:12 -0800913aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500914	MOVADQ	   (%r10),\TMP2
e31ac322015-01-13 13:16:43 -0500915.irpc	index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +0200916	aesenc	   \TMP2, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500917.endr
e31ac322015-01-13 13:16:43 -0500918	add	   $16,%r10
e31ac322015-01-13 13:16:43 -0500919	sub	   $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800920	jnz	   aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500921
Dave Watsone1fd3162018-02-14 09:38:12 -0800922aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500923	MOVADQ	   (%r10), \TMP2
Uros Bizjakd7866e52020-07-09 17:08:57 +0200924	aesenclast \TMP2, \XMM1
d7866e52020-07-09 17:08:57 +0200925	aesenclast \TMP2, \XMM2
d7866e52020-07-09 17:08:57 +0200926	aesenclast \TMP2, \XMM3
d7866e52020-07-09 17:08:57 +0200927	aesenclast \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800928	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800929	pxor	   \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800930.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800931	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800932	movdqa     \TMP1, \XMM1
e1fd3162018-02-14 09:38:12 -0800933.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800934	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800935	pxor	   \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800936.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800937	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800938	movdqa     \TMP1, \XMM2
e1fd3162018-02-14 09:38:12 -0800939.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800940	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800941	pxor	   \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800942.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800943	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800944	movdqa     \TMP1, \XMM3
e1fd3162018-02-14 09:38:12 -0800945.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800946	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800947	pxor	   \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800948.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800949	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800950	movdqa     \TMP1, \XMM4
e1fd3162018-02-14 09:38:12 -0800951.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800952	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
9ee4a5d2018-02-14 09:39:23 -0800953	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
9ee4a5d2018-02-14 09:39:23 -0800954	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
9ee4a5d2018-02-14 09:39:23 -0800955	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800956.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800957
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400958	add	   $64, %r11
Uros Bizjakd7866e52020-07-09 17:08:57 +0200959	pshufb %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400960	pxor	   \XMMDst, \XMM1
0bd82f52010-11-04 15:00:45 -0400961# combine GHASHed value with the corresponding ciphertext
Uros Bizjakd7866e52020-07-09 17:08:57 +0200962	pshufb %xmm14, \XMM2 # perform a 16 byte swap
d7866e52020-07-09 17:08:57 +0200963	pshufb %xmm14, \XMM3 # perform a 16 byte swap
d7866e52020-07-09 17:08:57 +0200964	pshufb %xmm14, \XMM4 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800965
Dave Watsone1fd3162018-02-14 09:38:12 -0800966_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800967
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400968.endm
969
970/*
971* encrypt 4 blocks at a time
972* ghash the 4 previously encrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800973* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400974* %r11 is the data offset value
975*/
Sedat Dilek3347c8a2020-07-03 16:32:06 +0200976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Encrypts 4 counter blocks (XMM1-XMM4) while GHASHing the 4 previously
# encrypted blocks (saved into XMM5-XMM8 below), interleaving pclmulqdq
# work between AES rounds to hide latency.  \XMM0 is the running counter.
0bd82f52010-11-04 15:00:45 -0400978
0bd82f52010-11-04 15:00:45 -0400979	movdqa	  \XMM1, \XMM5
0bd82f52010-11-04 15:00:45 -0400980	movdqa	  \XMM2, \XMM6
0bd82f52010-11-04 15:00:45 -0400981	movdqa	  \XMM3, \XMM7
0bd82f52010-11-04 15:00:45 -0400982	movdqa	  \XMM4, \XMM8
0bd82f52010-11-04 15:00:45 -0400983
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800984        movdqa    SHUF_MASK(%rip), %xmm15
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400985	# multiply TMP5 * HashKey using karatsuba
0bd82f52010-11-04 15:00:45 -0400986
0bd82f52010-11-04 15:00:45 -0400987	movdqa	  \XMM5, \TMP4
0bd82f52010-11-04 15:00:45 -0400988	pshufd	  $78, \XMM5, \TMP6
0bd82f52010-11-04 15:00:45 -0400989	pxor	  \XMM5, \TMP6
0bd82f52010-11-04 15:00:45 -0400990	paddd     ONE(%rip), \XMM0		# INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -0700991	movdqu	  HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +0200992	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400993	movdqa    \XMM0, \XMM1
0bd82f52010-11-04 15:00:45 -0400994	paddd     ONE(%rip), \XMM0		# INCR CNT
0bd82f52010-11-04 15:00:45 -0400995	movdqa    \XMM0, \XMM2
0bd82f52010-11-04 15:00:45 -0400996	paddd     ONE(%rip), \XMM0		# INCR CNT
0bd82f52010-11-04 15:00:45 -0400997	movdqa    \XMM0, \XMM3
0bd82f52010-11-04 15:00:45 -0400998	paddd     ONE(%rip), \XMM0		# INCR CNT
0bd82f52010-11-04 15:00:45 -0400999	movdqa    \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001000	pshufb %xmm15, \XMM1	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001001	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
d7866e52020-07-09 17:08:57 +02001002	pshufb %xmm15, \XMM2	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001003	pshufb %xmm15, \XMM3	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001004	pshufb %xmm15, \XMM4	# perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001005
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001006	pxor	  (%arg1), \XMM1
0bd82f52010-11-04 15:00:45 -04001007	pxor	  (%arg1), \XMM2
0bd82f52010-11-04 15:00:45 -04001008	pxor	  (%arg1), \XMM3
0bd82f52010-11-04 15:00:45 -04001009	pxor	  (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001010	movdqu	  HashKey_4_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001011	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001012	movaps 0x10(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001013	aesenc	  \TMP1, \XMM1              # Round 1
d7866e52020-07-09 17:08:57 +02001014	aesenc	  \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +02001015	aesenc	  \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +02001016	aesenc	  \TMP1, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001017	movaps 0x20(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001018	aesenc	  \TMP1, \XMM1              # Round 2
d7866e52020-07-09 17:08:57 +02001019	aesenc	  \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +02001020	aesenc	  \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +02001021	aesenc	  \TMP1, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001022	movdqa	  \XMM6, \TMP1
0bd82f52010-11-04 15:00:45 -04001023	pshufd	  $78, \XMM6, \TMP2
0bd82f52010-11-04 15:00:45 -04001024	pxor	  \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001025	movdqu	  HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001026	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001027	movaps 0x30(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001028	aesenc    \TMP3, \XMM1              # Round 3
d7866e52020-07-09 17:08:57 +02001029	aesenc    \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001030	aesenc    \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001031	aesenc    \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001032	pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001033	movaps 0x40(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001034	aesenc	  \TMP3, \XMM1              # Round 4
d7866e52020-07-09 17:08:57 +02001035	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001036	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001037	aesenc	  \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001038	movdqu	  HashKey_3_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001039	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001040	movaps 0x50(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001041	aesenc	  \TMP3, \XMM1              # Round 5
d7866e52020-07-09 17:08:57 +02001042	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001043	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001044	aesenc	  \TMP3, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001045	pxor	  \TMP1, \TMP4
0bd82f52010-11-04 15:00:45 -04001046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
0bd82f52010-11-04 15:00:45 -04001047	pxor	  \XMM6, \XMM5
0bd82f52010-11-04 15:00:45 -04001048	pxor	  \TMP2, \TMP6
0bd82f52010-11-04 15:00:45 -04001049	movdqa	  \XMM7, \TMP1
0bd82f52010-11-04 15:00:45 -04001050	pshufd	  $78, \XMM7, \TMP2
0bd82f52010-11-04 15:00:45 -04001051	pxor	  \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001052	movdqu	  HashKey_2(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001053
0bd82f52010-11-04 15:00:45 -04001054        # Multiply TMP5 * HashKey using karatsuba
0bd82f52010-11-04 15:00:45 -04001055
Uros Bizjakd7866e52020-07-09 17:08:57 +02001056	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001057	movaps 0x60(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001058	aesenc	  \TMP3, \XMM1              # Round 6
d7866e52020-07-09 17:08:57 +02001059	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001060	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001061	aesenc	  \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001062	pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001063	movaps 0x70(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001064	aesenc	  \TMP3, \XMM1              # Round 7
d7866e52020-07-09 17:08:57 +02001065	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001066	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001067	aesenc	  \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001068	movdqu	  HashKey_2_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001069	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001070	movaps 0x80(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001071	aesenc	  \TMP3, \XMM1             # Round 8
d7866e52020-07-09 17:08:57 +02001072	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001073	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001074	aesenc	  \TMP3, \XMM4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001075	pxor	  \TMP1, \TMP4
0bd82f52010-11-04 15:00:45 -04001076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
0bd82f52010-11-04 15:00:45 -04001077	pxor	  \XMM7, \XMM5
0bd82f52010-11-04 15:00:45 -04001078	pxor	  \TMP2, \TMP6
0bd82f52010-11-04 15:00:45 -04001079
0bd82f52010-11-04 15:00:45 -04001080	# Multiply XMM8 * HashKey
0bd82f52010-11-04 15:00:45 -04001081	# XMM8 and TMP5 hold the values for the two operands
0bd82f52010-11-04 15:00:45 -04001082
0bd82f52010-11-04 15:00:45 -04001083	movdqa	  \XMM8, \TMP1
0bd82f52010-11-04 15:00:45 -04001084	pshufd	  $78, \XMM8, \TMP2
0bd82f52010-11-04 15:00:45 -04001085	pxor	  \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001086	movdqu	  HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001087	pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001088	movaps 0x90(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001089	aesenc	  \TMP3, \XMM1            # Round 9
d7866e52020-07-09 17:08:57 +02001090	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001091	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001092	aesenc	  \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001093	pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001094	lea	  0xa0(%arg1),%r10
e31ac322015-01-13 13:16:43 -05001095	mov	  keysize,%eax
e31ac322015-01-13 13:16:43 -05001096	shr	  $2,%eax			# 128->4, 192->6, 256->8
e31ac322015-01-13 13:16:43 -05001097	sub	  $4,%eax			# 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001098	jz	  aes_loop_par_enc_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001099
Dave Watsonfb8986e2018-02-14 09:40:47 -08001100aes_loop_par_enc\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001101	MOVADQ	  (%r10),\TMP3
e31ac322015-01-13 13:16:43 -05001102.irpc	index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +02001103	aesenc	  \TMP3, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001104.endr
e31ac322015-01-13 13:16:43 -05001105	add	  $16,%r10
e31ac322015-01-13 13:16:43 -05001106	sub	  $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001107	jnz	  aes_loop_par_enc\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001108
Dave Watsonfb8986e2018-02-14 09:40:47 -08001109aes_loop_par_enc_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001110	MOVADQ	  (%r10), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001111	aesenclast \TMP3, \XMM1           # Round 10
d7866e52020-07-09 17:08:57 +02001112	aesenclast \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001113	aesenclast \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001114	aesenclast \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001115	movdqu    HashKey_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001116	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001117	movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001118	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001119	movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001120	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001121	movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001122	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001123	movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001124	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
9ee4a5d2018-02-14 09:39:23 -08001126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
9ee4a5d2018-02-14 09:39:23 -08001127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
9ee4a5d2018-02-14 09:39:23 -08001128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
Uros Bizjakd7866e52020-07-09 17:08:57 +02001129	pshufb %xmm15, \XMM1        # perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001130	pshufb %xmm15, \XMM2	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001131	pshufb %xmm15, \XMM3	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001132	pshufb %xmm15, \XMM4	# perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001133
3c097b82010-12-13 19:51:15 +08001134	pxor	  \TMP4, \TMP1
3c097b82010-12-13 19:51:15 +08001135	pxor	  \XMM8, \XMM5
3c097b82010-12-13 19:51:15 +08001136	pxor	  \TMP6, \TMP2
3c097b82010-12-13 19:51:15 +08001137	pxor	  \TMP1, \TMP2
3c097b82010-12-13 19:51:15 +08001138	pxor	  \XMM5, \TMP2
3c097b82010-12-13 19:51:15 +08001139	movdqa	  \TMP2, \TMP3
3c097b82010-12-13 19:51:15 +08001140	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
3c097b82010-12-13 19:51:15 +08001141	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
3c097b82010-12-13 19:51:15 +08001142	pxor	  \TMP3, \XMM5
3c097b82010-12-13 19:51:15 +08001143	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
3c097b82010-12-13 19:51:15 +08001144
3c097b82010-12-13 19:51:15 +08001145        # first phase of reduction
3c097b82010-12-13 19:51:15 +08001146
3c097b82010-12-13 19:51:15 +08001147	movdqa    \XMM5, \TMP2
3c097b82010-12-13 19:51:15 +08001148	movdqa    \XMM5, \TMP3
3c097b82010-12-13 19:51:15 +08001149	movdqa    \XMM5, \TMP4
3c097b82010-12-13 19:51:15 +08001150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
3c097b82010-12-13 19:51:15 +08001151	pslld     $31, \TMP2                   # packed left shift << 31
3c097b82010-12-13 19:51:15 +08001152	pslld     $30, \TMP3                   # packed left shift << 30
3c097b82010-12-13 19:51:15 +08001153	pslld     $25, \TMP4                   # packed left shift << 25
3c097b82010-12-13 19:51:15 +08001154	pxor      \TMP3, \TMP2	               # xor the shifted versions
3c097b82010-12-13 19:51:15 +08001155	pxor      \TMP4, \TMP2
3c097b82010-12-13 19:51:15 +08001156	movdqa    \TMP2, \TMP5
3c097b82010-12-13 19:51:15 +08001157	psrldq    $4, \TMP5                    # right shift T5 1 DW
3c097b82010-12-13 19:51:15 +08001158	pslldq    $12, \TMP2                   # left shift T2 3 DWs
3c097b82010-12-13 19:51:15 +08001159	pxor      \TMP2, \XMM5
3c097b82010-12-13 19:51:15 +08001160
3c097b82010-12-13 19:51:15 +08001161        # second phase of reduction
3c097b82010-12-13 19:51:15 +08001162
3c097b82010-12-13 19:51:15 +08001163	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
3c097b82010-12-13 19:51:15 +08001164	movdqa    \XMM5,\TMP3
3c097b82010-12-13 19:51:15 +08001165	movdqa    \XMM5,\TMP4
3c097b82010-12-13 19:51:15 +08001166	psrld     $1, \TMP2                    # packed right shift >>1
3c097b82010-12-13 19:51:15 +08001167	psrld     $2, \TMP3                    # packed right shift >>2
3c097b82010-12-13 19:51:15 +08001168	psrld     $7, \TMP4                    # packed right shift >>7
3c097b82010-12-13 19:51:15 +08001169	pxor      \TMP3,\TMP2		       # xor the shifted versions
3c097b82010-12-13 19:51:15 +08001170	pxor      \TMP4,\TMP2
3c097b82010-12-13 19:51:15 +08001171	pxor      \TMP5, \TMP2
3c097b82010-12-13 19:51:15 +08001172	pxor      \TMP2, \XMM5
3c097b82010-12-13 19:51:15 +08001173	pxor      \TMP1, \XMM5                 # result is in XMM5
3c097b82010-12-13 19:51:15 +08001174
3c097b82010-12-13 19:51:15 +08001175	pxor	  \XMM5, \XMM1
3c097b82010-12-13 19:51:15 +08001176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001181* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001182* %r11 is the data offset value
1183*/
Sedat Dilek3347c8a2020-07-03 16:32:06 +02001184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Decrypt variant of GHASH_4_ENCRYPT_4_PARALLEL_enc: encrypts 4 counter
# blocks and XORs with ciphertext to produce plaintext, while GHASHing the
# 4 previously decrypted ciphertext blocks (saved into XMM5-XMM8).  Unlike
# the enc variant, the original ciphertext is kept in XMM1-XMM4 afterwards
# for the next GHASH round.
3c097b82010-12-13 19:51:15 +08001186
3c097b82010-12-13 19:51:15 +08001187	movdqa	  \XMM1, \XMM5
3c097b82010-12-13 19:51:15 +08001188	movdqa	  \XMM2, \XMM6
3c097b82010-12-13 19:51:15 +08001189	movdqa	  \XMM3, \XMM7
3c097b82010-12-13 19:51:15 +08001190	movdqa	  \XMM4, \XMM8
3c097b82010-12-13 19:51:15 +08001191
3c097b82010-12-13 19:51:15 +08001192        movdqa    SHUF_MASK(%rip), %xmm15
3c097b82010-12-13 19:51:15 +08001193	# multiply TMP5 * HashKey using karatsuba
3c097b82010-12-13 19:51:15 +08001194
3c097b82010-12-13 19:51:15 +08001195	movdqa	  \XMM5, \TMP4
3c097b82010-12-13 19:51:15 +08001196	pshufd	  $78, \XMM5, \TMP6
3c097b82010-12-13 19:51:15 +08001197	pxor	  \XMM5, \TMP6
3c097b82010-12-13 19:51:15 +08001198	paddd     ONE(%rip), \XMM0		# INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -07001199	movdqu	  HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001200	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001201	movdqa    \XMM0, \XMM1
3c097b82010-12-13 19:51:15 +08001202	paddd     ONE(%rip), \XMM0		# INCR CNT
3c097b82010-12-13 19:51:15 +08001203	movdqa    \XMM0, \XMM2
3c097b82010-12-13 19:51:15 +08001204	paddd     ONE(%rip), \XMM0		# INCR CNT
3c097b82010-12-13 19:51:15 +08001205	movdqa    \XMM0, \XMM3
3c097b82010-12-13 19:51:15 +08001206	paddd     ONE(%rip), \XMM0		# INCR CNT
3c097b82010-12-13 19:51:15 +08001207	movdqa    \XMM0, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001208	pshufb %xmm15, \XMM1	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001209	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
d7866e52020-07-09 17:08:57 +02001210	pshufb %xmm15, \XMM2	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001211	pshufb %xmm15, \XMM3	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001212	pshufb %xmm15, \XMM4	# perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001213
3c097b82010-12-13 19:51:15 +08001214	pxor	  (%arg1), \XMM1
3c097b82010-12-13 19:51:15 +08001215	pxor	  (%arg1), \XMM2
3c097b82010-12-13 19:51:15 +08001216	pxor	  (%arg1), \XMM3
3c097b82010-12-13 19:51:15 +08001217	pxor	  (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001218	movdqu	  HashKey_4_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001219	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001220	movaps 0x10(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001221	aesenc	  \TMP1, \XMM1              # Round 1
d7866e52020-07-09 17:08:57 +02001222	aesenc	  \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +02001223	aesenc	  \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +02001224	aesenc	  \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001225	movaps 0x20(%arg1), \TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001226	aesenc	  \TMP1, \XMM1              # Round 2
d7866e52020-07-09 17:08:57 +02001227	aesenc	  \TMP1, \XMM2
d7866e52020-07-09 17:08:57 +02001228	aesenc	  \TMP1, \XMM3
d7866e52020-07-09 17:08:57 +02001229	aesenc	  \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001230	movdqa	  \XMM6, \TMP1
3c097b82010-12-13 19:51:15 +08001231	pshufd	  $78, \XMM6, \TMP2
3c097b82010-12-13 19:51:15 +08001232	pxor	  \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001233	movdqu	  HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001234	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001235	movaps 0x30(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001236	aesenc    \TMP3, \XMM1              # Round 3
d7866e52020-07-09 17:08:57 +02001237	aesenc    \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001238	aesenc    \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001239	aesenc    \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001240	pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001241	movaps 0x40(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001242	aesenc	  \TMP3, \XMM1              # Round 4
d7866e52020-07-09 17:08:57 +02001243	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001244	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001245	aesenc	  \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001246	movdqu	  HashKey_3_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001247	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001248	movaps 0x50(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001249	aesenc	  \TMP3, \XMM1              # Round 5
d7866e52020-07-09 17:08:57 +02001250	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001251	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001252	aesenc	  \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001253	pxor	  \TMP1, \TMP4
3c097b82010-12-13 19:51:15 +08001254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
3c097b82010-12-13 19:51:15 +08001255	pxor	  \XMM6, \XMM5
3c097b82010-12-13 19:51:15 +08001256	pxor	  \TMP2, \TMP6
3c097b82010-12-13 19:51:15 +08001257	movdqa	  \XMM7, \TMP1
3c097b82010-12-13 19:51:15 +08001258	pshufd	  $78, \XMM7, \TMP2
3c097b82010-12-13 19:51:15 +08001259	pxor	  \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001260	movdqu	  HashKey_2(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001261
3c097b82010-12-13 19:51:15 +08001262        # Multiply TMP5 * HashKey using karatsuba
3c097b82010-12-13 19:51:15 +08001263
Uros Bizjakd7866e52020-07-09 17:08:57 +02001264	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001265	movaps 0x60(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001266	aesenc	  \TMP3, \XMM1              # Round 6
d7866e52020-07-09 17:08:57 +02001267	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001268	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001269	aesenc	  \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001270	pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001271	movaps 0x70(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001272	aesenc	  \TMP3, \XMM1              # Round 7
d7866e52020-07-09 17:08:57 +02001273	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001274	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001275	aesenc	  \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001276	movdqu	  HashKey_2_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001277	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001278	movaps 0x80(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001279	aesenc	  \TMP3, \XMM1             # Round 8
d7866e52020-07-09 17:08:57 +02001280	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001281	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001282	aesenc	  \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001283	pxor	  \TMP1, \TMP4
3c097b82010-12-13 19:51:15 +08001284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
3c097b82010-12-13 19:51:15 +08001285	pxor	  \XMM7, \XMM5
3c097b82010-12-13 19:51:15 +08001286	pxor	  \TMP2, \TMP6
3c097b82010-12-13 19:51:15 +08001287
3c097b82010-12-13 19:51:15 +08001288	# Multiply XMM8 * HashKey
3c097b82010-12-13 19:51:15 +08001289	# XMM8 and TMP5 hold the values for the two operands
3c097b82010-12-13 19:51:15 +08001290
3c097b82010-12-13 19:51:15 +08001291	movdqa	  \XMM8, \TMP1
3c097b82010-12-13 19:51:15 +08001292	pshufd	  $78, \XMM8, \TMP2
3c097b82010-12-13 19:51:15 +08001293	pxor	  \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001294	movdqu	  HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001295	pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001296	movaps 0x90(%arg1), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001297	aesenc	  \TMP3, \XMM1            # Round 9
d7866e52020-07-09 17:08:57 +02001298	aesenc	  \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001299	aesenc	  \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001300	aesenc	  \TMP3, \XMM4
d7866e52020-07-09 17:08:57 +02001301	pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001302	lea	  0xa0(%arg1),%r10
e31ac322015-01-13 13:16:43 -05001303	mov	  keysize,%eax
e31ac322015-01-13 13:16:43 -05001304	shr	  $2,%eax		        # 128->4, 192->6, 256->8
e31ac322015-01-13 13:16:43 -05001305	sub	  $4,%eax			# 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001306	jz	  aes_loop_par_dec_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001307
Dave Watsonfb8986e2018-02-14 09:40:47 -08001308aes_loop_par_dec\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001309	MOVADQ	  (%r10),\TMP3
e31ac322015-01-13 13:16:43 -05001310.irpc	index, 1234
Uros Bizjakd7866e52020-07-09 17:08:57 +02001311	aesenc	  \TMP3, %xmm\index
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001312.endr
e31ac322015-01-13 13:16:43 -05001313	add	  $16,%r10
e31ac322015-01-13 13:16:43 -05001314	sub	  $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001315	jnz	  aes_loop_par_dec\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001316
Dave Watsonfb8986e2018-02-14 09:40:47 -08001317aes_loop_par_dec_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001318	MOVADQ	  (%r10), \TMP3
Uros Bizjakd7866e52020-07-09 17:08:57 +02001319	aesenclast \TMP3, \XMM1           # last round
d7866e52020-07-09 17:08:57 +02001320	aesenclast \TMP3, \XMM2
d7866e52020-07-09 17:08:57 +02001321	aesenclast \TMP3, \XMM3
d7866e52020-07-09 17:08:57 +02001322	aesenclast \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001323	movdqu    HashKey_k(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001324	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001325	movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001326	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001327	movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001328	movdqa    \TMP3, \XMM1
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001329	movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001330	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001331	movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001332	movdqa    \TMP3, \XMM2
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001333	movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001334	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001335	movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001336	movdqa    \TMP3, \XMM3
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001337	movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001338	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001339	movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001340	movdqa    \TMP3, \XMM4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001341	pshufb %xmm15, \XMM1        # perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001342	pshufb %xmm15, \XMM2	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001343	pshufb %xmm15, \XMM3	# perform a 16 byte swap
d7866e52020-07-09 17:08:57 +02001344	pshufb %xmm15, \XMM4	# perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001345
0bd82f52010-11-04 15:00:45 -04001346	pxor	  \TMP4, \TMP1
0bd82f52010-11-04 15:00:45 -04001347	pxor	  \XMM8, \XMM5
0bd82f52010-11-04 15:00:45 -04001348	pxor	  \TMP6, \TMP2
0bd82f52010-11-04 15:00:45 -04001349	pxor	  \TMP1, \TMP2
0bd82f52010-11-04 15:00:45 -04001350	pxor	  \XMM5, \TMP2
0bd82f52010-11-04 15:00:45 -04001351	movdqa	  \TMP2, \TMP3
0bd82f52010-11-04 15:00:45 -04001352	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
0bd82f52010-11-04 15:00:45 -04001353	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
0bd82f52010-11-04 15:00:45 -04001354	pxor	  \TMP3, \XMM5
0bd82f52010-11-04 15:00:45 -04001355	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
0bd82f52010-11-04 15:00:45 -04001356
0bd82f52010-11-04 15:00:45 -04001357        # first phase of reduction
0bd82f52010-11-04 15:00:45 -04001358
0bd82f52010-11-04 15:00:45 -04001359	movdqa    \XMM5, \TMP2
0bd82f52010-11-04 15:00:45 -04001360	movdqa    \XMM5, \TMP3
0bd82f52010-11-04 15:00:45 -04001361	movdqa    \XMM5, \TMP4
0bd82f52010-11-04 15:00:45 -04001362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
0bd82f52010-11-04 15:00:45 -04001363	pslld     $31, \TMP2                   # packed left shift << 31
0bd82f52010-11-04 15:00:45 -04001364	pslld     $30, \TMP3                   # packed left shift << 30
0bd82f52010-11-04 15:00:45 -04001365	pslld     $25, \TMP4                   # packed left shift << 25
0bd82f52010-11-04 15:00:45 -04001366	pxor      \TMP3, \TMP2	               # xor the shifted versions
0bd82f52010-11-04 15:00:45 -04001367	pxor      \TMP4, \TMP2
0bd82f52010-11-04 15:00:45 -04001368	movdqa    \TMP2, \TMP5
0bd82f52010-11-04 15:00:45 -04001369	psrldq    $4, \TMP5                    # right shift T5 1 DW
0bd82f52010-11-04 15:00:45 -04001370	pslldq    $12, \TMP2                   # left shift T2 3 DWs
0bd82f52010-11-04 15:00:45 -04001371	pxor      \TMP2, \XMM5
0bd82f52010-11-04 15:00:45 -04001372
0bd82f52010-11-04 15:00:45 -04001373        # second phase of reduction
0bd82f52010-11-04 15:00:45 -04001374
0bd82f52010-11-04 15:00:45 -04001375	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
0bd82f52010-11-04 15:00:45 -04001376	movdqa    \XMM5,\TMP3
0bd82f52010-11-04 15:00:45 -04001377	movdqa    \XMM5,\TMP4
0bd82f52010-11-04 15:00:45 -04001378	psrld     $1, \TMP2                    # packed right shift >>1
0bd82f52010-11-04 15:00:45 -04001379	psrld     $2, \TMP3                    # packed right shift >>2
0bd82f52010-11-04 15:00:45 -04001380	psrld     $7, \TMP4                    # packed right shift >>7
0bd82f52010-11-04 15:00:45 -04001381	pxor      \TMP3,\TMP2		       # xor the shifted versions
0bd82f52010-11-04 15:00:45 -04001382	pxor      \TMP4,\TMP2
0bd82f52010-11-04 15:00:45 -04001383	pxor      \TMP5, \TMP2
0bd82f52010-11-04 15:00:45 -04001384	pxor      \TMP2, \XMM5
0bd82f52010-11-04 15:00:45 -04001385	pxor      \TMP1, \XMM5                 # result is in XMM5
0bd82f52010-11-04 15:00:45 -04001386
0bd82f52010-11-04 15:00:45 -04001387	pxor	  \XMM5, \XMM1
0bd82f52010-11-04 15:00:45 -04001388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
	# Each block XMM1..XMM4 is carry-less multiplied (Karatsuba) by
	# HashKey_4, HashKey_3, HashKey_2, HashKey respectively (precomputed
	# in the gcm_context_data at %arg2).  High products accumulate in
	# TMP6, low products in XMMDst, middle (a1+a0)*(b1+b0) terms in XMM1;
	# the three are then recombined and reduced mod the GCM polynomial,
	# leaving the hash state in XMMDst.
1394 # Multiply TMP6 * HashKey (using Karatsuba)
1395
1396 movdqa \XMM1, \TMP6
1397 pshufd $78, \XMM1, \TMP2
1398 pxor \XMM1, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001399 movdqu HashKey_4(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001402 movdqu HashKey_4_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001404 movdqa \XMM1, \XMMDst
1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1406
	# Multiply XMM2 * HashKey_3 (TMP1 holds the copy of XMM2)
1407 # Multiply TMP1 * HashKey (using Karatsuba)
1408
1409 movdqa \XMM2, \TMP1
1410 pshufd $78, \XMM2, \TMP2
1411 pxor \XMM2, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001412 movdqu HashKey_3(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001415 movdqu HashKey_3_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001417 pxor \TMP1, \TMP6
1418 pxor \XMM2, \XMMDst
1419 pxor \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
	# Multiply XMM3 * HashKey_2 (TMP1 holds the copy of XMM3)
1422 # Multiply TMP1 * HashKey (using Karatsuba)
1423
1424 movdqa \XMM3, \TMP1
1425 pshufd $78, \XMM3, \TMP2
1426 pxor \XMM3, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001427 movdqu HashKey_2(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001430 movdqu HashKey_2_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001432 pxor \TMP1, \TMP6
1433 pxor \XMM3, \XMMDst
1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1435
	# Multiply XMM4 * HashKey (TMP1 holds the copy of XMM4)
1436 # Multiply TMP1 * HashKey (using Karatsuba)
1437 movdqa \XMM4, \TMP1
1438 pshufd $78, \XMM4, \TMP2
1439 pxor \XMM4, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001440 movdqu HashKey(%arg2), \TMP5
Uros Bizjakd7866e52020-07-09 17:08:57 +02001441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001443 movdqu HashKey_k(%arg2), \TMP4
Uros Bizjakd7866e52020-07-09 17:08:57 +02001444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001445 pxor \TMP1, \TMP6
1446 pxor \XMM4, \XMMDst
	# Karatsuba middle term: TMP2 = mid - hi - lo (XOR = add/sub in GF(2))
1447 pxor \XMM1, \TMP2
1448 pxor \TMP6, \TMP2
1449 pxor \XMMDst, \TMP2
1450 # middle section of the temp results combined as in karatsuba algorithm
1451 movdqa \TMP2, \TMP4
1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1454 pxor \TMP4, \XMMDst
1455 pxor \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# Reduce the 256-bit product mod x^128 + x^127 + x^126 + x^121 + 1.
1457 # first phase of the reduction
1458 movdqa \XMMDst, \TMP2
1459 movdqa \XMMDst, \TMP3
1460 movdqa \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462 pslld $31, \TMP2 # packed left shifting << 31
1463 pslld $30, \TMP3 # packed left shifting << 30
1464 pslld $25, \TMP4 # packed left shifting << 25
1465 pxor \TMP3, \TMP2 # xor the shifted versions
1466 pxor \TMP4, \TMP2
1467 movdqa \TMP2, \TMP7
1468 psrldq $4, \TMP7 # right shift TMP7 1 DW
1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1470 pxor \TMP2, \XMMDst
1471
1472 # second phase of the reduction
1473 movdqa \XMMDst, \TMP2
1474 # make 3 copies of XMMDst for doing 3 shift operations
1475 movdqa \XMMDst, \TMP3
1476 movdqa \XMMDst, \TMP4
1477 psrld $1, \TMP2 # packed right shifting >> 1
1478 psrld $2, \TMP3 # packed right shifting >> 2
1479 psrld $7, \TMP4 # packed right shifting >> 7
1480 pxor \TMP3, \TMP2 # xor the shifted versions
1481 pxor \TMP4, \TMP2
1482 pxor \TMP7, \TMP2
1483 pxor \TMP2, \XMMDst
1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1485.endm
1486
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001487
1488/* Encryption of a single block
1489* uses eax & r10
1490*/
1491
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
	# AES-encrypt the block in XMM0 in place using the expanded key
	# schedule at %arg1.  The loop count is derived from the key length
	# stored in the schedule (the "keysize" offset, defined earlier in
	# this file): 9, 11 or 13 full rounds, then one aesenclast.
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001494 pxor (%arg1), \XMM0
1495 mov keysize,%eax
1496 shr $2,%eax # 128->4, 192->6, 256->8
1497 add $5,%eax # 128->9, 192->11, 256->13
1498 lea 16(%arg1), %r10 # get first expanded key address
1499
1500_esb_loop_\@:
1501 MOVADQ (%r10),\TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001502 aesenc \TMP1,\XMM0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001503 add $16,%r10
1504 sub $1,%eax
1505 jnz _esb_loop_\@
1506
	# final round uses the last round key and aesenclast
1507 MOVADQ (%r10),\TMP1
Uros Bizjakd7866e52020-07-09 17:08:57 +02001508 aesenclast \TMP1,\XMM0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001509.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001512* struct gcm_context_data *data
1513* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001514* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1515* const u8 *in, // Ciphertext input
1516* u64 plaintext_len, // Length of data in bytes for decryption.
1517* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1518* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519* // concatenated with 0x00000001. 16-byte aligned pointer.
1520* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521* const u8 *aad, // Additional Authentication Data (AAD)
1522* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1524* // given authentication tag and only return the plaintext if they match.
1525* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526* // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531* keys are pre-expanded and aligned to 16 bytes. we are using the first
1532* set of 11 keys in the data structure void *aes_ctx
1533*
1534* iv:
1535* 0 1 2 3
1536* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538* | Salt (From the SA) |
1539* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540* | Initialization Vector |
1541* | (This is the sequence number from IPSec header) |
1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543* | 0x1 |
1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549* AAD padded to 128 bits with 0
1550* for example, assume AAD is a u32 vector
1551*
1552* if AAD is 8 bytes:
1553* AAD[3] = {A0, A1};
1554* padded AAD in xmm register = {A1 A0 0 0}
1555*
1556* 0 1 2 3
1557* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559* | SPI (A1) |
1560* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561* | 32-bit Sequence Number (A0) |
1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563* | 0x0 |
1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566* AAD Format with 32-bit Sequence Number
1567*
1568* if AAD is 12 bytes:
1569* AAD[3] = {A0, A1, A2};
1570* padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572* 0 1 2 3
1573* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577* | SPI (A2) |
1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579* | 64-bit Extended Sequence Number {A1,A0} |
1580* | |
1581* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582* | 0x0 |
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585* AAD Format with 64-bit Extended Sequence Number
1586*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001590SYM_FUNC_START(aesni_gcm_dec)
Dave Watson6c2c86b2018-02-14 09:38:35 -08001591 FUNC_SAVE
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001592
	# One-shot GCM decrypt (see prototype comment above): initialise the
	# GHASH/counter state from iv/hash_subkey/aad/aad_len (args 6-9),
	# decrypt + GHASH the ciphertext, then emit the authentication tag
	# into auth_tag/auth_tag_len (args 10-11).
Dave Watsonfb8986e2018-02-14 09:40:47 -08001593 GCM_INIT %arg6, arg7, arg8, arg9
Dave Watsonba458332018-02-14 09:39:10 -08001594 GCM_ENC_DEC dec
Dave Watsonfb8986e2018-02-14 09:40:47 -08001595 GCM_COMPLETE arg10, arg11
Dave Watson6c2c86b2018-02-14 09:38:35 -08001596 FUNC_RESTORE
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001597 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001598SYM_FUNC_END(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001603* struct gcm_context_data *data
1604* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001605* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1606* const u8 *in, // Plaintext input
1607* u64 plaintext_len, // Length of data in bytes for encryption.
1608* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1609* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610* // concatenated with 0x00000001. 16-byte aligned pointer.
1611* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612* const u8 *aad, // Additional Authentication Data (AAD)
1613* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614* u8 *auth_tag, // Authenticated Tag output.
1615* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616* // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621* keys are pre-expanded and aligned to 16 bytes. we are using the
1622* first set of 11 keys in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626* 0 1 2 3
1627* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629* | Salt (From the SA) |
1630* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631* | Initialization Vector |
1632* | (This is the sequence number from IPSec header) |
1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634* | 0x1 |
1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640* AAD padded to 128 bits with 0
1641* for example, assume AAD is a u32 vector
1642*
1643* if AAD is 8 bytes:
1644* AAD[3] = {A0, A1};
1645* padded AAD in xmm register = {A1 A0 0 0}
1646*
1647* 0 1 2 3
1648* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650* | SPI (A1) |
1651* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652* | 32-bit Sequence Number (A0) |
1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654* | 0x0 |
1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657* AAD Format with 32-bit Sequence Number
1658*
1659* if AAD is 12 bytes:
1660* AAD[3] = {A0, A1, A2};
1661* padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663* 0 1 2 3
1664* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666* | SPI (A2) |
1667* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668* | 64-bit Extended Sequence Number {A1,A0} |
1669* | |
1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671* | 0x0 |
1672* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674* AAD Format with 64-bit Extended Sequence Number
1675*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001678SYM_FUNC_START(aesni_gcm_enc)
Dave Watson6c2c86b2018-02-14 09:38:35 -08001679 FUNC_SAVE
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001680
	# One-shot GCM encrypt (see prototype comment above): initialise the
	# GHASH/counter state from iv/hash_subkey/aad/aad_len (args 6-9),
	# encrypt + GHASH the plaintext, then emit the authentication tag
	# into auth_tag/auth_tag_len (args 10-11).
Dave Watsonfb8986e2018-02-14 09:40:47 -08001681 GCM_INIT %arg6, arg7, arg8, arg9
Dave Watsonba458332018-02-14 09:39:10 -08001682 GCM_ENC_DEC enc
Dave Watsonfb8986e2018-02-14 09:40:47 -08001683
1684 GCM_COMPLETE arg10, arg11
Dave Watson6c2c86b2018-02-14 09:38:35 -08001685 FUNC_RESTORE
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001686 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001687SYM_FUNC_END(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001688
Dave Watsonfb8986e2018-02-14 09:40:47 -08001689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1691* struct gcm_context_data *data,
1692* // context data
1693* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1694* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695* // concatenated with 0x00000001. 16-byte aligned pointer.
1696* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697* const u8 *aad, // Additional Authentication Data (AAD)
1698* u64 aad_len) // Length of AAD in bytes.
1699*/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001700SYM_FUNC_START(aesni_gcm_init)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001701 FUNC_SAVE
	# Streaming-mode setup: process iv/hash_subkey/aad/aad_len (args 3-6)
	# and store the intermediate state in gcm_context_data (%arg2) for
	# subsequent aesni_gcm_{enc,dec}_update / aesni_gcm_finalize calls.
1702 GCM_INIT %arg3, %arg4,%arg5, %arg6
1703 FUNC_RESTORE
1704 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001705SYM_FUNC_END(aesni_gcm_init)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1709* struct gcm_context_data *data,
1710* // context data
1711* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1712* const u8 *in, // Plaintext input
1713* u64 plaintext_len, // Length of data in bytes for encryption.
1714*/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001715SYM_FUNC_START(aesni_gcm_enc_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001716 FUNC_SAVE
	# Streaming encrypt step: encrypt + GHASH one chunk of plaintext,
	# carrying state in gcm_context_data (%arg2) set up by aesni_gcm_init.
1717 GCM_ENC_DEC enc
1718 FUNC_RESTORE
1719 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001720SYM_FUNC_END(aesni_gcm_enc_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1724* struct gcm_context_data *data,
1725* // context data
1726* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1727* const u8 *in, // Plaintext input
1728* u64 plaintext_len, // Length of data in bytes for encryption.
1729*/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001730SYM_FUNC_START(aesni_gcm_dec_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001731 FUNC_SAVE
	# Streaming decrypt step: decrypt + GHASH one chunk of ciphertext,
	# carrying state in gcm_context_data (%arg2) set up by aesni_gcm_init.
1732 GCM_ENC_DEC dec
1733 FUNC_RESTORE
1734 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001735SYM_FUNC_END(aesni_gcm_dec_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1739* struct gcm_context_data *data,
1740* // context data
1741* u8 *auth_tag, // Authenticated Tag output.
1742* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743* // 12 or 8.
1744*/
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001745SYM_FUNC_START(aesni_gcm_finalize)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001746 FUNC_SAVE
	# Streaming finalisation: complete the GHASH over the lengths block
	# and write the tag to auth_tag/auth_tag_len (args 3-4).
1747 GCM_COMPLETE %arg3 %arg4
1748 FUNC_RESTORE
1749 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001750SYM_FUNC_END(aesni_gcm_finalize)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001751
Mathias Krause559ad0f2010-11-29 08:35:39 +08001752#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001753
1754
	# Key-expansion helper shared by the AES-128 schedule and the even
	# rounds of the AES-256 schedule.
	# in:  %xmm0 = previous round key, %xmm1 = aeskeygenassist result,
	#      %xmm4 = 0 (caller guarantees), TKEYP = store address
	# out: %xmm0 = new round key, also stored at (TKEYP); TKEYP += 0x10
	# The shufps/pxor pairs implement the "xor with all previous words"
	# part of the expansion without extra scratch registers.
Jiri Slabye9b9d022019-10-11 13:50:49 +02001755SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
Jiri Slaby74d8b902019-10-11 13:50:46 +02001756SYM_FUNC_START_LOCAL(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001757 pshufd $0b11111111, %xmm1, %xmm1
1758 shufps $0b00010000, %xmm0, %xmm4
1759 pxor %xmm4, %xmm0
1760 shufps $0b10001100, %xmm0, %xmm4
1761 pxor %xmm4, %xmm0
1762 pxor %xmm1, %xmm0
Mathias Krause0d258ef2010-11-27 16:34:46 +08001763 movaps %xmm0, (TKEYP)
1764 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001765 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001766SYM_FUNC_END(_key_expansion_256a)
Jiri Slabye9b9d022019-10-11 13:50:49 +02001767SYM_FUNC_END_ALIAS(_key_expansion_128)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001768
	# AES-192 expansion step that produces TWO 16-byte round keys
	# (the 192-bit schedule does not align to 16-byte boundaries).
	# in:  %xmm0 = previous key words, %xmm1 = aeskeygenassist result,
	#      %xmm2 = trailing 64 bits of the 192-bit key, %xmm4 = 0,
	#      TKEYP = store address
	# out: two round keys stored at (TKEYP); TKEYP += 0x20;
	#      %xmm0/%xmm2 updated for the next step
Jiri Slaby74d8b902019-10-11 13:50:46 +02001769SYM_FUNC_START_LOCAL(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001770 pshufd $0b01010101, %xmm1, %xmm1
1771 shufps $0b00010000, %xmm0, %xmm4
1772 pxor %xmm4, %xmm0
1773 shufps $0b10001100, %xmm0, %xmm4
1774 pxor %xmm4, %xmm0
1775 pxor %xmm1, %xmm0
1776
1777 movaps %xmm2, %xmm5
1778 movaps %xmm2, %xmm6
1779 pslldq $4, %xmm5
1780 pshufd $0b11111111, %xmm0, %xmm3
1781 pxor %xmm3, %xmm2
1782 pxor %xmm5, %xmm2
1783
	# repack %xmm0/%xmm2 into two full 16-byte round keys
1784 movaps %xmm0, %xmm1
1785 shufps $0b01000100, %xmm0, %xmm6
Mathias Krause0d258ef2010-11-27 16:34:46 +08001786 movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001787 shufps $0b01001110, %xmm2, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001788 movaps %xmm1, 0x10(TKEYP)
1789 add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001790 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001791SYM_FUNC_END(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001792
	# AES-192 expansion step that produces ONE 16-byte round key
	# (alternates with _key_expansion_192a which produces two).
	# in:  %xmm0 = previous key words, %xmm1 = aeskeygenassist result,
	#      %xmm2 = trailing 64 bits of the 192-bit key, %xmm4 = 0,
	#      TKEYP = store address
	# out: round key stored at (TKEYP); TKEYP += 0x10;
	#      %xmm0/%xmm2 updated for the next step
Jiri Slaby74d8b902019-10-11 13:50:46 +02001793SYM_FUNC_START_LOCAL(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001794 pshufd $0b01010101, %xmm1, %xmm1
1795 shufps $0b00010000, %xmm0, %xmm4
1796 pxor %xmm4, %xmm0
1797 shufps $0b10001100, %xmm0, %xmm4
1798 pxor %xmm4, %xmm0
1799 pxor %xmm1, %xmm0
1800
1801 movaps %xmm2, %xmm5
1802 pslldq $4, %xmm5
1803 pshufd $0b11111111, %xmm0, %xmm3
1804 pxor %xmm3, %xmm2
1805 pxor %xmm5, %xmm2
1806
Mathias Krause0d258ef2010-11-27 16:34:46 +08001807 movaps %xmm0, (TKEYP)
1808 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001809 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001810SYM_FUNC_END(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001811
	# Key-expansion helper for the odd rounds of the AES-256 schedule;
	# mirrors _key_expansion_256a but operates on the second key half.
	# in:  %xmm2 = previous round key (second half), %xmm1 =
	#      aeskeygenassist result, %xmm4 = 0, TKEYP = store address
	# out: %xmm2 = new round key, also stored at (TKEYP); TKEYP += 0x10
Jiri Slaby74d8b902019-10-11 13:50:46 +02001812SYM_FUNC_START_LOCAL(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001813 pshufd $0b10101010, %xmm1, %xmm1
1814 shufps $0b00010000, %xmm2, %xmm4
1815 pxor %xmm4, %xmm2
1816 shufps $0b10001100, %xmm2, %xmm4
1817 pxor %xmm4, %xmm2
1818 pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001819 movaps %xmm2, (TKEYP)
1820 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001821 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02001822SYM_FUNC_END(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001823
1824/*
1825 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1826 * unsigned int key_len)
1827 */
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001828SYM_FUNC_START(aesni_set_key)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001829 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001830#ifndef __x86_64__
	# 32-bit entry: fetch arguments from the stack into the same
	# register aliases the 64-bit path uses
1831 pushl KEYP
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001832 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1833 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1834 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
Mathias Krause0d258ef2010-11-27 16:34:46 +08001835#endif
	# Round 0 key is the raw user key; key_len is cached at offset 480
	# of the ctx so the enc/dec entry points can read it later.
1836 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1837 movaps %xmm0, (KEYP)
1838 lea 0x10(KEYP), TKEYP # key addr
1839 movl %edx, 480(KEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001840 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
	# Dispatch on key length: <24 bytes -> AES-128, ==24 -> AES-192,
	# fall through for AES-256.
1841 cmp $24, %dl
1842 jb .Lenc_key128
1843 je .Lenc_key192
	# AES-256: second 16 key bytes form round key 1; then alternate
	# _key_expansion_256a/_key_expansion_256b for rounds 2..14.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001844 movups 0x10(UKEYP), %xmm2 # other user key
1845 movaps %xmm2, (TKEYP)
1846 add $0x10, TKEYP
Uros Bizjakd7866e52020-07-09 17:08:57 +02001847 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001848 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001849 aeskeygenassist $0x1, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001850 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001851 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001852 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001853 aeskeygenassist $0x2, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001854 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001855 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001856 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001857 aeskeygenassist $0x4, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001858 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001859 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001860 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001861 aeskeygenassist $0x8, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001862 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001863 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001864 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001865 aeskeygenassist $0x10, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001866 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001867 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001868 call _key_expansion_256a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001869 aeskeygenassist $0x20, %xmm0, %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001870 call _key_expansion_256b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001871 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001872 call _key_expansion_256a
1873 jmp .Ldec_key
1874.Lenc_key192:
	# AES-192: 8 expansion steps, alternating the two-key/one-key
	# helpers; round constants 0x01..0x80.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001875 movq 0x10(UKEYP), %xmm2 # other user key
Uros Bizjakd7866e52020-07-09 17:08:57 +02001876 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001877 call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001878 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001879 call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001880 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001881 call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001882 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001883 call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001884 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001885 call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001886 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001887 call _key_expansion_192b
Uros Bizjakd7866e52020-07-09 17:08:57 +02001888 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001889 call _key_expansion_192a
Uros Bizjakd7866e52020-07-09 17:08:57 +02001890 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001891 call _key_expansion_192b
1892 jmp .Ldec_key
1893.Lenc_key128:
	# AES-128: 10 expansion steps, round constants 0x01..0x36.
Uros Bizjakd7866e52020-07-09 17:08:57 +02001894 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001895 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001896 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001897 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001898 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001899 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001900 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001901 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001902 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001903 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001904 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001905 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001906 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001907 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001908 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001909 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001910 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001911 call _key_expansion_128
Uros Bizjakd7866e52020-07-09 17:08:57 +02001912 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001913 call _key_expansion_128
1914.Ldec_key:
	# Build the decryption schedule at the far end of the ctx: the
	# first/last round keys are swapped, and each middle round key is
	# transformed with aesimc (Equivalent Inverse Cipher form).
Mathias Krause0d258ef2010-11-27 16:34:46 +08001915 sub $0x10, TKEYP
1916 movaps (KEYP), %xmm0
1917 movaps (TKEYP), %xmm1
1918 movaps %xmm0, 240(TKEYP)
1919 movaps %xmm1, 240(KEYP)
1920 add $0x10, KEYP
1921 lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001922.align 4
1923.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001924 movaps (KEYP), %xmm0
Uros Bizjakd7866e52020-07-09 17:08:57 +02001925 aesimc %xmm0, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001926 movaps %xmm1, (UKEYP)
1927 add $0x10, KEYP
1928 sub $0x10, UKEYP
1929 cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001930 jb .Ldec_key_loop
	# return 0 (success)
Mathias Krause0d258ef2010-11-27 16:34:46 +08001931 xor AREG, AREG
1932#ifndef __x86_64__
1933 popl KEYP
1934#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001935 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001936 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001937SYM_FUNC_END(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001938
1939/*
Kees Cook9c1e8832019-11-26 22:08:02 -08001940 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001941 */
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001942SYM_FUNC_START(aesni_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001943 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001944#ifndef __x86_64__
	# 32-bit entry: load args from the stack into the register aliases
1945 pushl KEYP
1946 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001947 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1948 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1949 movl (FRAME_OFFSET+20)(%esp), INP # src
Mathias Krause0d258ef2010-11-27 16:34:46 +08001950#endif
	# Encrypt one 16-byte block: dst = AES-enc(src) under ctx.
	# movups tolerates unaligned src/dst; key length was cached at
	# ctx+480 by aesni_set_key.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001951 movl 480(KEYP), KLEN # key length
1952 movups (INP), STATE # input
1953 call _aesni_enc1
1954 movups STATE, (OUTP) # output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001955#ifndef __x86_64__
1956 popl KLEN
1957 popl KEYP
1958#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001959 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001960 ret
Jiri Slaby6dcc5622019-10-11 13:51:04 +02001961SYM_FUNC_END(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001962
1963/*
1964 * _aesni_enc1: internal ABI
1965 * input:
1966 * KEYP: key struct pointer
1967 * KLEN: round count
1968 * STATE: initial state (input)
1969 * output:
1970 * STATE: final state (output)
1971 * changed:
1972 * KEY
1973 * TKEYP (T1)
1974 */
Jiri Slaby74d8b902019-10-11 13:50:46 +02001975SYM_FUNC_START_LOCAL(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001976 movaps (KEYP), KEY # key
1977 mov KEYP, TKEYP
1978 pxor KEY, STATE # round 0
	# Dispatch on round count: 256-bit keys fall through the two extra
	# rounds below, 192-bit keys enter at .Lenc192, 128-bit at .Lenc128.
	# TKEYP is biased so each path reads its round keys at fixed offsets.
1979 add $0x30, TKEYP
1980 cmp $24, KLEN
1981 jb .Lenc128
1982 lea 0x20(TKEYP), TKEYP
1983 je .Lenc192
	# AES-256 only: rounds 1-2
1984 add $0x20, TKEYP
1985 movaps -0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001986 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001987 movaps -0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001988 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001989.align 4
1990.Lenc192:
	# AES-192/256: two more rounds
1991 movaps -0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001992 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001993 movaps -0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001994 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001995.align 4
1996.Lenc128:
	# common final 10 rounds (9 aesenc + aesenclast)
1997 movaps -0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02001998 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001999 movaps -0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002000 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002001 movaps (TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002002 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002003 movaps 0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002004 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002005 movaps 0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002006 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002007 movaps 0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002008 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002009 movaps 0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002010 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002011 movaps 0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002012 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002013 movaps 0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002014 aesenc KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002015 movaps 0x70(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002016 aesenclast KEY, STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002017 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02002018SYM_FUNC_END(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002019
2020/*
2021 * _aesni_enc4: internal ABI
2022 * input:
2023 * KEYP: key struct pointer
2024 * KLEN: round count
2025 * STATE1: initial state (input)
2026 * STATE2
2027 * STATE3
2028 * STATE4
2029 * output:
2030 * STATE1: final state (output)
2031 * STATE2
2032 * STATE3
2033 * STATE4
2034 * changed:
2035 * KEY
2036 * TKEYP (T1)
2037 */
Jiri Slaby74d8b902019-10-11 13:50:46 +02002038SYM_FUNC_START_LOCAL(_aesni_enc4)
	# 4-way interleaved variant of _aesni_enc1: same round structure,
	# but each round key is applied to STATE1..STATE4 back-to-back to
	# hide aesenc latency.
Huang Ying54b6a1b2009-01-18 16:28:34 +11002039 movaps (KEYP), KEY # key
2040 mov KEYP, TKEYP
2041 pxor KEY, STATE1 # round 0
2042 pxor KEY, STATE2
2043 pxor KEY, STATE3
2044 pxor KEY, STATE4
	# round-count dispatch identical to _aesni_enc1
2045 add $0x30, TKEYP
2046 cmp $24, KLEN
2047 jb .L4enc128
2048 lea 0x20(TKEYP), TKEYP
2049 je .L4enc192
	# AES-256 only: rounds 1-2
2050 add $0x20, TKEYP
2051 movaps -0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002052 aesenc KEY, STATE1
2053 aesenc KEY, STATE2
2054 aesenc KEY, STATE3
2055 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002056 movaps -0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002057 aesenc KEY, STATE1
2058 aesenc KEY, STATE2
2059 aesenc KEY, STATE3
2060 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002061#.align 4
2062.L4enc192:
	# AES-192/256: two more rounds
2063 movaps -0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002064 aesenc KEY, STATE1
2065 aesenc KEY, STATE2
2066 aesenc KEY, STATE3
2067 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002068 movaps -0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002069 aesenc KEY, STATE1
2070 aesenc KEY, STATE2
2071 aesenc KEY, STATE3
2072 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002073#.align 4
2074.L4enc128:
	# common final 10 rounds (9 aesenc + aesenclast)
2075 movaps -0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002076 aesenc KEY, STATE1
2077 aesenc KEY, STATE2
2078 aesenc KEY, STATE3
2079 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002080 movaps -0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002081 aesenc KEY, STATE1
2082 aesenc KEY, STATE2
2083 aesenc KEY, STATE3
2084 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002085 movaps (TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002086 aesenc KEY, STATE1
2087 aesenc KEY, STATE2
2088 aesenc KEY, STATE3
2089 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002090 movaps 0x10(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002091 aesenc KEY, STATE1
2092 aesenc KEY, STATE2
2093 aesenc KEY, STATE3
2094 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002095 movaps 0x20(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002096 aesenc KEY, STATE1
2097 aesenc KEY, STATE2
2098 aesenc KEY, STATE3
2099 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002100 movaps 0x30(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002101 aesenc KEY, STATE1
2102 aesenc KEY, STATE2
2103 aesenc KEY, STATE3
2104 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002105 movaps 0x40(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002106 aesenc KEY, STATE1
2107 aesenc KEY, STATE2
2108 aesenc KEY, STATE3
2109 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002110 movaps 0x50(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002111 aesenc KEY, STATE1
2112 aesenc KEY, STATE2
2113 aesenc KEY, STATE3
2114 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002115 movaps 0x60(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002116 aesenc KEY, STATE1
2117 aesenc KEY, STATE2
2118 aesenc KEY, STATE3
2119 aesenc KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002120 movaps 0x70(TKEYP), KEY
Uros Bizjakd7866e52020-07-09 17:08:57 +02002121 aesenclast KEY, STATE1 # last round
2122 aesenclast KEY, STATE2
2123 aesenclast KEY, STATE3
2124 aesenclast KEY, STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002125 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02002126SYM_FUNC_END(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002127
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt one 16-byte block with the expanded key in ctx.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# decryption round keys live after the encryption ones
	movups (INP), STATE		# input block
	call _aesni_dec1
	movups STATE, (OUTP)		# output block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002152
/*
 * _aesni_dec1: internal ABI
 *	Decrypt a single block already loaded in STATE.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .Ldec192			# 192-bit key: 12 rounds
	add $0x20, TKEYP		# 256-bit key: 14 rounds
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	ret
SYM_FUNC_END(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002209
/*
 * _aesni_dec4: internal ABI
 *	Decrypt four independent blocks in parallel (ECB-style interleave).
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round-0 key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128			# 128-bit key: 10 rounds
	lea 0x20(TKEYP), TKEYP
	je .L4dec192			# 192-bit key: 12 rounds
	add $0x20, TKEYP		# 256-bit key: 14 rounds
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002317
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * NOTE: the old prototype comment had const on dst instead of src;
 * dst is written and src is only read.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# nothing to do for len == 0
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret		# partial blocks are ignored
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:			# 4 blocks at a time
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:			# remaining single blocks
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002377
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * NOTE: the old prototype comment had const on dst instead of src;
 * dst is written and src is only read.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# skip to the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:			# 4 blocks at a time
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:			# remaining single blocks
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002438
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * NOTE: the old prototype comment had const on dst instead of src;
 * dst is written and src is only read.  CBC encryption is inherently
 * sequential, so only the one-block-at-a-time path exists.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load input
	pxor IN, STATE			# chain: plaintext XOR previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)		# write back final IV for chained calls
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002482
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * NOTE: the old prototype comment had const on dst instead of src;
 * dst is written and src is only read.  Decryption is parallelizable,
 * so four blocks are processed per iteration when possible.  On 32-bit
 * there are not enough XMM registers to keep all four inputs live, so
 * the first two are reloaded from memory after _aesni_dec4.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# skip to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:			# 4 blocks at a time
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# unchain against the carried IV
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1		# reload blocks 0/1 (too few regs on ia32)
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:			# remaining single blocks
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains into the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# write back final IV for chained calls
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002575
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Ciphertext stealing for the final (possibly partial) two blocks of a
 * CBC encryption.  len is in [16, 32).  NOTE: the old prototype comment
 * had const on dst instead of src; dst is written and src is only read.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN			# LEN = bytes in the final partial block
	mov T1, IVP
	add $32, IVP
	add LEN, T1			# T1 -> shuffle mask for the head shift
	sub LEN, IVP			# IVP -> shuffle mask for the tail shift
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1		# first full block
	add LEN, INP
	movups (INP), IN2		# final (overlapping) partial block

	pxor IN1, STATE
	call _aesni_enc1		# C[n-1] before stealing

	pshufb %xmm5, IN2		# left-align the partial tail
	pxor STATE, IN2
	pshufb %xmm4, STATE		# steal: truncated C[n-1] goes last
	add OUTP, LEN
	movups STATE, (LEN)		# store truncated block at dst+LEN

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)		# full final ciphertext block first

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cts_cbc_enc)
2632
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Ciphertext stealing for the final (possibly partial) two blocks of a
 * CBC decryption.  len is in [16, 32).  NOTE: the old prototype comment
 * had const on dst instead of src; dst is written and src is only read.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP			# skip to the decryption round keys
	movups (IVP), IV
	sub $16, LEN			# LEN = bytes in the final partial block
	mov T1, IVP
	add $32, IVP
	add LEN, T1			# T1 -> shuffle mask for the head shift
	sub LEN, IVP			# IVP -> shuffle mask for the tail shift
	movups (T1), %xmm4

	movups (INP), STATE		# first ciphertext block
	add LEN, INP
	movups (INP), IN1		# final (overlapping) partial block

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE			# recover the stolen tail bytes

	add OUTP, LEN
	movups STATE, (LEN)		# store partial plaintext at dst+LEN

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		# merge tail of D(C[n-1]) with C[n]
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE			# unchain against the carried IV
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cts_cbc_dec)
2693
.pushsection .rodata
.align 16
/* pshufb masks for ciphertext stealing: 0x80 lanes zero the byte */
.Lcts_permute_table:
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
/* byte-reversal mask for pshufb (big <-> little endian CTR) */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002708
Ard Biesheuvelddf169a2020-12-08 00:34:02 +01002709#ifdef __x86_64__
/*
 * _aesni_inc_init: internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# big-endian IV -> little-endian counter
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1 (low qword)
	movq CTR, TCTR_LOW		# scalar copy of counter's low qword
	ret
SYM_FUNC_END(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002730
/*
 * _aesni_inc: internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW		# mirror the add in a scalar reg to get CF
	jnc .Linc_low
	pslldq $8, INC			# low qword overflowed: carry into high qword
	paddq INC, CTR
	psrldq $8, INC			# restore INC = 1
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big-endian for the caller
	ret
SYM_FUNC_END(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002758
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * NOTE: the old prototype comment had const on dst instead of src;
 * dst is written and src is only read.  x86_64 only.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:			# 4 counter blocks at a time
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1		# keystream XOR plaintext
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:			# remaining single blocks
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# write back counter for chained calls
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002821
Ard Biesheuvel24811042020-12-31 17:41:55 +01002822#endif
2823
/* constant used by _aesni_gf128mul_x_ble(): {0x87, 0x01} reduction mask */
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous
2829
/*
 * _aesni_gf128mul_x_ble: internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * (The old header said CTR was the temporary; the macro actually
 * clobbers KEY.)
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, KEY; \
	paddq IV, IV; \
	psrad $31, KEY; \
	pand GF128MUL_MASK, KEY; \
	pxor KEY, IV;
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002847
/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption with ciphertext stealing for a non-multiple-of-16 tail.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	sub $64, LEN
	jl .Lxts_enc_1x			# fewer than 4 blocks left

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1			# P XOR T
	movdqu IV, 0x00(OUTP)		# park tweak in dst, XORed back after enc

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_enc4

	movdqu 0x00(OUTP), IN		# retrieve parked tweaks
	pxor IN, STATE1			# C = E(P XOR T) XOR T
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_enc_loop4

.Lxts_enc_ret_iv:
	movups IV, (IVP)		# write back tweak for chained calls

.Lxts_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret

.Lxts_enc_1x:
	add $64, LEN			# undo the speculative subtract
	jz .Lxts_enc_ret_iv
	sub $16, LEN
	jl .Lxts_enc_cts4		# tail < 16 bytes: steal from block 4

.Lxts_enc_loop1:
	movdqu (INP), STATE
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_enc_out

	add $16, INP
	sub $16, LEN
	jl .Lxts_enc_cts1		# partial tail follows this block

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_enc_loop1

.Lxts_enc_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_enc_ret_iv

.Lxts_enc_cts4:
	movdqa STATE4, STATE		# steal from the last 4-wide block
	sub $16, OUTP

.Lxts_enc_cts1:
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP			/* rewind input pointer */
	add $16, LEN			/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE		# truncate previous ciphertext block
	movups STATE, (LEN)

	movups (IVP), %xmm0
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		# merge stolen bytes into final block
	movaps IN1, STATE

	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE

	movups STATE, (OUTP)
	jmp .Lxts_enc_ret
SYM_FUNC_END(aesni_xts_encrypt)
3002
3003/*
3004 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3005 * const u8 *src, unsigned int len, le128 *iv)
3006 */
3007SYM_FUNC_START(aesni_xts_decrypt)
	/*
	 * XTS-AES decryption with ciphertext stealing (CTS).
	 *
	 * Register aliases (macro-defined earlier in this file):
	 *   KEYP = round-key pointer     KLEN = key length
	 *   INP  = src pointer           OUTP = dst pointer
	 *   LEN  = bytes remaining       IVP  = iv pointer
	 *   IV   = current tweak (xmm)   STATE/STATE1-4 = AES data blocks
	 *   GF128MUL_MASK = constant consumed by _aesni_gf128mul_x_ble()
	 * _aesni_dec1/_aesni_dec4 and _aesni_gf128mul_x_ble() are helpers
	 * defined elsewhere in this file.
	 */
3008	FRAME_BEGIN
	# 32-bit: arguments arrive on the stack; spill the registers used as
	# aliases, then load ctx/dst/src/len/iv relative to %esp.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003009#ifndef __x86_64__
3010	pushl IVP
3011	pushl LEN
3012	pushl KEYP
3013	pushl KLEN
3014	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
3015	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
3016	movl (FRAME_OFFSET+28)(%esp), INP	# src
3017	movl (FRAME_OFFSET+32)(%esp), LEN	# len
3018	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	# Load the mask used to multiply the tweak by x in GF(2^128)
	# (64-bit build uses RIP-relative addressing for PIC).
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003019	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
Ard Biesheuvel24811042020-12-31 17:41:55 +01003020#else
3021	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3022#endif
	# IV = initial tweak (unaligned load; caller-supplied buffer)
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003023	movups (IVP), IV
3024
	# NOTE(review): offsets assume struct crypto_aes_ctx layout —
	# key_length at byte 480, inverse (decryption) round keys starting
	# at byte 240; confirm against <crypto/aes.h>.
3025	mov 480(KEYP), KLEN
3026	add $240, KEYP
3027
	# If len is not a multiple of 16, hold back 16 bytes so the last
	# full block remains available for ciphertext stealing below.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003028	test $15, LEN
3029	jz .Lxts_dec_loop4
3030	sub $16, LEN
3031
	# Main loop: 4 blocks per iteration; LEN going negative means
	# fewer than 4 blocks remain.
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003032.Lxts_dec_loop4:
Ard Biesheuvel24811042020-12-31 17:41:55 +01003033	sub $64, LEN
3034	jl .Lxts_dec_1x
3035
	# Whiten each ciphertext block with its tweak and park the tweak
	# in the output buffer, so all four tweaks survive _aesni_dec4
	# without needing extra registers.
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003036	movdqa IV, STATE1
Ard Biesheuvel24811042020-12-31 17:41:55 +01003037	movdqu 0x00(INP), IN
3038	pxor IN, STATE1
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003039	movdqu IV, 0x00(OUTP)
3040
3041	_aesni_gf128mul_x_ble()	# next tweak: IV *= x
3042	movdqa IV, STATE2
Ard Biesheuvel24811042020-12-31 17:41:55 +01003043	movdqu 0x10(INP), IN
3044	pxor IN, STATE2
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003045	movdqu IV, 0x10(OUTP)
3046
3047	_aesni_gf128mul_x_ble()
3048	movdqa IV, STATE3
Ard Biesheuvel24811042020-12-31 17:41:55 +01003049	movdqu 0x20(INP), IN
3050	pxor IN, STATE3
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003051	movdqu IV, 0x20(OUTP)
3052
3053	_aesni_gf128mul_x_ble()
3054	movdqa IV, STATE4
Ard Biesheuvel24811042020-12-31 17:41:55 +01003055	movdqu 0x30(INP), IN
3056	pxor IN, STATE4
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003057	movdqu IV, 0x30(OUTP)
3058
3059	call _aesni_dec4
3060
	# Un-whiten: fetch each parked tweak back from OUTP, xor it into
	# the decrypted block, and overwrite the slot with the plaintext.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003061	movdqu 0x00(OUTP), IN
3062	pxor IN, STATE1
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003063	movdqu STATE1, 0x00(OUTP)
3064
Ard Biesheuvel24811042020-12-31 17:41:55 +01003065	movdqu 0x10(OUTP), IN
3066	pxor IN, STATE2
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003067	movdqu STATE2, 0x10(OUTP)
3068
Ard Biesheuvel24811042020-12-31 17:41:55 +01003069	movdqu 0x20(OUTP), IN
3070	pxor IN, STATE3
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003071	movdqu STATE3, 0x20(OUTP)
3072
Ard Biesheuvel24811042020-12-31 17:41:55 +01003073	movdqu 0x30(OUTP), IN
3074	pxor IN, STATE4
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003075	movdqu STATE4, 0x30(OUTP)
3076
3077	_aesni_gf128mul_x_ble()	# advance tweak past the 4th block
3078
3079	add $64, INP
3080	add $64, OUTP
	# LEN reaches exactly 0 only when the remaining length was a
	# multiple of 64; otherwise it is still positive here.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003081	test LEN, LEN
3082	jnz .Lxts_dec_loop4
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003083
	# Write back the current tweak so the caller can resume/finish.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003084.Lxts_dec_ret_iv:
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003085	movups IV, (IVP)
3086
Ard Biesheuvel24811042020-12-31 17:41:55 +01003087.Lxts_dec_ret:
3088#ifndef __x86_64__
	# 32-bit: restore the registers spilled in the prologue.
3089	popl KLEN
3090	popl KEYP
3091	popl LEN
3092	popl IVP
3093#endif
Ard Biesheuvel86ad60a2020-12-31 17:41:54 +01003094	FRAME_END
	# NOTE(review): later kernels replace bare 'ret' with the RET macro
	# (straight-line-speculation mitigation) — fine at this tree's vintage.
3095	ret
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03003096
	# Fewer than 4 blocks left: undo the -64 bias. LEN == 0 means the
	# data ended on the 4-block boundary and we are done.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003097.Lxts_dec_1x:
3098	add $64, LEN
3099	jz .Lxts_dec_ret_iv
3100
	# Single-block loop; LEN going negative means the final block is
	# partial and needs ciphertext stealing.
3101.Lxts_dec_loop1:
3102	movdqu (INP), STATE
3103
3104	add $16, INP
3105	sub $16, LEN
3106	jl .Lxts_dec_cts1
3107
	# Full block: whiten, decrypt, un-whiten, advance tweak.
3108	pxor IV, STATE
3109	call _aesni_dec1
3110	pxor IV, STATE
3111	_aesni_gf128mul_x_ble()
3112
3113	test LEN, LEN
3114	jz .Lxts_dec_out
3115
3116	movdqu STATE, (OUTP)
3117	add $16, OUTP
3118	jmp .Lxts_dec_loop1
3119
	# Last (full) block: store it without bumping OUTP, then return IV.
3120.Lxts_dec_out:
3121	movdqu STATE, (OUTP)
3122	jmp .Lxts_dec_ret_iv
3123
	# Ciphertext stealing: STATE holds the last *full* ciphertext block.
	# Save the current tweak in STATE4 for the stitched block, and
	# decrypt the full block with the *next* tweak (XTS-CTS swaps the
	# tweak order on decryption).
3124.Lxts_dec_cts1:
3125	movdqa IV, STATE4
3126	_aesni_gf128mul_x_ble()
3127
3128	pxor IV, STATE
3129	call _aesni_dec1
3130	pxor IV, STATE
3131
3132#ifndef __x86_64__
3133	lea .Lcts_permute_table, T1
3134#else
3135	lea .Lcts_permute_table(%rip), T1
Mathias Krause0d258ef2010-11-27 16:34:46 +08003136#endif
	# LEN is negative here (-(16 - tail_len)), so this steps INP back
	# to the start of the final partial ciphertext block.
Ard Biesheuvel24811042020-12-31 17:41:55 +01003137	add LEN, INP		/* rewind input pointer */
3138	add $16, LEN		/* # bytes in final block */
3139	movups (INP), IN1
3140
	# Derive two pshufb masks from the CTS permute table based on the
	# tail length: T1 -> table + LEN, IVP -> table + 32 - LEN.
3141	mov T1, IVP
3142	add $32, IVP
3143	add LEN, T1
3144	sub LEN, IVP
	# LEN now holds OUTP + tail_len: the address of the partial
	# output block.
3145	add OUTP, LEN
3146
	# Shuffle the decrypted block and store it at OUTP + tail_len;
	# its leading bytes become the final partial plaintext block.
3147	movups (T1), %xmm4
3148	movaps STATE, IN2
3149	pshufb %xmm4, STATE
3150	movups STATE, (LEN)
3151
	# Stitch the tail ciphertext bytes onto the kept bytes of the
	# decrypted block. Legacy pblendvb uses %xmm0 as its implicit
	# blend mask, hence the load into %xmm0.
3152	movups (IVP), %xmm0
3153	pshufb %xmm0, IN1
3154	pblendvb IN2, IN1
3155	movaps IN1, STATE
3156
	# Decrypt the reassembled block with the saved (next-to-last)
	# tweak; this yields the last full plaintext block.
3157	pxor STATE4, STATE
3158	call _aesni_dec1
3159	pxor STATE4, STATE
3160
3161	movups STATE, (OUTP)
	# CTS consumed the whole message; skip the IV write-back.
3162	jmp .Lxts_dec_ret
3163SYM_FUNC_END(aesni_xts_decrypt)