blob: cad6e1bfa7d5f237559655b2b12d3070300972c6 [file] [log] [blame]
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move double
 * quad aligned).  It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released.  However, movaps is
 * a byte shorter, so that is the one we'll use for now (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

/* SHUF_MASK: pshufb control that reverses the 16 bytes of an xmm register. */
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal has only 31 hex digits, so the top
 * nibble of the 128-bit value is zero.  Kept exactly as found; confirm that
 * this is intended before relying on the constant.
 */
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

/*
 * The order of these constants should not change: ALL_F must follow
 * SHIFT_MASK, and the zero block must follow ALL_F.  The GCM macros index
 * across the three of them (e.g. "ALL_F-SHIFT_MASK(%r12)" and loads from
 * ALL_F+16 minus a byte count), so they live together in plain .rodata
 * rather than in separate mergeable sections that the linker may reorder.
 */
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


/*
 * Bytes occupied on the stack by the three registers that FUNC_SAVE pushes
 * (r12, r13, r14); used below to locate the stack-passed arguments.
 */
#define	STACK_OFFSET    8*3

/* Byte offsets of the fields inside the gcm context data addressed by %arg2. */
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define HashKey		16*6	// store HashKey <<1 mod poly here
#define HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

/*
 * Argument aliases.  arg1-arg6 are bare register names (written as %arg1 etc.
 * at the use site); arg7 onwards are complete stack operands that are valid
 * once FUNC_SAVE has run.  The extra +8 presumably skips the return address.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
/* Memory operand at byte offset 2*15*16 from the pointer in arg1. */
#define keysize 2*15*16(%arg1)
#endif


/* XMM register roles shared by the AES routines in this file. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Same register as BSWAP_MASK: the two aliases cannot be live together. */
#define GF128MUL_MASK %xmm10

/* General-purpose register roles, per target word size. */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * FUNC_SAVE: push r12, r13 and r14 (3 * 8 bytes, matching STACK_OFFSET).
 * Must be paired with FUNC_RESTORE, which pops them in reverse order.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


/* FUNC_RESTORE: undo FUNC_SAVE by popping r14, r13 and r12. */
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

/*
 * PRECOMPUTE: derive the GHASH key material from the hash subkey.
 *
 * Input:    SUBKEY - operand holding a pointer to the 16-byte hash subkey.
 * Output:   HashKey, HashKey_2, HashKey_3, HashKey_4 (each "<<1 mod poly")
 *           and HashKey_k .. HashKey_4_k (high 64 bits XOR low 64 bits of the
 *           matching power, for Karatsuba) stored in the context at %arg2.
 *           Only needs to be called once per key.
 * Clobbers: r12 and the xmm registers passed as TMP1-TMP7.
 */
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3		# byte-reflect the subkey

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3		# shift each 64-bit half left by one
	psrlq	$63, \TMP2		# isolate the bit shifted out of each half
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2		# carry from the low half into the high half
	psrldq	$8, \TMP1		# carry out of the high half
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1	# swap the 64-bit halves
	pxor	   \TMP3, \TMP1		# TMP1 = high64 ^ low64
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

/*
 * GCM_INIT: initialize the gcm context at %arg2 to prepare for
 * encoding/decoding.
 *
 * Iv, SUBKEY, AAD: operands holding pointers to the IV, the hash subkey and
 *                  the additional authenticated data.
 * AADLEN:          operand holding the AAD length in bytes.
 *
 * Stores AadLen, zeroes InLen / PBlockLen and the low 8 bytes of
 * PBlockEncKey, records OrigIV and the byte-swapped CurCount, runs PRECOMPUTE
 * and finally hashes the AAD into AadHash via CALC_AAD_HASH.
 *
 * Clobbers rax, r10-r13 (as originally documented), xmm0-xmm7 (xmm7 is handed
 * to PRECOMPUTE as a temporary), xmm13 and xmm14 (used by CALC_AAD_HASH).
 */
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

/*
 * GCM_ENC_DEC: encode/decode the given data.  Assumes that the gcm context at
 * %arg2 has been initialized by GCM_INIT.
 *
 * operation: "enc" or "dec".
 * Register use visible here: %arg3 = output pointer, %arg4 = input pointer,
 * %arg5 = length in bytes, %r11 = running byte offset into both buffers.
 *
 * Steps: finish any partial block left by a previous call (PARTIAL_BLOCK),
 * process (blocks mod 4) initial blocks, loop four blocks at a time, then
 * handle a trailing partial block of fewer than 16 bytes.
 *
 * Requires the input data be at least 1 byte long because of
 * READ_PARTIAL_BLOCK.
 * Clobbers rax, r10-r13, and xmm0-xmm15
 */
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11d, %r11d # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12	# r12 = 16 * (number of whole blocks mod 4)
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	# fewer than 16 bytes in total: read them without over-reading
	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	# step the offset back so that a full 16-byte load ends exactly at
	# the end of the input
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2		# keep the ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	# the GHASH multiply for this partial block happens later
	# (see the PBlockLen checks in PARTIAL_BLOCK and GCM_COMPLETE)
	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)	# store one byte at a time
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

/*
 * GCM_COMPLETE: finish the tag update for a pending partial block, fold in
 * the length block and write out the authentication tag.
 *
 * AUTHTAG:    operand holding the pointer the tag is written to.
 * AUTHTAGLEN: operand holding the tag length in bytes.  A length of 16 is
 *             stored with one 16-byte write; other lengths are written in
 *             8-, 4-, 2- and 1-byte pieces.
 *
 * Output: Authorization Tag (AUTH_TAG)
 * Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 */
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	# a partial block is still pending: finish its GHASH multiply
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*8)
	MOVQ_R64_XMM    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* GH is both an input and the output.  HK is only read.
* TMP1-TMP5 are xmm scratch registers and are clobbered.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

	# second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm

/*
 * READ_PARTIAL_BLOCK: read DLEN bytes starting at DPTR into XMMDst without
 * touching any byte past DPTR+DLEN, where 0 < DLEN < 16.
 *
 * The first 8 bytes (when present) are loaded with one 64-bit read; the
 * remaining bytes are gathered one at a time, highest address first, by
 * shifting them into rax.
 *
 * Clobbers %rax, DLEN and XMM1
 */
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp $8, \DLEN
	jl _read_lt8_\@
	mov (\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst	# low 8 bytes in one go
	sub $8, \DLEN
	jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
	shl $8, %rax
	mov 7(\DPTR, \DLEN, 1), %al	# byte 8 + (DLEN - 1) of the source
	dec \DLEN
	jnz _read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1		# move the tail into the high half
	por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
	shl $8, %rax
	mov -1(\DPTR, \DLEN, 1), %al	# byte DLEN - 1 of the source
	dec \DLEN
	jnz _read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

/*
 * CALC_AAD_HASH: calculate the hash of the data which will not be encrypted
 * (the AAD) and store it in AadHash of the context at %arg2.
 *
 * HASHKEY: xmm register holding the hash key handed to GHASH_MUL.
 * AAD:     operand holding the pointer to the AAD.
 * AADLEN:  operand holding the AAD length in bytes.
 * TMP1-TMP7: xmm scratch registers.
 *
 * Whole 16-byte blocks are hashed in a loop; a trailing block of fewer than
 * 16 bytes is fetched with READ_PARTIAL_BLOCK so that nothing past the end
 * of the AAD is read.
 *
 * Clobbers r10, r11, rax (through READ_PARTIAL_BLOCK), xmm14 and TMP1-TMP7.
 */
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6		# TMP6 = running hash, starts at zero

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)
.endm

/*
 * PARTIAL_BLOCK: handle encryption/decryption and the tag update for a
 * partial block carried over between update calls.
 *
 * CYPH_PLAIN_OUT: output pointer register.
 * PLAIN_CYPH_IN:  input pointer register.
 * PLAIN_CYPH_LEN: register holding the number of input bytes.
 * DATA_OFFSET:    register holding the byte offset; advanced by the number
 *                 of bytes written here.
 * AAD_HASH:       xmm register holding the running hash (also stored back to
 *                 AadHash).
 * operation:      "enc" or "dec".
 *
 * Does nothing when PBlockLen in the context at %arg2 is zero.  Otherwise it
 * consumes up to 16 - PBlockLen input bytes using the saved PBlockEncKey,
 * then either completes the block (GHASH multiply, PBlockLen = 0) or leaves
 * it still partial (PBlockLen += PLAIN_CYPH_LEN).
 *
 * Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 * Outputs encrypted bytes, and updates hash and partial info in
 * gcm_data_context
 * Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 */
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov 	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# At least 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3		# keep the ciphertext for GHASH
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM	%xmm10, %xmm3
	PSHUFB_XMM	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)	# block completed: nothing pending
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)	# block completed: nothing pending
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM	%xmm10, %xmm9
	PSHUFB_XMM	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)	# one byte at a time
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400780/*
781* if a = number of total plaintext bytes
782* b = floor(a/16)
783* num_initial_blocks = b mod 4
784* encrypt the initial num_initial_blocks blocks and apply ghash on
785* the ciphertext
786* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
787* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800788* arg1, %arg2, %arg3 are used as a pointer only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400789*/
790
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400791
Dave Watsone1fd3162018-02-14 09:38:12 -0800792.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800793 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
/*
 * Summary of the body below:
 * - loads SHUF_MASK into %xmm14, AadHash(%arg2) into %xmm<i> and
 *   CurCount(%arg2) into XMM0.
 * - when i is 5, 6 or 7: for every register digit in i_seq one counter
 *   block is encrypted (XMM0 is incremented per block), XORed with 16
 *   bytes read at (%arg4,%r11) and stored at (%arg3,%r11); %r11 advances
 *   by 16 per block. The byte-swapped block to be hashed is left in
 *   %xmm<digit>; for operation == dec that is the input block that was
 *   read, otherwise the output block. The blocks are then chained through
 *   GHASH_MUL so that the running hash ends up in %xmm8.
 * - when %r13 is at least 64: four more counter blocks are encrypted into
 *   XMM1..XMM4, 64 bytes are processed at offset %r11, %r11 advances by 64
 *   and XMMDst is XORed into the byte-swapped XMM1.
 * Registers written besides the arguments: %xmm14, and on the paths above
 * %eax, %r10, %r11 and the flags.
 * TMP6 and TMP7 are accepted but not referenced in the body.
 * TMP3 is handed to GHASH_MUL without being written here - presumably the
 * caller preloads it with the hash key; confirm against the caller.
 * NOTE(review): the GHASH section names %xmm5..%xmm8 and the extra-round
 * loop names %xmm1..%xmm4 directly, so callers presumably have to pass
 * exactly those registers (and matching i / i_seq) - confirm.
 * NOTE(review): the %r13 test uses jl, a signed comparison; if %r13 is an
 * unsigned byte count, confirm that jb is not what is intended.
 */
Dave Watson96604742018-02-14 09:39:45 -0800794 MOVADQ SHUF_MASK(%rip), %xmm14
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200795
Dave Watsonc594c542018-02-14 09:39:36 -0800796 movdqu AadHash(%arg2), %xmm\i # %xmm<i> = AadHash(%arg2)
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200797
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200798 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800799
Dave Watson96604742018-02-14 09:39:45 -0800800 movdqu CurCount(%arg2), \XMM0 # XMM0 = CurCount (Y0)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800801
802.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800803
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500804 MOVADQ ONE(%RIP),\TMP1
805 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800806.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500807 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800808.ifc \operation, dec
809 movdqa \XMM0, %xmm\index
810.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500811 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800812.endif
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500813 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
814 pxor \TMP2, %xmm\index # XOR with the round key at 0(%arg1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800815.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500816 lea 0x10(%arg1),%r10
817 mov keysize,%eax
818 shr $2,%eax # 128->4, 192->6, 256->8
819 add $5,%eax # 128->9, 192->11, 256->13
820
Dave Watsone1fd3162018-02-14 09:38:12 -0800821aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500822 MOVADQ (%r10),\TMP1
823.irpc index, \i_seq
824 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800825.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500826 add $16,%r10
827 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800828 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500829
830 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800831.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500832 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800833.endr
834.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800835 movdqu (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800836 pxor \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800837 movdqu %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800838 # write back plaintext/ciphertext for num_initial_blocks
839 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800840
841.ifc \operation, dec
842 movdqa \TMP1, %xmm\index # dec: hash the input block that was just read
843.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800844 PSHUFB_XMM %xmm14, %xmm\index
845
846 # prepare plaintext/ciphertext for GHASH computation
847.endr
848.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200849
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800850 # apply GHASH on num_initial_blocks blocks
851
852.if \i == 5
853 pxor %xmm5, %xmm6
854 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
855 pxor %xmm6, %xmm7
856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857 pxor %xmm7, %xmm8
858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859.elseif \i == 6
860 pxor %xmm6, %xmm7
861 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862 pxor %xmm7, %xmm8
863 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
864.elseif \i == 7
865 pxor %xmm7, %xmm8
866 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
867.endif
868 cmp $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800869 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800870 # no need for precomputed values
871/*
872*
873* Encryption of the next 4 counter blocks (XMM1..XMM4).
874* The HashKey precomputation this comment originally described is not
* present in the code below.
875*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500876 MOVADQ ONE(%RIP),\TMP1
877 paddd \TMP1, \XMM0 # INCR Y0
878 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800879 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
880
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500881 paddd \TMP1, \XMM0 # INCR Y0
882 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800883 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
884
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500885 paddd \TMP1, \XMM0 # INCR Y0
886 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800887 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
888
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500889 paddd \TMP1, \XMM0 # INCR Y0
890 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800891 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
892
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500893 MOVADQ 0(%arg1),\TMP1
894 pxor \TMP1, \XMM1
895 pxor \TMP1, \XMM2
896 pxor \TMP1, \XMM3
897 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800898.irpc index, 1234 # do 4 rounds
899 movaps 0x10*\index(%arg1), \TMP1
900 AESENC \TMP1, \XMM1
901 AESENC \TMP1, \XMM2
902 AESENC \TMP1, \XMM3
903 AESENC \TMP1, \XMM4
904.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800905.irpc index, 56789 # do next 5 rounds
906 movaps 0x10*\index(%arg1), \TMP1
907 AESENC \TMP1, \XMM1
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500912 lea 0xa0(%arg1),%r10
913 mov keysize,%eax
914 shr $2,%eax # 128->4, 192->6, 256->8
915 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800916 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500917
Dave Watsone1fd3162018-02-14 09:38:12 -0800918aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500919 MOVADQ (%r10),\TMP2
920.irpc index, 1234
921 AESENC \TMP2, %xmm\index # names %xmm1..%xmm4 directly, not the XMM1..XMM4 arguments
922.endr
923 add $16,%r10
924 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800925 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500926
Dave Watsone1fd3162018-02-14 09:38:12 -0800927aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500928 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800929 AESENCLAST \TMP2, \XMM1
930 AESENCLAST \TMP2, \XMM2
931 AESENCLAST \TMP2, \XMM3
932 AESENCLAST \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800933 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800934 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800935.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800936 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800937 movdqa \TMP1, \XMM1 # dec: keep the input block for hashing
938.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800939 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800940 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800941.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800942 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800943 movdqa \TMP1, \XMM2
944.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800945 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800946 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800947.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800948 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800949 movdqa \TMP1, \XMM3
950.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800951 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800952 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800953.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800954 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800955 movdqa \TMP1, \XMM4
956.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800957 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
958 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
959 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
960 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800961.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800962
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400963 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800964 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400965 pxor \XMMDst, \XMM1
966# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800967 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800968 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800969 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
970
Dave Watsone1fd3162018-02-14 09:38:12 -0800971_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800972
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400973.endm
974
975/*
976* encrypt 4 blocks at a time
977* ghash the 4 previously encrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800978* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400979* %r11 is the data offset value
*
* On entry XMM1..XMM4 hold the four blocks to hash; they are copied to
* XMM5..XMM8 and multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
* (read from %arg2) while four new counter blocks derived from XMM0 are
* encrypted with the round keys at %arg1. The AES rounds and the
* carry-less multiplications are interleaved instruction by instruction.
* 64 bytes are read at (%arg4,%r11) and 64 bytes are written at
* (%arg3,%r11); %r11 itself is not changed here.
* On exit XMM1..XMM4 hold the byte-swapped output blocks, with the reduced
* hash value XORed into XMM1, and XMM0 has been incremented four times.
* Also writes %eax, %r10, %xmm15 and the flags; XMM5..XMM8 and TMP1..TMP6
* are scratch. The operation argument is not referenced in the body.
* NOTE(review): the extra-round loop for longer keys names %xmm1..%xmm4
* directly, so callers presumably must pass those registers as
* XMM1..XMM4 - confirm.
980*/
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800981.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400982TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
983
984 movdqa \XMM1, \XMM5
985 movdqa \XMM2, \XMM6
986 movdqa \XMM3, \XMM7
987 movdqa \XMM4, \XMM8
988
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800989 movdqa SHUF_MASK(%rip), %xmm15
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400990 # multiply XMM5 * HashKey_4 using karatsuba
991
992 movdqa \XMM5, \TMP4
993 pshufd $78, \XMM5, \TMP6
994 pxor \XMM5, \TMP6
995 paddd ONE(%rip), \XMM0 # INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -0700996 movdqu HashKey_4(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400997 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
998 movdqa \XMM0, \XMM1
999 paddd ONE(%rip), \XMM0 # INCR CNT
1000 movdqa \XMM0, \XMM2
1001 paddd ONE(%rip), \XMM0 # INCR CNT
1002 movdqa \XMM0, \XMM3
1003 paddd ONE(%rip), \XMM0 # INCR CNT
1004 movdqa \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001005 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001006 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001011 pxor (%arg1), \XMM1
1012 pxor (%arg1), \XMM2
1013 pxor (%arg1), \XMM3
1014 pxor (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001015 movdqu HashKey_4_k(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001016 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1017 movaps 0x10(%arg1), \TMP1
1018 AESENC \TMP1, \XMM1 # Round 1
1019 AESENC \TMP1, \XMM2
1020 AESENC \TMP1, \XMM3
1021 AESENC \TMP1, \XMM4
1022 movaps 0x20(%arg1), \TMP1
1023 AESENC \TMP1, \XMM1 # Round 2
1024 AESENC \TMP1, \XMM2
1025 AESENC \TMP1, \XMM3
1026 AESENC \TMP1, \XMM4
1027 movdqa \XMM6, \TMP1
1028 pshufd $78, \XMM6, \TMP2
1029 pxor \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001030 movdqu HashKey_3(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001031 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1032 movaps 0x30(%arg1), \TMP3
1033 AESENC \TMP3, \XMM1 # Round 3
1034 AESENC \TMP3, \XMM2
1035 AESENC \TMP3, \XMM3
1036 AESENC \TMP3, \XMM4
1037 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1038 movaps 0x40(%arg1), \TMP3
1039 AESENC \TMP3, \XMM1 # Round 4
1040 AESENC \TMP3, \XMM2
1041 AESENC \TMP3, \XMM3
1042 AESENC \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001043 movdqu HashKey_3_k(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001044 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1045 movaps 0x50(%arg1), \TMP3
1046 AESENC \TMP3, \XMM1 # Round 5
1047 AESENC \TMP3, \XMM2
1048 AESENC \TMP3, \XMM3
1049 AESENC \TMP3, \XMM4
1050 pxor \TMP1, \TMP4
1051# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1052 pxor \XMM6, \XMM5
1053 pxor \TMP2, \TMP6
1054 movdqa \XMM7, \TMP1
1055 pshufd $78, \XMM7, \TMP2
1056 pxor \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001057 movdqu HashKey_2(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001058
1059 # Multiply XMM7 * HashKey_2 using karatsuba
1060
1061 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1062 movaps 0x60(%arg1), \TMP3
1063 AESENC \TMP3, \XMM1 # Round 6
1064 AESENC \TMP3, \XMM2
1065 AESENC \TMP3, \XMM3
1066 AESENC \TMP3, \XMM4
1067 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1068 movaps 0x70(%arg1), \TMP3
1069 AESENC \TMP3, \XMM1 # Round 7
1070 AESENC \TMP3, \XMM2
1071 AESENC \TMP3, \XMM3
1072 AESENC \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001073 movdqu HashKey_2_k(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001074 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1075 movaps 0x80(%arg1), \TMP3
1076 AESENC \TMP3, \XMM1 # Round 8
1077 AESENC \TMP3, \XMM2
1078 AESENC \TMP3, \XMM3
1079 AESENC \TMP3, \XMM4
1080 pxor \TMP1, \TMP4
1081# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1082 pxor \XMM7, \XMM5
1083 pxor \TMP2, \TMP6
1084
1085 # Multiply XMM8 * HashKey
1086 # XMM8 and TMP5 hold the values for the two operands
1087
1088 movdqa \XMM8, \TMP1
1089 pshufd $78, \XMM8, \TMP2
1090 pxor \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001091 movdqu HashKey(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001092 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1093 movaps 0x90(%arg1), \TMP3
1094 AESENC \TMP3, \XMM1 # Round 9
1095 AESENC \TMP3, \XMM2
1096 AESENC \TMP3, \XMM3
1097 AESENC \TMP3, \XMM4
1098 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001099 lea 0xa0(%arg1),%r10
1100 mov keysize,%eax
1101 shr $2,%eax # 128->4, 192->6, 256->8
1102 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001103 jz aes_loop_par_enc_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001104
Dave Watsonfb8986e2018-02-14 09:40:47 -08001105aes_loop_par_enc\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001106 MOVADQ (%r10),\TMP3
1107.irpc index, 1234
1108 AESENC \TMP3, %xmm\index # names %xmm1..%xmm4 directly, not the XMM1..XMM4 arguments
1109.endr
1110 add $16,%r10
1111 sub $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001112 jnz aes_loop_par_enc\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001113
Dave Watsonfb8986e2018-02-14 09:40:47 -08001114aes_loop_par_enc_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001115 MOVADQ (%r10), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001116 AESENCLAST \TMP3, \XMM1 # last round (Round 10 for 128-bit keys)
1117 AESENCLAST \TMP3, \XMM2
1118 AESENCLAST \TMP3, \XMM3
1119 AESENCLAST \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001120 movdqu HashKey_k(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001121 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001122 movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001123 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001124 movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001125 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001126 movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001127 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001128 movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001129 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001130 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1133 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001134 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1135 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1136 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1137 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1138
1139 pxor \TMP4, \TMP1
1140 pxor \XMM8, \XMM5
1141 pxor \TMP6, \TMP2
1142 pxor \TMP1, \TMP2
1143 pxor \XMM5, \TMP2
1144 movdqa \TMP2, \TMP3
1145 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1146 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1147 pxor \TMP3, \XMM5
1148 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1149
1150 # first phase of reduction
1151
1152 movdqa \XMM5, \TMP2
1153 movdqa \XMM5, \TMP3
1154 movdqa \XMM5, \TMP4
1155# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156 pslld $31, \TMP2 # packed left shift << 31
1157 pslld $30, \TMP3 # packed left shift << 30
1158 pslld $25, \TMP4 # packed left shift << 25
1159 pxor \TMP3, \TMP2 # xor the shifted versions
1160 pxor \TMP4, \TMP2
1161 movdqa \TMP2, \TMP5
1162 psrldq $4, \TMP5 # right shift T5 1 DW
1163 pslldq $12, \TMP2 # left shift T2 3 DWs
1164 pxor \TMP2, \XMM5
1165
1166 # second phase of reduction
1167
1168 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1169 movdqa \XMM5,\TMP3
1170 movdqa \XMM5,\TMP4
1171 psrld $1, \TMP2 # packed right shift >> 1
1172 psrld $2, \TMP3 # packed right shift >> 2
1173 psrld $7, \TMP4 # packed right shift >> 7
1174 pxor \TMP3,\TMP2 # xor the shifted versions
1175 pxor \TMP4,\TMP2
1176 pxor \TMP5, \TMP2
1177 pxor \TMP2, \XMM5
1178 pxor \TMP1, \XMM5 # result is in XMM5
1179
1180 pxor \XMM5, \XMM1 # fold the reduced hash into the first new block
1181.endm
1182
1183/*
1184* decrypt 4 blocks at a time
1185* ghash the 4 previously decrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001186* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001187* %r11 is the data offset value
*
* On entry XMM1..XMM4 hold the four blocks to hash; they are copied to
* XMM5..XMM8 and multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
* (read from %arg2) while four new counter blocks derived from XMM0 are
* encrypted with the round keys at %arg1. The AES rounds and the
* carry-less multiplications are interleaved instruction by instruction.
* 64 bytes are read at (%arg4,%r11), XORed with the encrypted counters and
* written at (%arg3,%r11); %r11 itself is not changed here.
* On exit XMM1..XMM4 hold the byte-swapped input blocks that were read
* (not the XOR results), with the reduced hash value XORed into XMM1, and
* XMM0 has been incremented four times.
* Also writes %eax, %r10, %xmm15 and the flags; XMM5..XMM8 and TMP1..TMP6
* are scratch. The operation argument is not referenced in the body.
* NOTE(review): the extra-round loop for longer keys names %xmm1..%xmm4
* directly, so callers presumably must pass those registers as
* XMM1..XMM4 - confirm.
1188*/
1189.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1190TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1191
1192 movdqa \XMM1, \XMM5
1193 movdqa \XMM2, \XMM6
1194 movdqa \XMM3, \XMM7
1195 movdqa \XMM4, \XMM8
1196
1197 movdqa SHUF_MASK(%rip), %xmm15
1198 # multiply XMM5 * HashKey_4 using karatsuba
1199
1200 movdqa \XMM5, \TMP4
1201 pshufd $78, \XMM5, \TMP6
1202 pxor \XMM5, \TMP6
1203 paddd ONE(%rip), \XMM0 # INCR CNT
Dave Watsone5b954e2018-08-15 10:29:42 -07001204 movdqu HashKey_4(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001205 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1206 movdqa \XMM0, \XMM1
1207 paddd ONE(%rip), \XMM0 # INCR CNT
1208 movdqa \XMM0, \XMM2
1209 paddd ONE(%rip), \XMM0 # INCR CNT
1210 movdqa \XMM0, \XMM3
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1212 movdqa \XMM0, \XMM4
1213 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1214 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1215 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1216 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1217 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1218
1219 pxor (%arg1), \XMM1
1220 pxor (%arg1), \XMM2
1221 pxor (%arg1), \XMM3
1222 pxor (%arg1), \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001223 movdqu HashKey_4_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001224 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1225 movaps 0x10(%arg1), \TMP1
1226 AESENC \TMP1, \XMM1 # Round 1
1227 AESENC \TMP1, \XMM2
1228 AESENC \TMP1, \XMM3
1229 AESENC \TMP1, \XMM4
1230 movaps 0x20(%arg1), \TMP1
1231 AESENC \TMP1, \XMM1 # Round 2
1232 AESENC \TMP1, \XMM2
1233 AESENC \TMP1, \XMM3
1234 AESENC \TMP1, \XMM4
1235 movdqa \XMM6, \TMP1
1236 pshufd $78, \XMM6, \TMP2
1237 pxor \XMM6, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001238 movdqu HashKey_3(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001239 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1240 movaps 0x30(%arg1), \TMP3
1241 AESENC \TMP3, \XMM1 # Round 3
1242 AESENC \TMP3, \XMM2
1243 AESENC \TMP3, \XMM3
1244 AESENC \TMP3, \XMM4
1245 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1246 movaps 0x40(%arg1), \TMP3
1247 AESENC \TMP3, \XMM1 # Round 4
1248 AESENC \TMP3, \XMM2
1249 AESENC \TMP3, \XMM3
1250 AESENC \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001251 movdqu HashKey_3_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001252 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1253 movaps 0x50(%arg1), \TMP3
1254 AESENC \TMP3, \XMM1 # Round 5
1255 AESENC \TMP3, \XMM2
1256 AESENC \TMP3, \XMM3
1257 AESENC \TMP3, \XMM4
1258 pxor \TMP1, \TMP4
1259# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1260 pxor \XMM6, \XMM5
1261 pxor \TMP2, \TMP6
1262 movdqa \XMM7, \TMP1
1263 pshufd $78, \XMM7, \TMP2
1264 pxor \XMM7, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001265 movdqu HashKey_2(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001266
1267 # Multiply XMM7 * HashKey_2 using karatsuba
1268
1269 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1270 movaps 0x60(%arg1), \TMP3
1271 AESENC \TMP3, \XMM1 # Round 6
1272 AESENC \TMP3, \XMM2
1273 AESENC \TMP3, \XMM3
1274 AESENC \TMP3, \XMM4
1275 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1276 movaps 0x70(%arg1), \TMP3
1277 AESENC \TMP3, \XMM1 # Round 7
1278 AESENC \TMP3, \XMM2
1279 AESENC \TMP3, \XMM3
1280 AESENC \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001281 movdqu HashKey_2_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001282 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1283 movaps 0x80(%arg1), \TMP3
1284 AESENC \TMP3, \XMM1 # Round 8
1285 AESENC \TMP3, \XMM2
1286 AESENC \TMP3, \XMM3
1287 AESENC \TMP3, \XMM4
1288 pxor \TMP1, \TMP4
1289# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1290 pxor \XMM7, \XMM5
1291 pxor \TMP2, \TMP6
1292
1293 # Multiply XMM8 * HashKey
1294 # XMM8 and TMP5 hold the values for the two operands
1295
1296 movdqa \XMM8, \TMP1
1297 pshufd $78, \XMM8, \TMP2
1298 pxor \XMM8, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001299 movdqu HashKey(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001300 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1301 movaps 0x90(%arg1), \TMP3
1302 AESENC \TMP3, \XMM1 # Round 9
1303 AESENC \TMP3, \XMM2
1304 AESENC \TMP3, \XMM3
1305 AESENC \TMP3, \XMM4
1306 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001307 lea 0xa0(%arg1),%r10
1308 mov keysize,%eax
1309 shr $2,%eax # 128->4, 192->6, 256->8
1310 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsonfb8986e2018-02-14 09:40:47 -08001311 jz aes_loop_par_dec_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001312
Dave Watsonfb8986e2018-02-14 09:40:47 -08001313aes_loop_par_dec\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001314 MOVADQ (%r10),\TMP3
1315.irpc index, 1234
1316 AESENC \TMP3, %xmm\index # names %xmm1..%xmm4 directly, not the XMM1..XMM4 arguments
1317.endr
1318 add $16,%r10
1319 sub $1,%eax
Dave Watsonfb8986e2018-02-14 09:40:47 -08001320 jnz aes_loop_par_dec\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001321
Dave Watsonfb8986e2018-02-14 09:40:47 -08001322aes_loop_par_dec_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001323 MOVADQ (%r10), \TMP3
1324 AESENCLAST \TMP3, \XMM1 # last round
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001325 AESENCLAST \TMP3, \XMM2
1326 AESENCLAST \TMP3, \XMM3
1327 AESENCLAST \TMP3, \XMM4
Dave Watsone5b954e2018-08-15 10:29:42 -07001328 movdqu HashKey_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001329 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001330 movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001331 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001332 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001333 movdqa \TMP3, \XMM1 # keep the input block that was read for hashing
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001334 movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001335 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001336 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001337 movdqa \TMP3, \XMM2
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001338 movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001339 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001340 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001341 movdqa \TMP3, \XMM3
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001342 movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001343 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001344 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001345 movdqa \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001346 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1347 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1348 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1349 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001350
1351 pxor \TMP4, \TMP1
1352 pxor \XMM8, \XMM5
1353 pxor \TMP6, \TMP2
1354 pxor \TMP1, \TMP2
1355 pxor \XMM5, \TMP2
1356 movdqa \TMP2, \TMP3
1357 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1358 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1359 pxor \TMP3, \XMM5
1360 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1361
1362 # first phase of reduction
1363
1364 movdqa \XMM5, \TMP2
1365 movdqa \XMM5, \TMP3
1366 movdqa \XMM5, \TMP4
1367# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368 pslld $31, \TMP2 # packed left shift << 31
1369 pslld $30, \TMP3 # packed left shift << 30
1370 pslld $25, \TMP4 # packed left shift << 25
1371 pxor \TMP3, \TMP2 # xor the shifted versions
1372 pxor \TMP4, \TMP2
1373 movdqa \TMP2, \TMP5
1374 psrldq $4, \TMP5 # right shift T5 1 DW
1375 pslldq $12, \TMP2 # left shift T2 3 DWs
1376 pxor \TMP2, \XMM5
1377
1378 # second phase of reduction
1379
1380 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1381 movdqa \XMM5,\TMP3
1382 movdqa \XMM5,\TMP4
1383 psrld $1, \TMP2 # packed right shift >> 1
1384 psrld $2, \TMP3 # packed right shift >> 2
1385 psrld $7, \TMP4 # packed right shift >> 7
1386 pxor \TMP3,\TMP2 # xor the shifted versions
1387 pxor \TMP4,\TMP2
1388 pxor \TMP5, \TMP2
1389 pxor \TMP2, \XMM5
1390 pxor \TMP1, \XMM5 # result is in XMM5
1391
1392 pxor \XMM5, \XMM1 # fold the reduced hash into the first retained block
1393.endm
1394
1395/* GHASH the last 4 ciphertext blocks.
*
* XMM1, XMM2, XMM3 and XMM4 are multiplied by HashKey_4, HashKey_3,
* HashKey_2 and HashKey (all read from %arg2) using the Karatsuba
* split; the HashKey_*_k values supply the second operand of each
* middle product. The four products are summed and reduced, and the
* reduced value is left in XMMDst.
* XMM1..XMM4 and TMP1..TMP7 are overwritten.
*/
1396.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1398
1399 # Multiply XMM1 * HashKey_4 (using Karatsuba)
1400
1401 movdqa \XMM1, \TMP6
1402 pshufd $78, \XMM1, \TMP2
1403 pxor \XMM1, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001404 movdqu HashKey_4(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001405 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1406 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001407 movdqu HashKey_4_k(%arg2), \TMP4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001408 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1409 movdqa \XMM1, \XMMDst
1410 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1411
1412 # Multiply XMM2 * HashKey_3 (using Karatsuba)
1413
1414 movdqa \XMM2, \TMP1
1415 pshufd $78, \XMM2, \TMP2
1416 pxor \XMM2, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001417 movdqu HashKey_3(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001418 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1419 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001420 movdqu HashKey_3_k(%arg2), \TMP4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001421 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1422 pxor \TMP1, \TMP6
1423 pxor \XMM2, \XMMDst
1424 pxor \TMP2, \XMM1
1425# results accumulated in TMP6, XMMDst, XMM1
1426
1427 # Multiply XMM3 * HashKey_2 (using Karatsuba)
1428
1429 movdqa \XMM3, \TMP1
1430 pshufd $78, \XMM3, \TMP2
1431 pxor \XMM3, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001432 movdqu HashKey_2(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001433 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1434 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001435 movdqu HashKey_2_k(%arg2), \TMP4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001436 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1437 pxor \TMP1, \TMP6
1438 pxor \XMM3, \XMMDst
1439 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1440
1441 # Multiply XMM4 * HashKey (using Karatsuba)
1442 movdqa \XMM4, \TMP1
1443 pshufd $78, \XMM4, \TMP2
1444 pxor \XMM4, \TMP2
Dave Watsone5b954e2018-08-15 10:29:42 -07001445 movdqu HashKey(%arg2), \TMP5
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001446 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1447 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
Dave Watsone5b954e2018-08-15 10:29:42 -07001448 movdqu HashKey_k(%arg2), \TMP4
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001449 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1450 pxor \TMP1, \TMP6
1451 pxor \XMM4, \XMMDst
1452 pxor \XMM1, \TMP2
1453 pxor \TMP6, \TMP2
1454 pxor \XMMDst, \TMP2
1455 # middle section of the temp results combined as in karatsuba algorithm
1456 movdqa \TMP2, \TMP4
1457 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1458 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1459 pxor \TMP4, \XMMDst
1460 pxor \TMP2, \TMP6
1461# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462 # first phase of the reduction
1463 movdqa \XMMDst, \TMP2
1464 movdqa \XMMDst, \TMP3
1465 movdqa \XMMDst, \TMP4
1466# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467 pslld $31, \TMP2 # packed left shift << 31
1468 pslld $30, \TMP3 # packed left shift << 30
1469 pslld $25, \TMP4 # packed left shift << 25
1470 pxor \TMP3, \TMP2 # xor the shifted versions
1471 pxor \TMP4, \TMP2
1472 movdqa \TMP2, \TMP7
1473 psrldq $4, \TMP7 # right shift TMP7 1 DW
1474 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1475 pxor \TMP2, \XMMDst
1476
1477 # second phase of the reduction
1478 movdqa \XMMDst, \TMP2
1479 # make 3 copies of XMMDst for doing 3 shift operations
1480 movdqa \XMMDst, \TMP3
1481 movdqa \XMMDst, \TMP4
1482 psrld $1, \TMP2 # packed right shift >> 1
1483 psrld $2, \TMP3 # packed right shift >> 2
1484 psrld $7, \TMP4 # packed right shift >> 7
1485 pxor \TMP3, \TMP2 # xor the shifted versions
1486 pxor \TMP4, \TMP2
1487 pxor \TMP7, \TMP2
1488 pxor \TMP2, \XMMDst
1489 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1490.endm
1491
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001492
1493/* Encryption of a single block
1494* uses eax & r10
*
* XMM0 is the block, encrypted in place with the round keys at %arg1:
* it is XORed with the key at 0(%arg1), run through keysize/4 + 5 AESENC
* rounds (9, 11 or 13) using consecutive 16-byte keys from 16(%arg1) on,
* and finished with AESENCLAST using the next key.
* TMP1 is scratch for the current round key. %eax ends at 0, %r10 points
* at the last round key and the flags are modified.
1495*/
1496
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001497.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1498
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001499 pxor (%arg1), \XMM0
1500 mov keysize,%eax
1501 shr $2,%eax # 128->4, 192->6, 256->8
1502 add $5,%eax # 128->9, 192->11, 256->13
1503 lea 16(%arg1), %r10 # get first expanded key address
1504
1505_esb_loop_\@:
1506 MOVADQ (%r10),\TMP1
1507 AESENC \TMP1,\XMM0
1508 add $16,%r10
1509 sub $1,%eax # %eax = AESENC rounds still to run
1510 jnz _esb_loop_\@
1511
1512 MOVADQ (%r10),\TMP1
1513 AESENCLAST \TMP1,\XMM0
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001514.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001515/*****************************************************************************
1516* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001517* struct gcm_context_data *data,
1518* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001519* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1520* const u8 *in, // Ciphertext input
1521* u64 plaintext_len, // Length of data in bytes for decryption.
1522* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1523* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524* // concatenated with 0x00000001. 16-byte aligned pointer.
1525* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526* const u8 *aad, // Additional Authentication Data (AAD)
1527* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1529* // given authentication tag and only return the plaintext if they match.
1530* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531* // (most likely), 12 or 8.
1532*
1533* Assumptions:
1534*
1535* keys:
1536* keys are pre-expanded and aligned to 16 bytes. we are using the first
1537* set of round keys in the data structure void *aes_ctx (11, 13 or 15
* keys for 128-, 192- or 256-bit AES; the round count is derived from
* keysize)
1538*
1539* iv:
1540* 0 1 2 3
1541* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543* | Salt (From the SA) |
1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545* | Initialization Vector |
1546* | (This is the sequence number from IPSec header) |
1547* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1548* | 0x1 |
1549* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1550*
1551*
1552*
1553* AAD:
1554* AAD padded to 128 bits with 0
1555* for example, assume AAD is a u32 vector
1556*
1557* if AAD is 8 bytes:
1558* AAD[2] = {A0, A1};
1559* padded AAD in xmm register = {A1 A0 0 0}
1560*
1561* 0 1 2 3
1562* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564* | SPI (A1) |
1565* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566* | 32-bit Sequence Number (A0) |
1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568* | 0x0 |
1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*
1571* AAD Format with 32-bit Sequence Number
1572*
1573* if AAD is 12 bytes:
1574* AAD[3] = {A0, A1, A2};
1575* padded AAD in xmm register = {A2 A1 A0 0}
1576*
1577* 0 1 2 3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582* | SPI (A2) |
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584* | 64-bit Extended Sequence Number {A1,A0} |
1585* | |
1586* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587* | 0x0 |
1588* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1589*
1590* AAD Format with 64-bit Extended Sequence Number
1591*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001592* poly = x^128 + x^127 + x^126 + x^121 + 1
1593*
1594*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	/*
	 * One-shot GCM decrypt: set up the context, process the whole
	 * buffer, then produce the authentication tag.  The argument
	 * numbers below follow the C prototype in the header comment.
	 */
	FUNC_SAVE

	GCM_INIT	%arg6, arg7, arg8, arg9	/* iv, hash_subkey, aad, aad_len */
	GCM_ENC_DEC	dec
	GCM_COMPLETE	arg10, arg11		/* auth_tag, auth_tag_len */
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001604
1605
1606/*****************************************************************************
1607* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001608* struct gcm_context_data *data
1609* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001610* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1611* const u8 *in, // Plaintext input
1612* u64 plaintext_len, // Length of data in bytes for encryption.
1613* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1614* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615* // concatenated with 0x00000001. 16-byte aligned pointer.
1616* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617* const u8 *aad, // Additional Authentication Data (AAD)
1618* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619* u8 *auth_tag, // Authenticated Tag output.
1620* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1621* // 12 or 8.
1622*
1623* Assumptions:
1624*
1625* keys:
1626* keys are pre-expanded and aligned to 16 bytes. we are using the
1627* first set of 11 keys in the data structure void *aes_ctx
1628*
1629*
1630* iv:
1631* 0 1 2 3
1632* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634* | Salt (From the SA) |
1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636* | Initialization Vector |
1637* | (This is the sequence number from IPSec header) |
1638* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1639* | 0x1 |
1640* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641*
1642*
1643*
1644* AAD:
1645* AAD padded to 128 bits with 0
1646* for example, assume AAD is a u32 vector
1647*
1648* if AAD is 8 bytes:
1649* AAD[3] = {A0, A1};
1650* padded AAD in xmm register = {A1 A0 0 0}
1651*
1652* 0 1 2 3
1653* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655* | SPI (A1) |
1656* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657* | 32-bit Sequence Number (A0) |
1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659* | 0x0 |
1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*
1662* AAD Format with 32-bit Sequence Number
1663*
1664* if AAD is 12 bytes:
1665* AAD[3] = {A0, A1, A2};
1666* padded AAD in xmm register = {A2 A1 A0 0}
1667*
1668* 0 1 2 3
1669* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671* | SPI (A2) |
1672* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673* | 64-bit Extended Sequence Number {A1,A0} |
1674* | |
1675* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676* | 0x0 |
1677* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1678*
1679* AAD Format with 64-bit Extended Sequence Number
1680*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001681* poly = x^128 + x^127 + x^126 + x^121 + 1
1682***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	/*
	 * One-shot GCM encrypt: set up the context, process the whole
	 * buffer, then produce the authentication tag.  The argument
	 * numbers below follow the C prototype in the header comment.
	 */
	FUNC_SAVE

	GCM_INIT	%arg6, arg7, arg8, arg9	/* iv, hash_subkey, aad, aad_len */
	GCM_ENC_DEC	enc

	GCM_COMPLETE	arg10, arg11		/* auth_tag, auth_tag_len */
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001693
/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*
* First step of the split init/update/finalize interface: only the
* GCM_INIT stage is run here.
*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT	%arg3, %arg4, %arg5, %arg6	/* iv, hash_subkey, aad, aad_len */
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_init)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001711
/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                               // context data
*                           u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                           const u8 *in,       // Plaintext input
*                           u64 plaintext_len); // Length of data in bytes for encryption.
*
* Runs only the GCM_ENC_DEC stage (encrypt direction); no init or tag
* generation is performed here.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC	enc
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001726
/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                               // context data
*                           u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                           const u8 *in,       // Ciphertext input
*                           u64 plaintext_len); // Length of data in bytes for decryption.
*
* Runs only the GCM_ENC_DEC stage (decrypt direction); no init or tag
* generation is performed here.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC	dec
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_dec_update)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001741
/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                         struct gcm_context_data *data,
*                                             // context data
*                         u8 *auth_tag,       // Authenticated Tag output.
*                         u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                             // 12 or 8.
*
* Last step of the split interface: only the GCM_COMPLETE stage is run.
*/
SYM_FUNC_START(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE	%arg3, %arg4		/* auth_tag, auth_tag_len */
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_finalize)
Dave Watsonfb8986e2018-02-14 09:40:47 -08001756
Mathias Krause559ad0f2010-11-29 08:35:39 +08001757#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001758
1759
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * input:
 *	%xmm0:		previous round key
 *	%xmm1:		AESKEYGENASSIST result (dword 3 is used)
 *	%xmm4:		low dword must be zero (cleared once by aesni_set_key)
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm0:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd	$0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1

	/*
	 * Two shuffle/xor steps turn xmm0 into the running XOR of its four
	 * dwords; dword 0 of xmm4 stays zero throughout.
	 */
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0

	pxor	%xmm1, %xmm0			# mix in the keygen-assist word
	movaps	%xmm0, (TKEYP)
	add	$0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_END_ALIAS(_key_expansion_128)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001773
/*
 * _key_expansion_192a: internal ABI
 * input:
 *	%xmm0:		previous key words (128 bits)
 *	%xmm2:		previous trailing key words (low 64 bits are used)
 *	%xmm1:		AESKEYGENASSIST result (dword 1 is used)
 *	%xmm4:		low dword must be zero
 *	TKEYP:		where the output is stored
 * output:
 *	%xmm0, %xmm2:	updated key words
 *	(TKEYP):	old xmm2 low half followed by new xmm0 low half
 *	0x10(TKEYP):	new xmm0 high half followed by new xmm2 low half
 *	TKEYP:		advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1

	/* running XOR across the four dwords of xmm0 */
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	/* update xmm2 from its shifted copy and dword 3 of the new xmm0 */
	movaps	%xmm2, %xmm5
	movaps	%xmm2, %xmm6			# keep the old xmm2 for the first store
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	/* repack the 192-bit result into two 16-byte stores */
	movaps	%xmm0, %xmm1
	shufps	$0b01000100, %xmm0, %xmm6
	movaps	%xmm6, (TKEYP)
	shufps	$0b01001110, %xmm2, %xmm1
	movaps	%xmm1, 0x10(TKEYP)
	add	$0x20, TKEYP
	ret
SYM_FUNC_END(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001797
/*
 * _key_expansion_192b: internal ABI
 * input:
 *	%xmm0:		previous key words (128 bits)
 *	%xmm2:		previous trailing key words
 *	%xmm1:		AESKEYGENASSIST result (dword 1 is used)
 *	%xmm4:		low dword must be zero
 *	TKEYP:		where the output is stored
 * output:
 *	%xmm0, %xmm2:	updated key words; xmm0 is also written to (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1

	/* running XOR across the four dwords of xmm0 */
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	/* update xmm2 from its shifted copy and dword 3 of the new xmm0 */
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, (TKEYP)
	add	$0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001816
/*
 * _key_expansion_256b: internal ABI
 * input:
 *	%xmm2:		previous round key of this half
 *	%xmm1:		AESKEYGENASSIST result (dword 2 is used)
 *	%xmm4:		low dword must be zero
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm2:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd	$0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1

	/* running XOR across the four dwords of xmm2 */
	shufps	$0b00010000, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0b10001100, %xmm2, %xmm4
	pxor	%xmm4, %xmm2

	pxor	%xmm1, %xmm2
	movaps	%xmm2, (TKEYP)
	add	$0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001828
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands the user key into ctx.  As written by this routine:
 *   - encryption round keys start at offset 0 of ctx,
 *   - decryption round keys start at offset 240 (the encryption keys in
 *     reverse order, with AESIMC applied to all but the first and last),
 *   - key_len is stored at offset 480.
 *
 * key_len selects the schedule: below 24 -> 128-bit path, exactly 24 ->
 * 192-bit path, above 24 -> 256-bit path (only the low byte is compared).
 * AREG is cleared before returning.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl	KEYP
	movl	(FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl	(FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups	(UKEYP), %xmm0		# user key (first 16 bytes)
	movaps	%xmm0, (KEYP)
	lea	0x10(KEYP), TKEYP	# key addr
	movl	%edx, 480(KEYP)
	pxor	%xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp	$24, %dl
	jb	.Lenc_key128
	je	.Lenc_key192

	/* 256-bit key: second half of the user key is round key 1 */
	movups	0x10(UKEYP), %xmm2	# other user key
	movaps	%xmm2, (TKEYP)
	add	$0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_256a
	jmp	.Ldec_key

.Lenc_key192:
	/* 192-bit key: the remaining 8 key bytes go into the low half of xmm2 */
	movq	0x10(UKEYP), %xmm2	# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call	_key_expansion_192b
	jmp	.Ldec_key

.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call	_key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call	_key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call	_key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call	_key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call	_key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call	_key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call	_key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call	_key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call	_key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call	_key_expansion_128

.Ldec_key:
	/*
	 * Build the decryption schedule at offset 240.  TKEYP is stepped
	 * back to the last encryption round key; the first and last
	 * encryption keys are copied unchanged to the opposite ends.
	 */
	sub	$0x10, TKEYP
	movaps	(KEYP), %xmm0
	movaps	(TKEYP), %xmm1
	movaps	%xmm0, 240(TKEYP)
	movaps	%xmm1, 240(KEYP)
	add	$0x10, KEYP
	lea	240-16(TKEYP), UKEYP	# UKEYP is reused as the descending write pointer
.align 4
.Ldec_key_loop:
	/* inner keys: AESIMC of each encryption key, stored in reverse order */
	movaps	(KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps	%xmm1, (UKEYP)
	add	$0x10, KEYP
	sub	$0x10, UKEYP
	cmp	TKEYP, KEYP
	jb	.Ldec_key_loop
	xor	AREG, AREG
#ifndef __x86_64__
	popl	KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001943
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src into dst.  The key length is
 * read from offset 480 of ctx; src and dst are accessed with unaligned
 * moves.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers used for KEYP/KLEN, then load stack args */
	pushl	KEYP
	pushl	KLEN
	movl	(FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl	480(KEYP), KLEN		# key length
	movups	(INP), STATE		# input
	call	_aesni_enc1
	movups	STATE, (OUTP)		# output
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001967
/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * KLEN below 24 runs 10 rounds, exactly 24 runs 12, above 24 runs 14.
 * TKEYP is biased so that all three variants share the same tail of
 * displacements up to 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.Lenc128
	lea	0x20(TKEYP), TKEYP	# lea leaves the flags of the cmp intact
	je	.Lenc192
	add	$0x20, TKEYP

	/* two extra rounds, longest key only */
	movaps	-0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps	-0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	/* common tail: nine AESENC rounds plus the final AESENCLAST */
	movaps	-0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps	(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
SYM_FUNC_END(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002024
/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same round structure as _aesni_enc1, applied to four independent
 * blocks with each round key loaded once.
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE1		# round 0
	pxor	KEY, STATE2
	pxor	KEY, STATE3
	pxor	KEY, STATE4
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.L4enc128
	lea	0x20(TKEYP), TKEYP	# lea leaves the flags of the cmp intact
	je	.L4enc192
	add	$0x20, TKEYP

	/* two extra rounds, longest key only */
	movaps	-0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	-0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps	-0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	-0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	/* common tail: nine AESENC rounds plus the final AESENCLAST */
	movaps	-0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	-0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
SYM_FUNC_END(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002132
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src into dst.  The key length is
 * read from offset 480 of ctx; KEYP is then advanced by 240 so that
 * _aesni_dec1 works on the decryption round keys.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers used for KEYP/KLEN, then load stack args */
	pushl	KEYP
	pushl	KLEN
	movl	(FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl	480(KEYP), KLEN		# key length
	add	$240, KEYP		# switch to the decryption key schedule
	movups	(INP), STATE		# input
	call	_aesni_dec1
	movups	STATE, (OUTP)		# output
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002157
/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * KLEN below 24 runs 10 rounds, exactly 24 runs 12, above 24 runs 14.
 * The round keys are read starting at (KEYP); aesni_dec passes the ctx
 * pointer advanced by 240 here.
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.Ldec128
	lea	0x20(TKEYP), TKEYP	# lea leaves the flags of the cmp intact
	je	.Ldec192
	add	$0x20, TKEYP

	/* two extra rounds, longest key only */
	movaps	-0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps	-0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	/* common tail: nine AESDEC rounds plus the final AESDECLAST */
	movaps	-0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps	(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
SYM_FUNC_END(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002214
2215/*
2216 * _aesni_dec4: internal ABI
2217 * input:
2218 * KEYP: key struct pointer
2219 * KLEN: key length
2220 * STATE1: initial state (input)
2221 * STATE2
2222 * STATE3
2223 * STATE4
2224 * output:
 *	STATE1:		final state (output)
2226 * STATE2
2227 * STATE3
2228 * STATE4
2229 * changed:
2230 * KEY
2231 * TKEYP (T1)
2232 */
Jiri Slaby74d8b902019-10-11 13:50:46 +02002233SYM_FUNC_START_LOCAL(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002234 movaps (KEYP), KEY # key
2235 mov KEYP, TKEYP
2236 pxor KEY, STATE1 # round 0
2237 pxor KEY, STATE2
2238 pxor KEY, STATE3
2239 pxor KEY, STATE4
2240 add $0x30, TKEYP
2241 cmp $24, KLEN
2242 jb .L4dec128
2243 lea 0x20(TKEYP), TKEYP
2244 je .L4dec192
2245 add $0x20, TKEYP
2246 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002247 AESDEC KEY STATE1
2248 AESDEC KEY STATE2
2249 AESDEC KEY STATE3
2250 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002251 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002252 AESDEC KEY STATE1
2253 AESDEC KEY STATE2
2254 AESDEC KEY STATE3
2255 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002256.align 4
2257.L4dec192:
2258 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002259 AESDEC KEY STATE1
2260 AESDEC KEY STATE2
2261 AESDEC KEY STATE3
2262 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002263 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002264 AESDEC KEY STATE1
2265 AESDEC KEY STATE2
2266 AESDEC KEY STATE3
2267 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002268.align 4
2269.L4dec128:
2270 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002271 AESDEC KEY STATE1
2272 AESDEC KEY STATE2
2273 AESDEC KEY STATE3
2274 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002275 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002276 AESDEC KEY STATE1
2277 AESDEC KEY STATE2
2278 AESDEC KEY STATE3
2279 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002280 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002281 AESDEC KEY STATE1
2282 AESDEC KEY STATE2
2283 AESDEC KEY STATE3
2284 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002285 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002286 AESDEC KEY STATE1
2287 AESDEC KEY STATE2
2288 AESDEC KEY STATE3
2289 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002290 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002291 AESDEC KEY STATE1
2292 AESDEC KEY STATE2
2293 AESDEC KEY STATE3
2294 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002295 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002296 AESDEC KEY STATE1
2297 AESDEC KEY STATE2
2298 AESDEC KEY STATE3
2299 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002300 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002301 AESDEC KEY STATE1
2302 AESDEC KEY STATE2
2303 AESDEC KEY STATE3
2304 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002305 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002306 AESDEC KEY STATE1
2307 AESDEC KEY STATE2
2308 AESDEC KEY STATE3
2309 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002310 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002311 AESDEC KEY STATE1
2312 AESDEC KEY STATE2
2313 AESDEC KEY STATE3
2314 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002315 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002316 AESDECLAST KEY STATE1 # last round
2317 AESDECLAST KEY STATE2
2318 AESDECLAST KEY STATE3
2319 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002320 ret
Jiri Slaby74d8b902019-10-11 13:50:46 +02002321SYM_FUNC_END(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002322
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * Encrypt whole 16-byte blocks from src (INP) to dst (OUTP), each block
 * independently.  Blocks are processed four at a time through _aesni_enc4
 * while at least 64 bytes remain, then one at a time through _aesni_enc1.
 * A trailing partial block (len % 16 bytes) is neither read nor written;
 * nothing is done at all when len < 16.
 *
 * dst is written and src is only read.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: arguments are read from the stack.  Three registers are
	 * pushed first, so together with the return address the first
	 * argument sits at FRAME_OFFSET+16.
	 */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN			# also covers len == 0
	jb .Lecb_enc_ret
	mov 480(KEYP), KLEN		# key length, consumed by _aesni_enc*
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:			# invariant: LEN >= 64
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:			# invariant: LEN >= 16
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002382
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * Decrypt whole 16-byte blocks from src (INP) to dst (OUTP), each block
 * independently.  Blocks are processed four at a time through _aesni_dec4
 * while at least 64 bytes remain, then one at a time through _aesni_dec1.
 * A trailing partial block (len % 16 bytes) is neither read nor written;
 * nothing is done at all when len < 16.
 *
 * dst is written and src is only read.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: arguments are read from the stack.  Three registers are
	 * pushed first, so together with the return address the first
	 * argument sits at FRAME_OFFSET+16.
	 */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN			# also covers len == 0
	jb .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length, consumed by _aesni_dec*
	add $240, KEYP			# step to the key schedule used for decryption
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:			# invariant: LEN >= 64
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:			# invariant: LEN >= 16
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002443
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt whole 16-byte blocks from src (INP) to dst (OUTP).  STATE
 * starts as the IV read from *iv; each input block is xored into STATE,
 * STATE is encrypted with _aesni_enc1 and stored, and it then serves as the
 * chaining value for the next block.  The last ciphertext block is written
 * back to *iv.
 *
 * Nothing is done (and *iv is left untouched) when len < 16.  A trailing
 * partial block is ignored.
 *
 * dst is written and src is only read.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: arguments are read from the stack.  Four registers are
	 * pushed first, so together with the return address the first
	 * argument sits at FRAME_OFFSET+20.
	 */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN		# key length, consumed by _aesni_enc1
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:				# invariant: LEN >= 16
	movups (INP), IN		# load input
	pxor IN, STATE			# chain: input ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop
	movups STATE, (IVP)		# hand the chaining value back to the caller
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002487
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt whole 16-byte blocks from src (INP) to dst (OUTP).  Each
 * decrypted block is xored with the previous ciphertext block (the IV for
 * the first one).  Four blocks at a time go through _aesni_dec4 while at
 * least 64 bytes remain, then single blocks through _aesni_dec1.  The last
 * ciphertext block consumed is written back to *iv.
 *
 * Nothing is done (and *iv is left untouched) when len < 16.  A trailing
 * partial block is ignored.
 *
 * dst is written and src is only read.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: arguments are read from the stack.  Four registers are
	 * pushed first, so together with the return address the first
	 * argument sits at FRAME_OFFSET+20.
	 */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length, consumed by _aesni_dec*
	add $240, KEYP			# step to the key schedule used for decryption
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:			# invariant: LEN >= 64
	/* Keep the ciphertext in INx; it is needed again for the chaining xor. */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * The 32-bit path only uses IN1/IN2, so blocks 3 and 4 overwrite
	 * them; blocks 1 and 2 are re-read from INP after decryption.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext block chains into the next round
#else
	pxor IN1, STATE4		# IN1 holds ciphertext block 3 here
	movaps IN2, IV			# IN2 holds ciphertext block 4 here
	/* Nothing has been stored to OUTP yet, so INP still holds the input. */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:			# invariant: LEN >= 16
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext block chains into the next one
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back to the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002580
Mathias Krause0d258ef2010-11-27 16:34:46 +08002581#ifdef __x86_64__
.pushsection .rodata
.balign 16
/*
 * Byte shuffle control that reverses the order of all 16 bytes of a vector.
 * Kept 16-byte aligned because it is loaded with movaps.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002587
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * This code is only built for x86_64, so the mask is addressed
 * RIP-relative rather than through an absolute address.
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the IV into CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
	ret
SYM_FUNC_END(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002608
/*
 * _aesni_inc:		internal ABI
 *	Add one to the 128-bit counter and regenerate the big endian IV.
 *
 *	The counter lives in CTR in little endian form.  A copy of its low
 *	qword is mirrored in the general purpose register TCTR_LOW so that a
 *	carry out of the low half can be detected with the carry flag.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW		# CF set iff the low qword wrapped
	paddq INC, CTR			# low qword += 1; leaves the flags alone
	jnc .Linc_low
	/* Carry into the high qword: move the 1 up, add it, move it back. */
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
SYM_FUNC_END(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002636
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: encrypt successive counter values and xor the result with the
 * input.  The counter starts at *iv and is advanced by _aesni_inc once per
 * block; the value following the last one used is written back to *iv.
 * Four blocks at a time go through _aesni_enc4 while at least 64 bytes
 * remain, then single blocks through _aesni_enc1.
 *
 * Nothing is done (and *iv is left untouched) when len < 16.  A trailing
 * partial block is ignored.
 *
 * dst is written and src is only read.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN		# key length, consumed by _aesni_enc*
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:			# invariant: LEN >= 64
	/* Snapshot four consecutive counter values, loading input alongside. */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:			# invariant: LEN >= 16
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# next unused counter value
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002699
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *
 *	paddq doubles each 64-bit half of IV separately, which drops the top
 *	bit of each half.  Those two bits are recovered beforehand: pshufd
 *	$0x13 copies dword 3 of IV to dword 0 of CTR and dword 1 of IV to
 *	dword 2 of CTR, and psrad $31 turns each dword into all-ones or zero
 *	according to its top bit.  After masking with GF128MUL_MASK the
 *	result is xored into the doubled value.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	paddq IV, IV; \
	pxor CTR, IV;
2717
/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			 const u8 *src, bool enc, le128 *iv)
 *
 * Process exactly eight 16-byte blocks (0x80 bytes) in XTS fashion: each
 * block is xored with its tweak, run through the block cipher, and xored
 * with the same tweak again.  The tweak for block 0 is read from *iv and
 * each following tweak is derived with _aesni_gf128mul_x_ble(); the tweak
 * that follows block 7 is written back to *iv.
 *
 * The output buffer doubles as scratch space: each tweak is parked in its
 * block's slot in dst before the cipher call and read back afterwards.
 * The two groups of four blocks are interleaved so that tweaks for blocks
 * 4-7 are generated while results of blocks 0-3 are being written out.
 *
 * INC is used purely as a temporary XMM register here.
 *
 * This code is only built for x86_64, so symbols are addressed
 * RIP-relative rather than through absolute addresses.
 */
SYM_FUNC_START(aesni_xts_crypt8)
	FRAME_BEGIN
	/*
	 * enc arrives in %cl.  Select both the cipher routine and the offset
	 * added to KEYP: enc != 0 -> _aesni_enc4 with offset 0,
	 * enc == 0 -> _aesni_dec4 with offset 240.  movl/leaq are used
	 * between cmpb and cmov because they leave the flags untouched.
	 */
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length, consumed by _aesni_{enc,dec}4
	addq %rcx, KEYP

	/* Blocks 0-3: xor with tweak, park the tweak in dst. */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	/*
	 * Finish block N (xor with the parked tweak, store the result) and
	 * immediately set up block N+4 in the freed state register.
	 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	/* Tweak for the block after these eight goes back to the caller. */
	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC %r11

	/* Blocks 4-7: xor with the parked tweak and store the result. */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
SYM_FUNC_END(aesni_xts_crypt8)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002827
Mathias Krause0d258ef2010-11-27 16:34:46 +08002828#endif