blob: d27a50656aa1f720c124e06036da624cb199d0f1 [file] [log] [blame]
Huang Ying54b6a1b2009-01-18 16:28:34 +11001/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040012 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080023 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060034#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000035#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110036
/*
 * MOVADQ / MOVUDQ move an aligned / unaligned 16-byte value to or from an
 * XMM register.  This can be done with either the FP form (movaps/movups)
 * or the integer form (movdqa/movdqu).  Since Nehalem (the original Core i7)
 * was released there is no performance difference between the two, but the
 * FP encodings are one byte shorter, so those are the ones used for now.
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
47
Mathias Krause559ad0f2010-11-29 08:35:39 +080048#ifdef __x86_64__
Timothy McCaffreye31ac322015-01-13 13:16:43 -050049
/*
 * Constants placed in mergeable sections: the linker is free to reorder
 * and merge them.  Each one is 16 bytes wide and 16-byte aligned.
 */
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
		.octa 0x00000000000000010000000000000087

.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:		.octa 0xC2000000000000000000000000000001

.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:		.octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:	.octa 0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:		.octa 0x0000000000000000ffffffffffffffff

.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:		.octa 0xffffffffffffffff0000000000000000

.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:		.octa 0x00000000000000000000000000000001

.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK:	.octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0

.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:		.octa 0x1

.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:		.octa 0x2

/*
 * The order of the following constants must not change: ALL_F has to
 * follow SHIFT_MASK directly and the all-zero block has to follow ALL_F.
 * GCM_ENC_DEC and PARTIAL_BLOCK issue unaligned 16-byte loads at computed
 * offsets from these labels, and those loads straddle the neighbouring
 * constant.  For that reason they live in a plain (non-mergeable) section.
 */
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK:	.octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:		.octa 0xffffffffffffffffffffffffffffffff
		.octa 0x00000000000000000000000000000000
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040092
.text


/* Matches the three 8-byte registers pushed by FUNC_SAVE. */
#define	STACK_OFFSET	8*3

/*
 * Byte offsets of the fields of the GCM context structure.  The macros
 * below address every field relative to %arg2.
 */
#define AadHash		16*0
#define AadLen		16*1
#define InLen		(16*1)+8
#define PBlockEncKey	16*2
#define OrigIV		16*3
#define CurCount	16*4
#define PBlockLen	16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

/*
 * Argument aliases.  arg1-arg6 name bare registers and are written as
 * %arg1 ... %arg6 at the point of use; arg7-arg11 are complete memory
 * operands on the stack, located above the registers saved by FUNC_SAVE.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)

/* Key-length field, 2*15*16 bytes into the structure %arg1 points to. */
#define keysize 2*15*16(%arg1)
Mathias Krause559ad0f2010-11-29 08:35:39 +0800134#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400135
136
/* XMM register aliases. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Shares %xmm10 with BSWAP_MASK. */
#define GF128MUL_MASK %xmm10

/*
 * General-purpose register aliases, one set per word size.  TKEYP aliases
 * T1 and UKEYP aliases OUTP in both sets; TCTR_LOW (an alias of T2) exists
 * only in the 64-bit set, and in the 32-bit set OUTP aliases AREG.
 */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100181
/*
 * FUNC_SAVE: push %r12, %r13 and %r14 (3 * 8 bytes, see STACK_OFFSET).
 *
 * The states of %xmm6:%xmm15 are not saved; all %xmm registers are
 * clobbered by the code that follows.  Must be paired with FUNC_RESTORE.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
.endm
191
192
/*
 * FUNC_RESTORE: pop %r14, %r13 and %r12, i.e. undo FUNC_SAVE in reverse
 * push order.
 */
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400198
/*
 * PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 *
 * Precompute the hash keys.  Only needs to be called once per key.
 *
 * Input:    SUBKEY - operand holding a pointer to the 16-byte hash subkey.
 * Output:   HashKey, HashKey_2, HashKey_3, HashKey_4 and the matching
 *           *_k values, stored in the context structure at %arg2.
 * Clobbers: %r12, flags and the TMP1-TMP7 XMM registers.
 */
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov		\SUBKEY, %r12
	movdqu		(%r12), \TMP3
	movdqa		SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM	\TMP2, \TMP3		# byte-reflect the subkey

	# Precompute HashKey<<1 mod poly from the HashKey (required for GHASH):
	# shift each qword left by one and carry bit 63 of the low qword into
	# the high qword; the bit shifted out of the top is kept in TMP1.

	movdqa		\TMP3, \TMP2
	psllq		$1, \TMP3
	psrlq		$63, \TMP2
	movdqa		\TMP2, \TMP1
	pslldq		$8, \TMP2
	psrldq		$8, \TMP1
	por		\TMP2, \TMP3

	# Reduce HashKey<<1: fold in POLY when a bit was shifted out.

	pshufd		$0x24, \TMP1, \TMP2
	pcmpeqd		TWOONE(%rip), \TMP2
	pand		POLY(%rip), \TMP2
	pxor		\TMP2, \TMP3
	movdqu		\TMP3, HashKey(%arg2)

	movdqa		\TMP3, \TMP5
	pshufd		$78, \TMP3, \TMP1	# swap the two qwords
	pxor		\TMP3, \TMP1		# high 64 bits XOR low 64 bits
	movdqu		\TMP1, HashKey_k(%arg2)

	GHASH_MUL	\TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^2<<1 (mod poly)
	movdqu		\TMP5, HashKey_2(%arg2)
	pshufd		$78, \TMP5, \TMP1
	pxor		\TMP5, \TMP1
	movdqu		\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL	\TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^3<<1 (mod poly)
	movdqu		\TMP5, HashKey_3(%arg2)
	pshufd		$78, \TMP5, \TMP1
	pxor		\TMP5, \TMP1
	movdqu		\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL	\TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^4<<1 (mod poly)
	movdqu		\TMP5, HashKey_4(%arg2)
	pshufd		$78, \TMP5, \TMP1
	pxor		\TMP5, \TMP1
	movdqu		\TMP1, HashKey_4_k(%arg2)
.endm
Dave Watson7af964c2018-02-14 09:38:45 -0800255
/*
 * GCM_INIT Iv SUBKEY AAD AADLEN
 *
 * Initializes the GCM context structure at %arg2 to prepare for
 * encoding/decoding.
 *
 * Iv     - operand holding a pointer to the 16-byte initial counter block
 * SUBKEY - operand holding a pointer to the hash subkey (see PRECOMPUTE)
 * AAD    - operand holding a pointer to the additional authenticated data
 * AADLEN - operand holding the AAD length in bytes
 *
 * On exit the length fields, partial-block state, OrigIV, CurCount, the
 * precomputed hash keys and AadHash have been written to the context.
 *
 * Clobbers: rax, r10-r12, flags, xmm0-xmm7, xmm13 and xmm14
 *           (xmm7 via PRECOMPUTE, xmm14 via CALC_AAD_HASH).
 */
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11, %r11
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	# Only the low 8 bytes of the 16-byte slot are cleared here.
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0			# byte-reflect the counter block
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	# PRECOMPUTE takes exactly eight operands; no trailing comma.
	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
		%xmm4, %xmm5, %xmm6
.endm
279
/*
 * GCM_ENC_DEC operation
 *
 * Encodes/decodes the given data.  Assumes that the GCM context structure
 * at %arg2 has been initialized by GCM_INIT.
 *
 * operation - enc or dec
 * %arg3     - output buffer
 * %arg4     - input buffer
 * %arg5     - number of input bytes; must be at least 1 because of
 *             READ_PARTIAL_BLOCK
 *
 * Flow: finish a partial block left by a previous call (PARTIAL_BLOCK),
 * process (whole blocks mod 4) initial blocks, then four blocks per loop
 * iteration, and finally a trailing block of fewer than 16 bytes whose
 * state is recorded in PBlockLen / PBlockEncKey.
 *
 * Clobbers: rax, r10-r13, flags and xmm0-xmm15.  %arg5 is modified.
 */
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11, %r11		# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12

	# Encrypt/Decrypt first few blocks: %r12 = 16 * (block count mod 4)

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
		%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
		%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
		%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
		%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks, four at a time

	test	%r13, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	# NOTE(review): the last operand is the literal enc for both operations,
	# confirm against the GHASH_4_ENCRYPT_4_PARALLEL_* macro definitions.
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
		%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
		%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
		%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	# Fewer than 16 bytes in total: read them byte-wise so that nothing
	# outside the input buffer is touched.
	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	# Step the offset back so that a full 16-byte load ends exactly at
	# the end of the input.
	sub	$16, %r11
	add	%r13, %r11

	# Receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# Get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# Shift right 16-r13 bytes
	PSHUFB_XMM %xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2			# keep the ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0			# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# Get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# Shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes: one 8-byte store when more than 8 remain, then
	# one byte at a time.
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
435
/*
 * GCM_COMPLETE AUTHTAG AUTHTAGLEN
 *
 * Finishes the tag update for the last partial block and writes out the
 * authentication tag.  Reads AadHash, HashKey, PBlockLen, AadLen, InLen
 * and OrigIV from the context structure at %arg2.
 *
 * AUTHTAG    - operand holding a pointer to the tag output buffer
 * AUTHTAGLEN - operand holding the tag length in bytes
 *
 * A length of exactly 16 is written with one unaligned 16-byte store.
 * Any other length is written in 8-, 4-, 2- and 1-byte pieces.
 * NOTE(review): a length below 8 always begins with a 4-byte store, so
 * lengths of 1-3 write past the requested size; confirm callers never
 * pass them.
 *
 * Both len(A) and len(C) are handled as full 64-bit bit counts.
 *
 * Clobbers: rax, r10-r12, flags and xmm0, xmm1, xmm5-xmm15
 */
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	# A partial block is pending: fold it into the hash now.
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	MOVQ_R64_XMM	%r12, %xmm15	# 64-bit len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*8)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# Final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag = GHASH XOR E(K, Y0)
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
508
Mathias Krause559ad0f2010-11-29 08:35:39 +0800509#ifdef __x86_64__
/*
 * GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 *
 * Implements Data*HashKey mod (128,127,126,121,0).
 *
 * Input:    GH and HK (128 bits each, bit-reflected)
 * Output:   GH = GH * HK * x mod poly, (i.e. >>1 )
 * Clobbers: TMP1-TMP5.  HK is only read.
 *
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as
 * input: GH * HK * x mod poly is then equivalent to GH*HashKey mod poly.
 *
 * The 256-bit product is built from three carry-less multiplications
 * (a1*b1, a0*b0 and (a1+a0)*(b1+b0)) and then reduced in two phases.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa		\GH, \TMP1
	pshufd		$78, \GH, \TMP2
	pshufd		$78, \HK, \TMP3
	pxor		\GH, \TMP2		# TMP2 = a1+a0
	pxor		\HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ	0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ	0x00, \HK, \GH		# GH = a0*b0
	PCLMULQDQ	0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor		\GH, \TMP2
	pxor		\TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa		\TMP2, \TMP3
	pslldq		$8, \TMP3		# left shift TMP3 2 DWs
	psrldq		$8, \TMP2		# right shift TMP2 2 DWs
	pxor		\TMP3, \GH
	pxor		\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# First phase of the reduction

	movdqa		\GH, \TMP2
	movdqa		\GH, \TMP3
	movdqa		\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
						# in order to perform
						# independent shifts
	pslld		$31, \TMP2		# packed left shift <<31
	pslld		$30, \TMP3		# packed left shift <<30
	pslld		$25, \TMP4		# packed left shift <<25
	pxor		\TMP3, \TMP2		# xor the shifted versions
	pxor		\TMP4, \TMP2
	movdqa		\TMP2, \TMP5
	psrldq		$4, \TMP5		# right shift TMP5 1 DW
	pslldq		$12, \TMP2		# left shift TMP2 3 DWs
	pxor		\TMP2, \GH

	# Second phase of the reduction

	movdqa		\GH, \TMP2		# copy GH into TMP2, TMP3 and TMP4
						# in order to perform
						# independent shifts
	movdqa		\GH, \TMP3
	movdqa		\GH, \TMP4
	psrld		$1, \TMP2		# packed right shift >>1
	psrld		$2, \TMP3		# packed right shift >>2
	psrld		$7, \TMP4		# packed right shift >>7
	pxor		\TMP3, \TMP2		# xor the shifted versions
	pxor		\TMP4, \TMP2
	pxor		\TMP5, \TMP2
	pxor		\TMP2, \GH
	pxor		\TMP1, \GH		# result is in GH
.endm
569
/*
 * READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 *
 * Reads DLEN bytes starting at DPTR and stores them in XMMDst, where
 * 0 < DLEN < 16.  Only bytes inside [DPTR, DPTR+DLEN) are read: one
 * 8-byte load when at least 8 bytes are available, single-byte loads for
 * everything else.  A DLEN of 0 is not supported (the byte loop would
 * not terminate at the right place).
 *
 * Clobbers: %rax, DLEN, XMM1 and flags
 */
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp		$8, \DLEN
	jl		_read_lt8_\@

	# At least 8 bytes: the low qword comes from a single load.
	mov		(\DPTR), %rax
	MOVQ_R64_XMM	%rax, \XMMDst
	sub		$8, \DLEN
	jz		_done_read_partial_block_\@

	# Gather the remaining 1-7 bytes, last byte first, into %rax.
	xor		%eax, %eax
_read_next_byte_\@:
	shl		$8, %rax
	mov		7(\DPTR, \DLEN, 1), %al
	dec		\DLEN
	jnz		_read_next_byte_\@
	MOVQ_R64_XMM	%rax, \XMM1
	pslldq		$8, \XMM1		# move them into the high qword
	por		\XMM1, \XMMDst
	jmp		_done_read_partial_block_\@

_read_lt8_\@:
	# Fewer than 8 bytes: gather all of them, last byte first.
	xor		%eax, %eax
_read_next_byte_lt8_\@:
	shl		$8, %rax
	mov		-1(\DPTR, \DLEN, 1), %al
	dec		\DLEN
	jnz		_read_next_byte_lt8_\@
	MOVQ_R64_XMM	%rax, \XMMDst
_done_read_partial_block_\@:
.endm
600
/*
 * CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 *
 * Calculates the hash of the data which will not be encrypted and stores
 * it in AadHash of the context structure at %arg2.
 *
 * HASHKEY - XMM register holding the hash key, passed as HK to GHASH_MUL
 * AAD     - operand holding a pointer to the data
 * AADLEN  - operand holding its length in bytes
 *
 * Clobbers: rax (via READ_PARTIAL_BLOCK), r10, r11, flags, xmm14 and
 *           TMP1-TMP7.
 */
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ		SHUF_MASK(%rip), %xmm14
	mov		\AAD, %r10		# %r10 = AAD
	mov		\AADLEN, %r11		# %r11 = aadLen
	pxor		\TMP7, \TMP7
	pxor		\TMP6, \TMP6		# running hash starts at zero

	cmp		$16, %r11
	jl		_get_AAD_rest\@

	# Whole 16-byte blocks
_get_AAD_blocks\@:
	movdqu		(%r10), \TMP7
	PSHUFB_XMM	%xmm14, \TMP7		# byte-reflect the AAD data
	pxor		\TMP7, \TMP6
	GHASH_MUL	\TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add		$16, %r10
	sub		$16, %r11
	cmp		$16, %r11
	jge		_get_AAD_blocks\@

	movdqu		\TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	test		%r11, %r11
	je		_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM	%xmm14, \TMP7		# byte-reflect the AAD data
	pxor		\TMP6, \TMP7
	GHASH_MUL	\TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu		\TMP7, \TMP6

_get_AAD_done\@:
	movdqu		\TMP6, AadHash(%arg2)
.endm
639
/*
 * PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET
 *               AAD_HASH operation
 *
 * Handles encryption/decryption and the tag update for a partial block
 * carried over between update calls.  Does nothing when PBlockLen in the
 * context structure at %arg2 is zero.
 *
 * Requires the input data to be at least 1 byte long because of
 * READ_PARTIAL_BLOCK.  Outputs the processed bytes, advances DATA_OFFSET
 * by the number of bytes written, and updates AadHash and PBlockLen in
 * the context.
 *
 * Register roles:
 *   r13 - bytes already held in the partial block, later the number of
 *         bytes to write out
 *   r10 - PLAIN_CYPH_LEN + r13 - 16; negative when the block is still
 *         not filled by this call
 *   r12 - pointer into SHIFT_MASK / ALL_F selecting shuffle and byte masks
 *
 * Clobbers: rax, r10, r12, r13, flags, xmm0-6, xmm9-13
 */
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	test	%r13, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# Adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3		# keep the ciphertext for GHASH
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# Get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax, %rax

	mov	%rax, PBlockLen(%arg2)	# block complete, nothing pending
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# Get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax, %rax

	mov	%rax, PBlockLen(%arg2)	# block complete, nothing pending
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# Shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# Output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
783
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400784/*
785* if a = number of total plaintext bytes
786* b = floor(a/16)
787* num_initial_blocks = b mod 4
788* encrypt the initial num_initial_blocks blocks and apply ghash on
789* the ciphertext
790* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
791* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800792* arg1, %arg2, %arg3 are used as a pointer only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400793*/
794
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400795
Dave Watsone1fd3162018-02-14 09:38:12 -0800796.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800797 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Dave Watson96604742018-02-14 09:39:45 -0800798 MOVADQ SHUF_MASK(%rip), %xmm14
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200799
Dave Watsonc594c542018-02-14 09:39:36 -0800800 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200801
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200802 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800803
Dave Watson96604742018-02-14 09:39:45 -0800804 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800805
806.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800807
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500808 MOVADQ ONE(%RIP),\TMP1
809 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800810.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500811 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800812.ifc \operation, dec
813 movdqa \XMM0, %xmm\index
814.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500815 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800816.endif
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500817 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
818 pxor \TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800819.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500820 lea 0x10(%arg1),%r10
821 mov keysize,%eax
822 shr $2,%eax # 128->4, 192->6, 256->8
823 add $5,%eax # 128->9, 192->11, 256->13
824
Dave Watsone1fd3162018-02-14 09:38:12 -0800825aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500826 MOVADQ (%r10),\TMP1
827.irpc index, \i_seq
828 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800829.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500830 add $16,%r10
831 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800832 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500833
834 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800835.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500836 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800837.endr
838.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800839 movdqu (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800840 pxor \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800841 movdqu %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800842 # write back plaintext/ciphertext for num_initial_blocks
843 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800844
845.ifc \operation, dec
846 movdqa \TMP1, %xmm\index
847.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800848 PSHUFB_XMM %xmm14, %xmm\index
849
850 # prepare plaintext/ciphertext for GHASH computation
851.endr
852.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200853
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800854 # apply GHASH on num_initial_blocks blocks
855
856.if \i == 5
857 pxor %xmm5, %xmm6
858 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859 pxor %xmm6, %xmm7
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 pxor %xmm7, %xmm8
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863.elseif \i == 6
864 pxor %xmm6, %xmm7
865 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 pxor %xmm7, %xmm8
867 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
868.elseif \i == 7
869 pxor %xmm7, %xmm8
870 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
871.endif
872 cmp $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800873 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800874 # no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500880 MOVADQ ONE(%RIP),\TMP1
881 paddd \TMP1, \XMM0 # INCR Y0
882 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800883 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
884
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500885 paddd \TMP1, \XMM0 # INCR Y0
886 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800887 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
888
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500889 paddd \TMP1, \XMM0 # INCR Y0
890 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800891 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
892
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500893 paddd \TMP1, \XMM0 # INCR Y0
894 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800895 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
896
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500897 MOVADQ 0(%arg1),\TMP1
898 pxor \TMP1, \XMM1
899 pxor \TMP1, \XMM2
900 pxor \TMP1, \XMM3
901 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800902.irpc index, 1234 # do 4 rounds
903 movaps 0x10*\index(%arg1), \TMP1
904 AESENC \TMP1, \XMM1
905 AESENC \TMP1, \XMM2
906 AESENC \TMP1, \XMM3
907 AESENC \TMP1, \XMM4
908.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800909.irpc index, 56789 # do next 5 rounds
910 movaps 0x10*\index(%arg1), \TMP1
911 AESENC \TMP1, \XMM1
912 AESENC \TMP1, \XMM2
913 AESENC \TMP1, \XMM3
914 AESENC \TMP1, \XMM4
915.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500916 lea 0xa0(%arg1),%r10
917 mov keysize,%eax
918 shr $2,%eax # 128->4, 192->6, 256->8
919 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800920 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500921
Dave Watsone1fd3162018-02-14 09:38:12 -0800922aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500923 MOVADQ (%r10),\TMP2
924.irpc index, 1234
925 AESENC \TMP2, %xmm\index
926.endr
927 add $16,%r10
928 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800929 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500930
Dave Watsone1fd3162018-02-14 09:38:12 -0800931aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500932 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800933 AESENCLAST \TMP2, \XMM1
934 AESENCLAST \TMP2, \XMM2
935 AESENCLAST \TMP2, \XMM3
936 AESENCLAST \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800937 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800938 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800939.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800940 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800941 movdqa \TMP1, \XMM1
942.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800943 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800944 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800945.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800946 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800947 movdqa \TMP1, \XMM2
948.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800949 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800950 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800951.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800952 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800953 movdqa \TMP1, \XMM3
954.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800955 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800956 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800957.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800958 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800959 movdqa \TMP1, \XMM4
960.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800961 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
962 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
963 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
964 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800965.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800966
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400967 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800968 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400969 pxor \XMMDst, \XMM1
970# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800971 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800972 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800973 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
974
Dave Watsone1fd3162018-02-14 09:38:12 -0800975_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800976
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400977.endm
978
/*
* GHASH_4_ENCRYPT_4_PARALLEL_ENC
*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* The AES rounds and the carry-less multiplies are interleaved.
*
* Arguments:
*   TMP1-TMP6  scratch xmm registers
*   XMM0       counter block, incremented by ONE four times
*   XMM1-XMM4  in:  the 4 blocks to be hashed
*              out: the 4 new ciphertext blocks after the SHUF_MASK byte
*                   swap; the reduced GHASH value is XORed into XMM1
*   XMM5-XMM8  scratch, receive copies of the incoming XMM1-XMM4
*   operation  not referenced in the macro body
*
* Round keys are read through %arg1, the HashKey powers through %arg2,
* 64 bytes of input at (%arg4,%r11); 64 bytes of output are written at
* (%arg3,%r11).
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value, it is not advanced here
* Clobbers %eax, %r10 and %xmm15. %xmm15 is loaded with SHUF_MASK, so it
* must not be passed as one of the arguments.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	/* keep the blocks to be hashed; XMM1-XMM4 are reused for AES */
	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15

	/* multiply XMM5 * HashKey_4 using karatsuba */
	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6		# swap the 64-bit halves
	pxor	\XMM5, \TMP6			# TMP6 = a1+a0
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	/* round 0: XOR the first round key into the counter blocks */
	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqu	HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 2
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4

	/* multiply XMM6 * HashKey_3 using karatsuba */
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 3
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 4
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqu	HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 5
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	\TMP1, \TMP4
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6

	/* multiply XMM7 * HashKey_2 using karatsuba */
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 6
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 7
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqu	HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 8
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	\TMP1, \TMP4
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	/*
	 * Multiply XMM8 * HashKey
	 * XMM8 and TMP5 hold the values for the two operands
	 */
	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 9
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_par_enc_done\@

	/*
	 * Extra rounds needed for the larger key sizes. The macro arguments
	 * are used here, like in every other round, so the loop does not
	 * depend on the caller passing %xmm1-%xmm4.
	 */
aes_loop_par_enc\@:
	MOVADQ	(%r10),\TMP3
	AESENC	\TMP3, \XMM1
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	(%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg3,%r11,1)		# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	/* fold the last product in and recombine the karatsuba parts */
	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

	/* first phase of reduction */

	/* copy XMM5 into TMP2, TMP3, TMP4 to perform the shifts independently */
	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
	pslld	$31, \TMP2			# packed left shift << 31
	pslld	$30, \TMP3			# packed left shift << 30
	pslld	$25, \TMP4			# packed left shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	/* second phase of reduction */

	movdqa	\XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2			# packed right shift >> 1
	psrld	$2, \TMP3			# packed right shift >> 2
	psrld	$7, \TMP4			# packed right shift >> 7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5			# result is in XMM5

	pxor	\XMM5, \XMM1
.endm
1186
/*
* GHASH_4_ENCRYPT_4_PARALLEL_DEC
*
* decrypt 4 blocks at a time
* ghash the 4 previously processed ciphertext blocks
* The AES rounds and the carry-less multiplies are interleaved.
*
* Arguments:
*   TMP1-TMP6  scratch xmm registers
*   XMM0       counter block, incremented by ONE four times
*   XMM1-XMM4  in:  the 4 blocks to be hashed
*              out: the 4 input (ciphertext) blocks just consumed, after
*                   the SHUF_MASK byte swap; the reduced GHASH value is
*                   XORed into XMM1
*   XMM5-XMM8  scratch, receive copies of the incoming XMM1-XMM4
*   operation  not referenced in the macro body
*
* Round keys are read through %arg1, the HashKey powers through %arg2,
* 64 bytes of input at (%arg4,%r11); 64 bytes of output are written at
* (%arg3,%r11).
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value, it is not advanced here
* Clobbers %eax, %r10 and %xmm15. %xmm15 is loaded with SHUF_MASK, so it
* must not be passed as one of the arguments.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	/* keep the blocks to be hashed; XMM1-XMM4 are reused for AES */
	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15

	/* multiply XMM5 * HashKey_4 using karatsuba */
	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6		# swap the 64-bit halves
	pxor	\XMM5, \TMP6			# TMP6 = a1+a0
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqa	\XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	/* round 0: XOR the first round key into the counter blocks */
	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqu	HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1			# Round 2
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4

	/* multiply XMM6 * HashKey_3 using karatsuba */
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 3
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 4
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqu	HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 5
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	\TMP1, \TMP4
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6

	/* multiply XMM7 * HashKey_2 using karatsuba */
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 6
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 7
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	movdqu	HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 8
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	\TMP1, \TMP4
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	/*
	 * Multiply XMM8 * HashKey
	 * XMM8 and TMP5 hold the values for the two operands
	 */
	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1			# Round 9
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_par_dec_done\@

	/*
	 * Extra rounds needed for the larger key sizes. The macro arguments
	 * are used here, like in every other round, so the loop does not
	 * depend on the caller passing %xmm1-%xmm4.
	 */
aes_loop_par_dec\@:
	MOVADQ	(%r10),\TMP3
	AESENC	\TMP3, \XMM1
	AESENC	\TMP3, \XMM2
	AESENC	\TMP3, \XMM3
	AESENC	\TMP3, \XMM4
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	(%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	/*
	 * For each block: XOR the key stream with the input, store the
	 * result, then keep the input block in XMMn so that the input is
	 * what gets hashed by the next invocation.
	 */
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM1
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM2
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM3
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	\TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	/* fold the last product in and recombine the karatsuba parts */
	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

	/* first phase of reduction */

	/* copy XMM5 into TMP2, TMP3, TMP4 to perform the shifts independently */
	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
	pslld	$31, \TMP2			# packed left shift << 31
	pslld	$30, \TMP3			# packed left shift << 30
	pslld	$25, \TMP4			# packed left shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	/* second phase of reduction */

	movdqa	\XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2			# packed right shift >> 1
	psrld	$2, \TMP3			# packed right shift >> 2
	psrld	$7, \TMP4			# packed right shift >> 7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5			# result is in XMM5

	pxor	\XMM5, \XMM1
.endm
1398
/*
* GHASH the last 4 ciphertext blocks.
*
* Arguments:
*   TMP1-TMP7  scratch xmm registers
*   XMM1-XMM4  the 4 blocks to hash; overwritten
*   XMMDst     receives the reduced result
*
* XMM1 is multiplied by HashKey_4, XMM2 by HashKey_3, XMM3 by HashKey_2
* and XMM4 by HashKey (all read through %arg2). The partial products are
* accumulated and reduced once at the end.
*/
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	/* Multiply XMM1 * HashKey_4 (using Karatsuba) */

	movdqa	\XMM1, \TMP6
	pshufd	$78, \XMM1, \TMP2
	pxor	\XMM1, \TMP2
	movdqu	HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqu	HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	/* Multiply XMM2 * HashKey_3 (using Karatsuba) */

	movdqa	\XMM2, \TMP1
	pshufd	$78, \XMM2, \TMP2
	pxor	\XMM2, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqu	HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM2, \XMMDst
	pxor	\TMP2, \XMM1
	/* results accumulated in TMP6, XMMDst, XMM1 */

	/* Multiply XMM3 * HashKey_2 (using Karatsuba) */

	movdqa	\XMM3, \TMP1
	pshufd	$78, \XMM3, \TMP2
	pxor	\XMM3, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqu	HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM3, \XMMDst
	pxor	\TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

	/* Multiply XMM4 * HashKey (using Karatsuba) */
	movdqa	\XMM4, \TMP1
	pshufd	$78, \XMM4, \TMP2
	pxor	\XMM4, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqu	HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM4, \XMMDst
	pxor	\XMM1, \TMP2
	pxor	\TMP6, \TMP2
	pxor	\XMMDst, \TMP2
	/* middle section of the temp results combined as in karatsuba algorithm */
	movdqa	\TMP2, \TMP4
	pslldq	$8, \TMP4			# left shift TMP4 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP4, \XMMDst
	pxor	\TMP2, \TMP6
	/* TMP6:XMMDst holds the result of the accumulated carry-less multiplications */

	/* first phase of the reduction */
	/* copy XMMDst into TMP2, TMP3, TMP4 to perform the 3 shifts independently */
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	pslld	$31, \TMP2			# packed left shift << 31
	pslld	$30, \TMP3			# packed left shift << 30
	pslld	$25, \TMP4			# packed left shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP7
	psrldq	$4, \TMP7			# right shift TMP7 1 DW
	pslldq	$12, \TMP2			# left shift TMP2 3 DWs
	pxor	\TMP2, \XMMDst

	/* second phase of the reduction */
	/* make 3 copies of XMMDst for doing 3 shift operations */
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2			# packed right shift >> 1
	psrld	$2, \TMP3			# packed right shift >> 2
	psrld	$7, \TMP4			# packed right shift >> 7
	pxor	\TMP3, \TMP2			# xor the shifted versions
	pxor	\TMP4, \TMP2
	pxor	\TMP7, \TMP2
	pxor	\TMP2, \XMMDst
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
.endm
1495
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001496
/* Encryption of a single block
* uses eax & r10
*
* Arguments:
*   XMM0  in: block to encrypt, out: encrypted block
*   TMP1  scratch xmm register, holds the current round key
*
* The round keys are read through %arg1; the number of rounds is derived
* from keysize.
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0		# round 0: XOR in the first round key
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	AESENC	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1		# r10 now points at the last round key
	AESENCLAST \TMP1,\XMM0
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001519/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001521* struct gcm_context_data *data
1522* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001523* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1524* const u8 *in, // Ciphertext input
1525* u64 plaintext_len, // Length of data in bytes for decryption.
1526* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1527* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528* // concatenated with 0x00000001. 16-byte aligned pointer.
1529* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530* const u8 *aad, // Additional Authentication Data (AAD)
1531* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1533* // given authentication tag and only return the plaintext if they match.
1534* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535* // (most likely), 12 or 8.
1536*
1537* Assumptions:
1538*
1539* keys:
1540* keys are pre-expanded and aligned to 16 bytes. we are using the first
1541* set of 11 keys in the data structure void *aes_ctx
1542*
1543* iv:
1544* 0 1 2 3
1545* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | Salt (From the SA) |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549* | Initialization Vector |
1550* | (This is the sequence number from IPSec header) |
1551* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552* | 0x1 |
1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*
1555*
1556*
1557* AAD:
1558* AAD padded to 128 bits with 0
1559* for example, assume AAD is a u32 vector
1560*
1561* if AAD is 8 bytes:
1562* AAD[3] = {A0, A1};
1563* padded AAD in xmm register = {A1 A0 0 0}
1564*
1565* 0 1 2 3
1566* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568* | SPI (A1) |
1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570* | 32-bit Sequence Number (A0) |
1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572* | 0x0 |
1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*
1575* AAD Format with 32-bit Sequence Number
1576*
1577* if AAD is 12 bytes:
1578* AAD[3] = {A0, A1, A2};
1579* padded AAD in xmm register = {A2 A1 A0 0}
1580*
1581* 0 1 2 3
1582* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | SPI (A2) |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588* | 64-bit Extended Sequence Number {A1,A0} |
1589* | |
1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591* | 0x0 |
1592* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594* AAD Format with 64-bit Extended Sequence Number
1595*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597*
1598*****************************************************************************/
/*
 * aesni_gcm_dec: one-shot GCM decryption.
 *
 * Runs the three GCM stages back to back inside a single
 * FUNC_SAVE/FUNC_RESTORE pair:
 *   GCM_INIT      with arguments 6-9   (iv, hash_subkey, aad, aad_len in
 *                                       the prototype comment above)
 *   GCM_ENC_DEC   in "dec" mode
 *   GCM_COMPLETE  with arguments 10-11 (auth_tag, auth_tag_len)
 *
 * NOTE(review): arg7..arg11 are written without a '%' prefix while arg6 has
 * one; presumably they are defined as memory operands rather than register
 * names - confirm against the argN definitions earlier in this file.
 */
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001608
1609
1610/*****************************************************************************
1611* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001612* struct gcm_context_data *data,
1613* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001614* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1615* const u8 *in, // Plaintext input
1616* u64 plaintext_len, // Length of data in bytes for encryption.
1617* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1618* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619* // concatenated with 0x00000001. 16-byte aligned pointer.
1620* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621* const u8 *aad, // Additional Authentication Data (AAD)
1622* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623* u8 *auth_tag, // Authenticated Tag output.
1624* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625* // 12 or 8.
1626*
1627* Assumptions:
1628*
1629* keys:
1630* keys are pre-expanded and aligned to 16 bytes. we are using the
1631* first set of 11 keys in the data structure void *aes_ctx
1632*
1633*
1634* iv:
1635* 0 1 2 3
1636* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | Salt (From the SA) |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640* | Initialization Vector |
1641* | (This is the sequence number from IPSec header) |
1642* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643* | 0x1 |
1644* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*
1646*
1647*
1648* AAD:
1649* AAD padded to 128 bits with 0
1650* for example, assume AAD is a u32 vector
1651*
1652* if AAD is 8 bytes:
1653* AAD[2] = {A0, A1};
1654* padded AAD in xmm register = {A1 A0 0 0}
1655*
1656* 0 1 2 3
1657* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659* | SPI (A1) |
1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661* | 32-bit Sequence Number (A0) |
1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663* | 0x0 |
1664* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665*
1666* AAD Format with 32-bit Sequence Number
1667*
1668* if AAD is 12 bytes:
1669* AAD[3] = {A0, A1, A2};
1670* padded AAD in xmm register = {A2 A1 A0 0}
1671*
1672* 0 1 2 3
1673* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675* | SPI (A2) |
1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677* | 64-bit Extended Sequence Number {A1,A0} |
1678* | |
1679* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680* | 0x0 |
1681* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682*
1683* AAD Format with 64-bit Extended Sequence Number
1684*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001685* poly = x^128 + x^127 + x^126 + x^121 + 1
1686***************************************************************************/
/*
 * aesni_gcm_enc: one-shot GCM encryption.
 *
 * Runs the three GCM stages back to back inside a single
 * FUNC_SAVE/FUNC_RESTORE pair:
 *   GCM_INIT      with arguments 6-9   (iv, hash_subkey, aad, aad_len in
 *                                       the prototype comment above)
 *   GCM_ENC_DEC   in "enc" mode
 *   GCM_COMPLETE  with arguments 10-11 (auth_tag, auth_tag_len)
 *
 * NOTE(review): arg7..arg11 are written without a '%' prefix while arg6 has
 * one; presumably they are defined as memory operands rather than register
 * names - confirm against the argN definitions earlier in this file.
 */
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001697
/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
*		      struct gcm_context_data *data,
*					// context data
*		      u8 *iv,		// Pre-counter block j0: 4 byte salt (from Security Association)
*					// concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*					// concatenated with 0x00000001. 16-byte aligned pointer.
*		      u8 *hash_subkey,	// H, the Hash sub key input. Data starts on a 16-byte boundary.
*		      const u8 *aad,	// Additional Authentication Data (AAD)
*		      u64 aad_len)	// Length of AAD in bytes.
*
* First stage of the split (init / update / finalize) GCM interface: only
* GCM_INIT is run, fed with arguments 3-6 (iv, hash_subkey, aad, aad_len).
*/
ENTRY(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4, %arg5, %arg6
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_init)
1715
/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
*			    struct gcm_context_data *data,
*						// context data
*			    u8 *out,		// Ciphertext output. Encrypt in-place is allowed.
*			    const u8 *in,	// Plaintext input
*			    u64 plaintext_len);	// Length of data in bytes for encryption.
*
* Middle stage of the split GCM interface: runs only GCM_ENC_DEC in "enc"
* mode. No GCM_INIT or GCM_COMPLETE is performed here.
*/
ENTRY(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc_update)
1730
/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
*			    struct gcm_context_data *data,
*						// context data
*			    u8 *out,		// Plaintext output. In-place operation is allowed.
*			    const u8 *in,	// Ciphertext input
*			    u64 plaintext_len);	// Length of data in bytes for decryption.
*
* Middle stage of the split GCM interface: runs only GCM_ENC_DEC in "dec"
* mode. No GCM_INIT or GCM_COMPLETE is performed here.
*/
ENTRY(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec_update)
1745
/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
*			  struct gcm_context_data *data,
*						// context data
*			  u8 *auth_tag,		// Authenticated Tag output.
*			  u64 auth_tag_len);	// Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*						// 12 or 8.
*
* Last stage of the split GCM interface: runs only GCM_COMPLETE, fed with
* arguments 3-4 (auth_tag, auth_tag_len).
*/
ENTRY(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3, %arg4
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_finalize)
1760
Mathias Krause559ad0f2010-11-29 08:35:39 +08001761#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001762
1763
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * (two labels, one body)
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result; only dword 3 is used
 *	%xmm4:	dword 0 must be zero on entry (it is still zero on exit)
 *	TKEYP:	where the new round key is stored
 * output:
 *	%xmm0:	new round key, also written to (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
	/*
	 * The two shufps/pxor pairs turn xmm0 = {w0, w1, w2, w3} into the
	 * running XOR {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}.
	 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# mix in the broadcast assist word
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001778
/*
 * _key_expansion_192a: internal ABI
 * input:
 *	%xmm0:	most recent four key-schedule dwords
 *	%xmm1:	AESKEYGENASSIST result; only dword 1 is used
 *	%xmm2:	most recent two trailing dwords (in its low 64 bits)
 *	%xmm4:	dword 0 must be zero on entry (it is still zero on exit)
 *	TKEYP:	where the new key material is stored
 * output:
 *	%xmm0, %xmm2: next key-schedule dwords
 *	32 bytes written at (TKEYP); TKEYP advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	/* running XOR over the four dwords of xmm0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6			# keep the old xmm2 for the store below
	pslldq $4, %xmm5			# old xmm2 shifted up by one dword
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# {old xmm2[0..1], new xmm0[0..1]}
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1	# {new xmm0[2..3], new xmm2[0..1]}
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001803
/*
 * _key_expansion_192b: internal ABI
 * input:
 *	%xmm0:	most recent four key-schedule dwords
 *	%xmm1:	AESKEYGENASSIST result; only dword 1 is used
 *	%xmm2:	most recent two trailing dwords (in its low 64 bits)
 *	%xmm4:	dword 0 must be zero on entry (it is still zero on exit)
 *	TKEYP:	where the new round key is stored
 * output:
 *	%xmm0:	next four dwords, also written to (TKEYP)
 *	%xmm2:	next trailing dwords (updated but not stored here)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	/* running XOR over the four dwords of xmm0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5			# old xmm2 shifted up by one dword
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001823
/*
 * _key_expansion_256b: internal ABI
 * input:
 *	%xmm1:	AESKEYGENASSIST result; only dword 2 is used
 *	%xmm2:	round key two positions back
 *	%xmm4:	dword 0 must be zero on entry (it is still zero on exit)
 *	TKEYP:	where the new round key is stored
 * output:
 *	%xmm2:	new round key, also written to (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1
	/* running XOR over the four dwords of xmm2 */
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2			# mix in the broadcast assist word
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001836
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands the user key into a full AES key schedule inside ctx:
 *   offset   0: encryption round keys, starting with the raw user key
 *   offset 240: decryption round keys (encryption keys in reverse order,
 *               the inner ones passed through AESIMC)
 *   offset 480: key_len, as passed in
 *
 * Only the low byte of key_len is examined: below 24 selects the 128-bit
 * expansion, exactly 24 the 192-bit one and anything above the 256-bit one.
 * No other validation of key_len is done here.
 *
 * On 32-bit builds the arguments are fetched from the stack and KEYP is
 * saved/restored; on 64-bit builds they are used as they arrive in KEYP,
 * UKEYP and %edx.  in_key may be unaligned (movups); ctx is accessed with
 * movaps and therefore has to be 16-byte aligned.
 *
 * AREG is cleared before returning (presumably the int return value 0).
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	/* 256-bit key: the second 16 user-key bytes are round key 1 */
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	/* 192-bit key: the remaining 8 user-key bytes go into xmm2 */
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	/*
	 * Build the decryption schedule at ctx + 240.  The first and last
	 * encryption round keys are swapped into place unchanged; the loop
	 * below handles the keys in between.
	 */
	sub $0x10, TKEYP		# TKEYP -> last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP now walks the output downwards
.align 4
.Ldec_key_loop:
	/* read forwards, transform with AESIMC, write backwards */
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001951
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts one 16-byte block: loads it from src (unaligned access is fine),
 * runs it through _aesni_enc1 with the key length read from offset 480 of
 * ctx, and stores the result to dst.
 *
 * On 32-bit builds the arguments are fetched from the stack and KEYP/KLEN
 * are saved and restored around the body.
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001975
/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that all three key sizes share the same tail of the
 * round sequence: longer keys add 0x20 per extra pair of rounds and enter
 * earlier, at negative offsets.  In every case the first AESENC uses the
 * round key at KEYP + 0x10 and AESENCLAST uses the one at TKEYP + 0x70.
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .Lenc192
	add $0x20, TKEYP
	/* two extra rounds, executed for 256-bit keys only */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	/* two extra rounds, executed for 192- and 256-bit keys */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	/* nine AESENC rounds plus AESENCLAST, common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002033
/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared against 24 to pick the
 *			128-, 192- or 256-bit round sequence)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Four-block variant of _aesni_enc1: each round key is loaded once and
 * applied to all four states.  TKEYP is biased the same way, so the first
 * AESENC uses the round key at KEYP + 0x10 and AESENCLAST the one at
 * TKEYP + 0x70 for every key size.
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .L4enc192
	add $0x20, TKEYP
	/* two extra rounds, executed for 256-bit keys only */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
/* .align 4 is deliberately left disabled here, unlike in _aesni_dec4 */
.L4enc192:
	/* two extra rounds, executed for 192- and 256-bit keys */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
/* .align 4 is deliberately left disabled here, unlike in _aesni_dec4 */
.L4enc128:
	/* nine AESENC rounds plus AESENCLAST, common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002142
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts one 16-byte block: loads it from src (unaligned access is fine),
 * runs it through _aesni_dec1 and stores the result to dst.  The key length
 * is read from offset 480 of ctx, after which KEYP is advanced by 240 bytes
 * so that _aesni_dec1 walks the round keys stored there instead of the ones
 * at offset 0.
 *
 * On 32-bit builds the arguments are fetched from the stack and KEYP/KLEN
 * are saved and restored around the body.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# switch to the key set at ctx + 240
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002167
/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Round keys are consumed in ascending address order starting at (KEYP).
 * TKEYP is biased so that all three key sizes share the same tail of the
 * round sequence: the first AESDEC always uses the key at KEYP + 0x10 and
 * AESDECLAST the one at TKEYP + 0x70.
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .Ldec192
	add $0x20, TKEYP
	/* two extra rounds, executed for 256-bit keys only */
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	/* two extra rounds, executed for 192- and 256-bit keys */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	/* nine AESDEC rounds plus AESDECLAST, common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002225
/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Four-block variant of _aesni_dec1: each round key is loaded once and
 * applied to all four states.  TKEYP is biased the same way, so the first
 * AESDEC uses the round key at KEYP + 0x10 and AESDECLAST the one at
 * TKEYP + 0x70 for every key size.
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .L4dec192
	add $0x20, TKEYP
	/* two extra rounds, executed for 256-bit keys only */
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	/* two extra rounds, executed for 192- and 256-bit keys */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	/* nine AESDEC rounds plus AESDECLAST, common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002334
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt whole 16-byte blocks from src into dst.  Data is handled in
 * 64-byte groups through _aesni_enc4 while at least 64 bytes remain, then
 * block by block through _aesni_enc1.  A trailing partial block (fewer than
 * 16 bytes) is left untouched.
 *
 * KLEN is loaded from offset 480 of the context (presumably the key length
 * field; verify against struct crypto_aes_ctx).
 *
 * On 32-bit builds the arguments are fetched from the stack and LEN, KEYP
 * and KLEN are saved and restored around the body.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret	# less than one full block: nothing to do
	cmp $64, LEN
	jb .Lecb_enc_loop1	# fewer than four blocks: single-block path
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# LEN is unsigned: use the unsigned condition
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned, matching the entry checks
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002394
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt whole 16-byte blocks from src into dst.  Data is handled in
 * 64-byte groups through _aesni_dec4 while at least 64 bytes remain, then
 * block by block through _aesni_dec1.  A trailing partial block (fewer than
 * 16 bytes) is left untouched.
 *
 * KLEN is loaded from offset 480 of the context and KEYP is advanced by 240
 * bytes before the decrypt helpers run (presumably to the decryption key
 * schedule; verify against struct crypto_aes_ctx).
 *
 * On 32-bit builds the arguments are fetched from the stack and LEN, KEYP
 * and KLEN are saved and restored around the body.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret	# less than one full block: nothing to do
	cmp $64, LEN
	jb .Lecb_dec_loop1	# fewer than four blocks: single-block path
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# LEN is unsigned: use the unsigned condition
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned, matching the entry checks
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002455
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt whole 16-byte blocks from src into dst.  STATE starts as the
 * IV read from *iv; each plaintext block is XORed into STATE, encrypted by
 * _aesni_enc1 and stored, so STATE always holds the previous ciphertext
 * block.  When at least one block was processed, the final STATE is written
 * back to *iv.  If len is below 16 nothing is read or written.
 *
 * On 32-bit builds the arguments are fetched from the stack and IVP, LEN,
 * KEYP and KLEN are saved and restored around the body.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# LEN is unsigned: use the unsigned condition
	movups STATE, (IVP)	# hand the last ciphertext block back as next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002499
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt whole 16-byte blocks from src into dst.  Four blocks at a time
 * go through _aesni_dec4 while at least 64 bytes remain, then single blocks
 * through _aesni_dec1.  Each decrypted block is XORed with the preceding
 * ciphertext block (the IV for the very first one).  When at least one block
 * was processed, the last ciphertext block is written back to *iv.  If len
 * is below 16 nothing is read or written.
 *
 * KLEN is loaded from offset 480 of the context and KEYP is advanced by 240
 * bytes before the decrypt helpers run (presumably to the decryption key
 * schedule; verify against struct crypto_aes_ctx).
 *
 * The 32-bit build only uses IN1/IN2 to hold ciphertext, so blocks 0 and 1
 * are reloaded from src after the decryption; all stores to dst happen after
 * those reloads.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# nothing processed: leave *iv alone
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1	# IN1 now holds ciphertext block 2
	movaps IN1, STATE3
	movups 0x30(INP), IN2	# IN2 now holds ciphertext block 3
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4	# block 3 ^= ciphertext block 2
	movaps IN2, IV		# ciphertext block 3 becomes the next iv
	movups (INP), IN1	# reload ciphertext block 0
	pxor IN1, STATE2
	movups 0x10(INP), IN2	# reload ciphertext block 1
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# LEN is unsigned: use the unsigned condition
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block chains into the next one
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned, matching the entry checks
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002592
Mathias Krause0d258ef2010-11-27 16:34:46 +08002593#ifdef __x86_64__
/*
 * 16-byte shuffle control that reverses the byte order of an XMM register;
 * loaded into BSWAP_MASK by _aesni_inc_init.
 */
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8
	.byte 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002599
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * The mask is fetched RIP-relative: this code is only built for x86_64,
 * where that form is shorter and does not depend on the absolute link
 * address of .Lbswap_mask.
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# CTR = byte-reversed IV
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002621
/*
 * _aesni_inc:		internal ABI
 *	Advance the big endian counter block in IV by one.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	incremented by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * The general purpose copy of the low qword exists only to produce a carry
 * flag; paddq itself leaves the flags alone.  On carry, INC is temporarily
 * shifted into the upper qword so the same paddq can bump the high half,
 * and is then shifted back.
 */
.align 4
_aesni_inc:
	add $1, TCTR_LOW		# sets CF when the low qword wraps
	paddq INC, CTR			# low qword += 1, flags untouched
	jnc .Linc_low
	pslldq $8, INC			# INC = 1 in the high qword
	paddq INC, CTR			# propagate the carry
	psrldq $8, INC			# restore INC = 1 in the low qword
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002650
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode transform of whole 16-byte blocks from src into dst.  Each block
 * of key stream is the encryption of the current IV; the IV is advanced with
 * _aesni_inc after every block.  Four blocks at a time go through
 * _aesni_enc4 while at least 64 bytes remain, then single blocks through
 * _aesni_enc1.  When at least one block was processed, the next counter
 * value is written back to *iv.  If len is below 16 nothing is read or
 * written.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# nothing processed: leave *iv alone
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# LEN is unsigned: use the unsigned condition
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned, matching the entry checks
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002713
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * paddq doubles each 64-bit half of IV on its own, which drops the top bit
 * of both halves.  Before that, pshufd/psrad spread the sign bits of the
 * selected dwords of IV across CTR; masking that with GF128MUL_MASK yields
 * the correction that is finally XORed into the doubled IV.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	psrad $31, CTR; \
	paddq IV, IV; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2731
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * XTS-process exactly eight 16-byte blocks (128 bytes) from src into dst.
 *
 * The low byte of the fourth argument (%cl) selects the direction: when it
 * is zero, _aesni_dec4 is used and KEYP is advanced by 240 bytes; otherwise
 * _aesni_enc4 is used with KEYP unchanged.
 *
 * For every block the current tweak (IV) is XORed into the input and also
 * parked in the matching dst slot; after the cipher call it is read back
 * from dst, XORed into the result and overwritten with the final output.
 * INC serves as a scratch register throughout.  The tweak that follows the
 * eighth block is stored back to *iv.
 *
 * Symbol references are RIP-relative: this code is only built for x86_64,
 * where that form is shorter and independent of the absolute link address.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl
	movl $0, %ecx			# mov, not xor: flags from cmpb feed the cmov pair
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx		# enc == 0: key offset 240 ...
	cmoveq %rax, %r11		# ... and the decrypt helper

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	/* blocks 0-3: apply tweak to input, park tweak in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	/*
	 * Finish blocks 0-3 with their parked tweaks while preparing
	 * blocks 4-7 in the state registers that have just been freed.
	 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	/* tweak for the block after these eight goes back to the caller */
	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC %r11

	/* finish blocks 4-7 with their parked tweaks */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2841
Mathias Krause0d258ef2010-11-27 16:34:46 +08002842#endif