blob: c00f32c39360f38c2e8765875636d1a6a234d8de [file] [log] [blame]
Huang Ying54b6a1b2009-01-18 16:28:34 +11001/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
Tadeusz Struk0bd82f52010-11-04 15:00:45 -040012 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
Mathias Krause0d258ef2010-11-27 16:34:46 +080023 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
Huang Ying54b6a1b2009-01-18 16:28:34 +110026 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -060034#include <asm/frame.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000035#include <asm/nospec-branch.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110036
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can done for either FP or integer values, for FP use
 * movaps (move aligned packed single) or integer use movdqa (move double quad
 * aligned).  It doesn't make a performance difference which instruction is used
 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
 * shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
47
#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
# mask used by the XTS tweak multiplication (GF(2^128), "ble" bit order)
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
# GHASH reduction polynomial (bit-reflected x^128 + x^7 + x^2 + x + 1)
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
# byte-reverse shuffle mask used to byte-reflect data for GHASH
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
# counter increment for the CTR mode counter block
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

# Byte offsets of the fields inside the gcm_context_data struct pointed to
# by %arg2 (state carried between init/update/finalize calls).
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

# SysV AMD64 argument registers; args 7..11 arrive on the stack above the
# three registers pushed by FUNC_SAVE (hence STACK_OFFSET).
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400135
136
# Symbolic register names for the plain AES routines.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
# 32-bit builds: fewer registers, so several aliases share a register.
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +1100181
# Save the callee-saved GPRs (%r12-%r14) used by the GCM routines; must be
# paired with FUNC_RESTORE before returning.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm
191
192
# Restore the callee-saved GPRs pushed by FUNC_SAVE (reverse order).
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400198
# Precompute hashkeys.
# Input: Hash subkey (pointed to by arg7 on the stack).
# Output: HashKey^1..^4 (<<1 mod poly) and their Karatsuba XOR halves,
# stored in gcm_context_data (%arg2).  Only needs to be called once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	arg7, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3			# byte-reflect the hash subkey

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3			# shift each qword left by 1
	psrlq	$63, \TMP2			# capture the carried-out bits
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1			# TMP1 = carry out of the high qword
	por	\TMP2, \TMP3			# propagate low-qword carry into high qword

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2		# conditionally select the reduction poly
	pxor	\TMP2, \TMP3
	movdqa	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP5, \TMP1		# swap the two qword halves
	pxor	\TMP3, \TMP1			# high64 XOR low64 (for Karatsuba)
	movdqa	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%arg2)
.endm
Dave Watson7af964c2018-02-14 09:38:45 -0800255
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT
	mov	arg9, %r11
	mov	%r11, AadLen(%arg2)	# ctx_data.aad_length = aad_length
	xor	%r11, %r11
	mov	%r11, InLen(%arg2)	# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov	%arg6, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0		# byte-reflect the IV for counter use
	movdqu	%xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	# derive HashKey^1..^4 from the raw subkey and cache them in the context
	PRECOMPUTE %xmm1 %xmm2 %xmm3 %xmm4 %xmm5 %xmm6 %xmm7
	movdqa	HashKey(%arg2), %xmm13

	# fold the (unencrypted) AAD into AadHash
	CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
	%xmm5 %xmm6
.endm
279
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
# \operation selects enc or dec variants of the helper macros.
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8	# %xmm8 = running GHASH state
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)	# total bytes processed so far

	xor	%r11, %r11		# initialise the data pointer offset as zero
	# first complete any partial block left over from a previous update call
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12		# %r12 = (number of 16-byte blocks) mod 4, in bits 4-5
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks, 4 blocks at a time

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	# NOTE(review): the trailing literal 'enc' operand appears in both the
	# enc and dec instantiations — confirm the _ENC/_DEC macro variants
	# ignore it.
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)	# persist GHASH state
	movdqu	%xmm0, CurCount(%arg2)	# persist counter

	mov	%arg5, %r13
	and	$15, %r13		# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)	# remember partial-block length

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)	# save keystream for later completion

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	# total message shorter than 16 bytes: byte-wise partial read
	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	# at least 16 bytes exist before the tail; read a full (overlapping)
	# 16-byte block ending at the end of the input, then shift into place
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM %xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2		# keep ciphertext for GHASH when decrypting
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8		# GHASH absorbs the ciphertext
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8		# GHASH absorbs the ciphertext
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)	# write first 8 bytes
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)	# write remaining bytes one at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
435
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG), written to arg10 (arg11 = tag length,
# 1..16 bytes supported).
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	# a partial block was left pending: fold it into the hash now
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag = E(K, Y0) XOR GHASH
_return_T_\@:
	mov	arg10, %r10		# %r10 = authTag
	mov	arg11, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	# 1-3 bytes remain: emit a word and/or a single byte
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
508
Mathias Krause559ad0f2010-11-29 08:35:39 +0800509#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Uses Karatsuba: one PCLMULQDQ per half-product plus one for the cross term.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
569
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
# Never reads past DPTR+DLEN-1, so it is safe at the end of a buffer.
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	# >= 8 bytes: bulk-load the first qword, then byte-load the tail
	mov	(\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	# accumulate tail bytes high-to-low so they land in order
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1		# place tail in the high qword
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
600
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Input: AAD pointer in arg8, AAD length in arg9, \HASHKEY = HashKey<<1.
# Output: hash of the AAD stored in AadHash(%arg2) (and in \TMP6).
# clobbers r10-11, xmm14 (and the TMP registers passed in)
.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg8, %r10		# %r10 = AAD
	mov	   arg9, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6		# running hash starts at zero

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	# absorb full 16-byte AAD blocks
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm
639
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# DATA_OFFSET is advanced past the bytes consumed here.
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13	# reload; READ_PARTIAL_BLOCK clobbered scratch

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9	# keystream saved for this partial block
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes already consumed of the current block,
	# i.e. the partial-block length mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3		# keep ciphertext copy for GHASH
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH	# GHASH absorbs the masked ciphertext

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)	# partial block now complete
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH	# GHASH absorbs the masked ciphertext

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)	# partial block now complete
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
783
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400784/*
785* if a = number of total plaintext bytes
786* b = floor(a/16)
787* num_initial_blocks = b mod 4
788* encrypt the initial num_initial_blocks blocks and apply ghash on
789* the ciphertext
790* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
791* are clobbered
Dave Watson1476db22018-02-14 09:40:10 -0800792* arg1, %arg2, %arg3 are used as a pointer only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400793*/
794
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400795
Dave Watsone1fd3162018-02-14 09:38:12 -0800796.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
Dave Watsonc594c542018-02-14 09:39:36 -0800797 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
Dave Watson96604742018-02-14 09:39:45 -0800798 MOVADQ SHUF_MASK(%rip), %xmm14
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200799
Dave Watsonc594c542018-02-14 09:39:36 -0800800 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200801
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200802 # start AES for num_initial_blocks blocks
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800803
Dave Watson96604742018-02-14 09:39:45 -0800804 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800805
806.if (\i == 5) || (\i == 6) || (\i == 7)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800807
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500808 MOVADQ ONE(%RIP),\TMP1
809 MOVADQ 0(%arg1),\TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800810.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500811 paddd \TMP1, \XMM0 # INCR Y0
Dave Watsone1fd3162018-02-14 09:38:12 -0800812.ifc \operation, dec
813 movdqa \XMM0, %xmm\index
814.else
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500815 MOVADQ \XMM0, %xmm\index
Dave Watsone1fd3162018-02-14 09:38:12 -0800816.endif
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500817 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
818 pxor \TMP2, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800819.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500820 lea 0x10(%arg1),%r10
821 mov keysize,%eax
822 shr $2,%eax # 128->4, 192->6, 256->8
823 add $5,%eax # 128->9, 192->11, 256->13
824
Dave Watsone1fd3162018-02-14 09:38:12 -0800825aes_loop_initial_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500826 MOVADQ (%r10),\TMP1
827.irpc index, \i_seq
828 AESENC \TMP1, %xmm\index
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800829.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500830 add $16,%r10
831 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800832 jnz aes_loop_initial_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500833
834 MOVADQ (%r10), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800835.irpc index, \i_seq
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500836 AESENCLAST \TMP1, %xmm\index # Last Round
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800837.endr
838.irpc index, \i_seq
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800839 movdqu (%arg4 , %r11, 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800840 pxor \TMP1, %xmm\index
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800841 movdqu %xmm\index, (%arg3 , %r11, 1)
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800842 # write back plaintext/ciphertext for num_initial_blocks
843 add $16, %r11
Dave Watsone1fd3162018-02-14 09:38:12 -0800844
845.ifc \operation, dec
846 movdqa \TMP1, %xmm\index
847.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800848 PSHUFB_XMM %xmm14, %xmm\index
849
850 # prepare plaintext/ciphertext for GHASH computation
851.endr
852.endif
Sabrina Dubroca0487cca2017-04-28 18:11:56 +0200853
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800854 # apply GHASH on num_initial_blocks blocks
855
856.if \i == 5
857 pxor %xmm5, %xmm6
858 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859 pxor %xmm6, %xmm7
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 pxor %xmm7, %xmm8
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863.elseif \i == 6
864 pxor %xmm6, %xmm7
865 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 pxor %xmm7, %xmm8
867 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
868.elseif \i == 7
869 pxor %xmm7, %xmm8
870 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
871.endif
872 cmp $64, %r13
Dave Watsone1fd3162018-02-14 09:38:12 -0800873 jl _initial_blocks_done\@
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800874 # no need for precomputed values
875/*
876*
877* Precomputations for HashKey parallel with encryption of first 4 blocks.
878* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
879*/
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500880 MOVADQ ONE(%RIP),\TMP1
881 paddd \TMP1, \XMM0 # INCR Y0
882 MOVADQ \XMM0, \XMM1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800883 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
884
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500885 paddd \TMP1, \XMM0 # INCR Y0
886 MOVADQ \XMM0, \XMM2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800887 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
888
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500889 paddd \TMP1, \XMM0 # INCR Y0
890 MOVADQ \XMM0, \XMM3
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800891 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
892
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500893 paddd \TMP1, \XMM0 # INCR Y0
894 MOVADQ \XMM0, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800895 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
896
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500897 MOVADQ 0(%arg1),\TMP1
898 pxor \TMP1, \XMM1
899 pxor \TMP1, \XMM2
900 pxor \TMP1, \XMM3
901 pxor \TMP1, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800902.irpc index, 1234 # do 4 rounds
903 movaps 0x10*\index(%arg1), \TMP1
904 AESENC \TMP1, \XMM1
905 AESENC \TMP1, \XMM2
906 AESENC \TMP1, \XMM3
907 AESENC \TMP1, \XMM4
908.endr
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800909.irpc index, 56789 # do next 5 rounds
910 movaps 0x10*\index(%arg1), \TMP1
911 AESENC \TMP1, \XMM1
912 AESENC \TMP1, \XMM2
913 AESENC \TMP1, \XMM3
914 AESENC \TMP1, \XMM4
915.endr
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500916 lea 0xa0(%arg1),%r10
917 mov keysize,%eax
918 shr $2,%eax # 128->4, 192->6, 256->8
919 sub $4,%eax # 128->0, 192->2, 256->4
Dave Watsone1fd3162018-02-14 09:38:12 -0800920 jz aes_loop_pre_done\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500921
Dave Watsone1fd3162018-02-14 09:38:12 -0800922aes_loop_pre_\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500923 MOVADQ (%r10),\TMP2
924.irpc index, 1234
925 AESENC \TMP2, %xmm\index
926.endr
927 add $16,%r10
928 sub $1,%eax
Dave Watsone1fd3162018-02-14 09:38:12 -0800929 jnz aes_loop_pre_\@
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500930
Dave Watsone1fd3162018-02-14 09:38:12 -0800931aes_loop_pre_done\@:
Timothy McCaffreye31ac322015-01-13 13:16:43 -0500932 MOVADQ (%r10), \TMP2
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800933 AESENCLAST \TMP2, \XMM1
934 AESENCLAST \TMP2, \XMM2
935 AESENCLAST \TMP2, \XMM3
936 AESENCLAST \TMP2, \XMM4
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800937 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800938 pxor \TMP1, \XMM1
Dave Watsone1fd3162018-02-14 09:38:12 -0800939.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800940 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800941 movdqa \TMP1, \XMM1
942.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800943 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800944 pxor \TMP1, \XMM2
Dave Watsone1fd3162018-02-14 09:38:12 -0800945.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800946 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800947 movdqa \TMP1, \XMM2
948.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800949 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800950 pxor \TMP1, \XMM3
Dave Watsone1fd3162018-02-14 09:38:12 -0800951.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800952 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800953 movdqa \TMP1, \XMM3
954.endif
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800955 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800956 pxor \TMP1, \XMM4
Dave Watsone1fd3162018-02-14 09:38:12 -0800957.ifc \operation, dec
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800958 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800959 movdqa \TMP1, \XMM4
960.else
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800961 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
962 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
963 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
964 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
Dave Watsone1fd3162018-02-14 09:38:12 -0800965.endif
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800966
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400967 add $64, %r11
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800968 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400969 pxor \XMMDst, \XMM1
970# combine GHASHed value with the corresponding ciphertext
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800971 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800972 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800973 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
974
Dave Watsone1fd3162018-02-14 09:38:12 -0800975_initial_blocks_done\@:
Tadeusz Struk3c097b82010-12-13 19:51:15 +0800976
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400977.endm
978
979/*
980* encrypt 4 blocks at a time
981* ghash the 4 previously encrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -0800982* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400983* %r11 is the data offset value
984*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Encrypt 4 counter blocks and, interleaved with the AES rounds to hide
# latency, GHASH the 4 previously encrypted blocks (\XMM1-\XMM4 on entry).
# %arg1 = key schedule, %arg2 = context (precomputed HashKey powers),
# %arg3 = out, %arg4 = in, %r11 = data offset; clobbers %rax, %r10, xmm15.

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# round 0: XOR with first round key
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

# NOTE: labels are \@-suffixed (unique per expansion) so the macro can be
# instantiated more than once, matching aes_loop_pre_\@ and friends.
aes_loop_par_enc_\@:
	MOVADQ	  (%r10),\TMP3
.irpc index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc_\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1			# fold GHASH state into XMM1
.endm
1186
1187/*
1188* decrypt 4 blocks at a time
1189* ghash the 4 previously decrypted ciphertext blocks
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001190* arg1, %arg3, %arg4 are used as pointers only, not modified
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001191* %r11 is the data offset value
1192*/
1193.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1194TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1195
1196 movdqa \XMM1, \XMM5
1197 movdqa \XMM2, \XMM6
1198 movdqa \XMM3, \XMM7
1199 movdqa \XMM4, \XMM8
1200
1201 movdqa SHUF_MASK(%rip), %xmm15
1202 # multiply TMP5 * HashKey using karatsuba
1203
1204 movdqa \XMM5, \TMP4
1205 pshufd $78, \XMM5, \TMP6
1206 pxor \XMM5, \TMP6
1207 paddd ONE(%rip), \XMM0 # INCR CNT
Dave Watson1476db22018-02-14 09:40:10 -08001208 movdqa HashKey_4(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001209 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1210 movdqa \XMM0, \XMM1
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1212 movdqa \XMM0, \XMM2
1213 paddd ONE(%rip), \XMM0 # INCR CNT
1214 movdqa \XMM0, \XMM3
1215 paddd ONE(%rip), \XMM0 # INCR CNT
1216 movdqa \XMM0, \XMM4
1217 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1218 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1219 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1220 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1221 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1222
1223 pxor (%arg1), \XMM1
1224 pxor (%arg1), \XMM2
1225 pxor (%arg1), \XMM3
1226 pxor (%arg1), \XMM4
Dave Watson1476db22018-02-14 09:40:10 -08001227 movdqa HashKey_4_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001228 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1229 movaps 0x10(%arg1), \TMP1
1230 AESENC \TMP1, \XMM1 # Round 1
1231 AESENC \TMP1, \XMM2
1232 AESENC \TMP1, \XMM3
1233 AESENC \TMP1, \XMM4
1234 movaps 0x20(%arg1), \TMP1
1235 AESENC \TMP1, \XMM1 # Round 2
1236 AESENC \TMP1, \XMM2
1237 AESENC \TMP1, \XMM3
1238 AESENC \TMP1, \XMM4
1239 movdqa \XMM6, \TMP1
1240 pshufd $78, \XMM6, \TMP2
1241 pxor \XMM6, \TMP2
Dave Watson1476db22018-02-14 09:40:10 -08001242 movdqa HashKey_3(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001243 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1244 movaps 0x30(%arg1), \TMP3
1245 AESENC \TMP3, \XMM1 # Round 3
1246 AESENC \TMP3, \XMM2
1247 AESENC \TMP3, \XMM3
1248 AESENC \TMP3, \XMM4
1249 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1250 movaps 0x40(%arg1), \TMP3
1251 AESENC \TMP3, \XMM1 # Round 4
1252 AESENC \TMP3, \XMM2
1253 AESENC \TMP3, \XMM3
1254 AESENC \TMP3, \XMM4
Dave Watson1476db22018-02-14 09:40:10 -08001255 movdqa HashKey_3_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001256 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1257 movaps 0x50(%arg1), \TMP3
1258 AESENC \TMP3, \XMM1 # Round 5
1259 AESENC \TMP3, \XMM2
1260 AESENC \TMP3, \XMM3
1261 AESENC \TMP3, \XMM4
1262 pxor \TMP1, \TMP4
1263# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1264 pxor \XMM6, \XMM5
1265 pxor \TMP2, \TMP6
1266 movdqa \XMM7, \TMP1
1267 pshufd $78, \XMM7, \TMP2
1268 pxor \XMM7, \TMP2
Dave Watson1476db22018-02-14 09:40:10 -08001269 movdqa HashKey_2(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001270
1271 # Multiply TMP5 * HashKey using karatsuba
1272
1273 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1274 movaps 0x60(%arg1), \TMP3
1275 AESENC \TMP3, \XMM1 # Round 6
1276 AESENC \TMP3, \XMM2
1277 AESENC \TMP3, \XMM3
1278 AESENC \TMP3, \XMM4
1279 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1280 movaps 0x70(%arg1), \TMP3
1281 AESENC \TMP3, \XMM1 # Round 7
1282 AESENC \TMP3, \XMM2
1283 AESENC \TMP3, \XMM3
1284 AESENC \TMP3, \XMM4
Dave Watson1476db22018-02-14 09:40:10 -08001285 movdqa HashKey_2_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001286 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1287 movaps 0x80(%arg1), \TMP3
1288 AESENC \TMP3, \XMM1 # Round 8
1289 AESENC \TMP3, \XMM2
1290 AESENC \TMP3, \XMM3
1291 AESENC \TMP3, \XMM4
1292 pxor \TMP1, \TMP4
1293# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1294 pxor \XMM7, \XMM5
1295 pxor \TMP2, \TMP6
1296
1297 # Multiply XMM8 * HashKey
1298 # XMM8 and TMP5 hold the values for the two operands
1299
1300 movdqa \XMM8, \TMP1
1301 pshufd $78, \XMM8, \TMP2
1302 pxor \XMM8, \TMP2
Dave Watson1476db22018-02-14 09:40:10 -08001303 movdqa HashKey(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001304 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1305 movaps 0x90(%arg1), \TMP3
1306 AESENC \TMP3, \XMM1 # Round 9
1307 AESENC \TMP3, \XMM2
1308 AESENC \TMP3, \XMM3
1309 AESENC \TMP3, \XMM4
1310 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001311 lea 0xa0(%arg1),%r10
1312 mov keysize,%eax
1313 shr $2,%eax # 128->4, 192->6, 256->8
1314 sub $4,%eax # 128->0, 192->2, 256->4
1315 jz aes_loop_par_dec_done
1316
1317aes_loop_par_dec:
1318 MOVADQ (%r10),\TMP3
1319.irpc index, 1234
1320 AESENC \TMP3, %xmm\index
1321.endr
1322 add $16,%r10
1323 sub $1,%eax
1324 jnz aes_loop_par_dec
1325
1326aes_loop_par_dec_done:
1327 MOVADQ (%r10), \TMP3
1328 AESENCLAST \TMP3, \XMM1 # last round
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001329 AESENCLAST \TMP3, \XMM2
1330 AESENCLAST \TMP3, \XMM3
1331 AESENCLAST \TMP3, \XMM4
Dave Watson1476db22018-02-14 09:40:10 -08001332 movdqa HashKey_k(%arg2), \TMP5
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001333 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001334 movdqu (%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001335 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001336 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001337 movdqa \TMP3, \XMM1
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001338 movdqu 16(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001339 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001340 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001341 movdqa \TMP3, \XMM2
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001342 movdqu 32(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001343 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001344 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001345 movdqa \TMP3, \XMM3
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001346 movdqu 48(%arg4,%r11,1), \TMP3
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001347 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001348 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001349 movdqa \TMP3, \XMM4
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001350 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1351 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1352 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1353 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001354
1355 pxor \TMP4, \TMP1
1356 pxor \XMM8, \XMM5
1357 pxor \TMP6, \TMP2
1358 pxor \TMP1, \TMP2
1359 pxor \XMM5, \TMP2
1360 movdqa \TMP2, \TMP3
1361 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1362 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1363 pxor \TMP3, \XMM5
1364 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1365
1366 # first phase of reduction
1367
1368 movdqa \XMM5, \TMP2
1369 movdqa \XMM5, \TMP3
1370 movdqa \XMM5, \TMP4
1371# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1372 pslld $31, \TMP2 # packed right shift << 31
1373 pslld $30, \TMP3 # packed right shift << 30
1374 pslld $25, \TMP4 # packed right shift << 25
1375 pxor \TMP3, \TMP2 # xor the shifted versions
1376 pxor \TMP4, \TMP2
1377 movdqa \TMP2, \TMP5
1378 psrldq $4, \TMP5 # right shift T5 1 DW
1379 pslldq $12, \TMP2 # left shift T2 3 DWs
1380 pxor \TMP2, \XMM5
1381
1382 # second phase of reduction
1383
1384 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1385 movdqa \XMM5,\TMP3
1386 movdqa \XMM5,\TMP4
1387 psrld $1, \TMP2 # packed left shift >>1
1388 psrld $2, \TMP3 # packed left shift >>2
1389 psrld $7, \TMP4 # packed left shift >>7
1390 pxor \TMP3,\TMP2 # xor the shifted versions
1391 pxor \TMP4,\TMP2
1392 pxor \TMP5, \TMP2
1393 pxor \TMP2, \XMM5
1394 pxor \TMP1, \XMM5 # result is in TMP1
1395
1396 pxor \XMM5, \XMM1
1397.endm
1398
1399/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# GHASH the last 4 ciphertext blocks (XMM1-XMM4).  Each block is multiplied
# by the matching precomputed HashKey power from the context at %arg2
# (Karatsuba: high product, low product, middle (a1+a0)*(b1+b0) product),
# the three partial products are accumulated in TMP6:XMMDst with the middle
# part in XMM1, then reduced modulo the GCM polynomial.
# Final reduced GHASH value is left in \XMMDst.

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2			# TMP2 = a1+a0 (halves XORed)
	movdqa	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2			# TMP2 = a1+a0
	movdqa	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2			# TMP2 = accumulated middle part
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm
1495
Timothy McCaffreye31ac322015-01-13 13:16:43 -05001496
1497/* Encryption of a single block
1498* uses eax & r10
1499*/
1500
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
# Encrypt the single 16-byte block in \XMM0 in place with the expanded key
# schedule at %arg1, handling 128/192/256-bit keys via the keysize field.
# Clobbers %eax and %r10 (per the header comment above) and \TMP1.

	pxor		(%arg1), \XMM0		# round 0: XOR with first round key
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	# one full AES round per iteration; %eax = rounds remaining
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	# final round uses AESENCLAST with the last round key
	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001519/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001521* struct gcm_context_data *data
1522* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001523* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1524* const u8 *in, // Ciphertext input
1525* u64 plaintext_len, // Length of data in bytes for decryption.
1526* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1527* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528* // concatenated with 0x00000001. 16-byte aligned pointer.
1529* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530* const u8 *aad, // Additional Authentication Data (AAD)
1531* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1533* // given authentication tag and only return the plaintext if they match.
1534* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535* // (most likely), 12 or 8.
1536*
1537* Assumptions:
1538*
1539* keys:
1540* keys are pre-expanded and aligned to 16 bytes. we are using the first
1541* set of 11 keys in the data structure void *aes_ctx
1542*
1543* iv:
1544* 0 1 2 3
1545* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547* | Salt (From the SA) |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549* | Initialization Vector |
1550* | (This is the sequence number from IPSec header) |
1551* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552* | 0x1 |
1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*
1555*
1556*
1557* AAD:
1558* AAD padded to 128 bits with 0
1559* for example, assume AAD is a u32 vector
1560*
1561* if AAD is 8 bytes:
1562* AAD[3] = {A0, A1};
1563* padded AAD in xmm register = {A1 A0 0 0}
1564*
1565* 0 1 2 3
1566* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568* | SPI (A1) |
1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570* | 32-bit Sequence Number (A0) |
1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572* | 0x0 |
1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*
1575* AAD Format with 32-bit Sequence Number
1576*
1577* if AAD is 12 bytes:
1578* AAD[3] = {A0, A1, A2};
1579* padded AAD in xmm register = {A2 A1 A0 0}
1580*
1581* 0 1 2 3
1582* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586* | SPI (A2) |
1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588* | 64-bit Extended Sequence Number {A1,A0} |
1589* | |
1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591* | 0x0 |
1592* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594* AAD Format with 64-bit Extended Sequence Number
1595*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597*
1598*****************************************************************************/
ENTRY(aesni_gcm_dec)
	# AES-GCM decryption entry point; see the large comment block above for
	# the full C prototype, argument layout and AAD/IV format assumptions.
	FUNC_SAVE			# prologue (macro defined earlier in this file)

	GCM_INIT			# per-request setup (macro defined earlier in this file)
	GCM_ENC_DEC dec			# bulk GCM pass in decrypt mode
	GCM_COMPLETE			# finalize and emit the auth tag (see macro definition)
	FUNC_RESTORE			# epilogue, undoes FUNC_SAVE
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001608
1609
1610/*****************************************************************************
1611* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
Dave Watson9ee4a5d2018-02-14 09:39:23 -08001612* struct gcm_context_data *data
1613* // Context data
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001614* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1615* const u8 *in, // Plaintext input
1616* u64 plaintext_len, // Length of data in bytes for encryption.
1617* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1618* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619* // concatenated with 0x00000001. 16-byte aligned pointer.
1620* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621* const u8 *aad, // Additional Authentication Data (AAD)
1622* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623* u8 *auth_tag, // Authenticated Tag output.
1624* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625* // 12 or 8.
1626*
1627* Assumptions:
1628*
1629* keys:
1630* keys are pre-expanded and aligned to 16 bytes. we are using the
1631* first set of 11 keys in the data structure void *aes_ctx
1632*
1633*
1634* iv:
1635* 0 1 2 3
1636* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638* | Salt (From the SA) |
1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640* | Initialization Vector |
1641* | (This is the sequence number from IPSec header) |
1642* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643* | 0x1 |
1644* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*
1646*
1647*
1648* AAD:
1649* AAD padded to 128 bits with 0
1650* for example, assume AAD is a u32 vector
1651*
1652* if AAD is 8 bytes:
1653* AAD[3] = {A0, A1};
1654* padded AAD in xmm register = {A1 A0 0 0}
1655*
1656* 0 1 2 3
1657* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659* | SPI (A1) |
1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661* | 32-bit Sequence Number (A0) |
1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663* | 0x0 |
1664* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665*
1666* AAD Format with 32-bit Sequence Number
1667*
1668* if AAD is 12 bytes:
1669* AAD[3] = {A0, A1, A2};
1670* padded AAD in xmm register = {A2 A1 A0 0}
1671*
1672* 0 1 2 3
1673* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675* | SPI (A2) |
1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677* | 64-bit Extended Sequence Number {A1,A0} |
1678* | |
1679* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680* | 0x0 |
1681* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682*
1683* AAD Format with 64-bit Extended Sequence Number
1684*
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001685* poly = x^128 + x^127 + x^126 + x^121 + 1
1686***************************************************************************/
# aesni_gcm_enc: RFC4106 AES-GCM encryption (prototype and data layouts
# documented in the block comment above).  Structurally identical to
# aesni_gcm_dec except that GCM_ENC_DEC is invoked with the "enc"
# argument; all real work lives in the shared GCM_* macros defined
# earlier in this file.
1687ENTRY(aesni_gcm_enc)
Dave Watson6c2c86b2018-02-14 09:38:35 -08001688 FUNC_SAVE
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001689
Dave Watson7af964c2018-02-14 09:38:45 -08001690 GCM_INIT
Dave Watsonba458332018-02-14 09:39:10 -08001691 GCM_ENC_DEC enc
Dave Watsonadcadab2018-02-14 09:38:57 -08001692 GCM_COMPLETE
Dave Watson6c2c86b2018-02-14 09:38:35 -08001693 FUNC_RESTORE
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001694 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001695ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001696
Mathias Krause559ad0f2010-11-29 08:35:39 +08001697#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001698
1699
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001700.align 4
# _key_expansion_128 / _key_expansion_256a: internal helpers for
# aesni_set_key (not C-callable; custom register contract).
# In:  %xmm0 = previous round key, %xmm1 = AESKEYGENASSIST result,
#      %xmm4 = 0 (the caller guarantees this), TKEYP = schedule ptr
# Out: %xmm0 = next round key, stored at (TKEYP); TKEYP += 0x10
# The two shufps/pxor pairs build the running XOR of the previous round
# key words (using zeroed %xmm4 as a shifting scratch) before XORing in
# the broadcast keygen-assist word selected by pshufd.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001701_key_expansion_128:
1702_key_expansion_256a:
1703 pshufd $0b11111111, %xmm1, %xmm1
1704 shufps $0b00010000, %xmm0, %xmm4
1705 pxor %xmm4, %xmm0
1706 shufps $0b10001100, %xmm0, %xmm4
1707 pxor %xmm4, %xmm0
1708 pxor %xmm1, %xmm0
Mathias Krause0d258ef2010-11-27 16:34:46 +08001709 movaps %xmm0, (TKEYP)
1710 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001711 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001712ENDPROC(_key_expansion_128)
1713ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001714
Mathias Krause0d258ef2010-11-27 16:34:46 +08001715.align 4
# _key_expansion_192a: internal helper for aesni_set_key (192-bit keys).
# In:  %xmm0, %xmm2 = current 192 bits of key material, %xmm1 =
#      AESKEYGENASSIST result, %xmm4 = 0, TKEYP = schedule write ptr
# Out: two 16-byte schedule entries stored at (TKEYP) and 0x10(TKEYP);
#      TKEYP += 0x20; %xmm0/%xmm2 updated for the next round
# Clobbers %xmm1, %xmm3, %xmm5, %xmm6.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001716_key_expansion_192a:
# Update the low 128 bits (%xmm0), same pattern as _key_expansion_128
# but broadcasting keygen-assist word 1 rather than word 3.
1717 pshufd $0b01010101, %xmm1, %xmm1
1718 shufps $0b00010000, %xmm0, %xmm4
1719 pxor %xmm4, %xmm0
1720 shufps $0b10001100, %xmm0, %xmm4
1721 pxor %xmm4, %xmm0
1722 pxor %xmm1, %xmm0
1723
# Update the high 64 bits (%xmm2) from the new %xmm0 and its own
# shifted copy (%xmm5); %xmm6 keeps the old value for repacking.
1724 movaps %xmm2, %xmm5
1725 movaps %xmm2, %xmm6
1726 pslldq $4, %xmm5
1727 pshufd $0b11111111, %xmm0, %xmm3
1728 pxor %xmm3, %xmm2
1729 pxor %xmm5, %xmm2
1730
# Repack the 192-bit result into two 128-bit schedule entries.
1731 movaps %xmm0, %xmm1
1732 shufps $0b01000100, %xmm0, %xmm6
Mathias Krause0d258ef2010-11-27 16:34:46 +08001733 movaps %xmm6, (TKEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001734 shufps $0b01001110, %xmm2, %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001735 movaps %xmm1, 0x10(TKEYP)
1736 add $0x20, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001737 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001738ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001739
Mathias Krause0d258ef2010-11-27 16:34:46 +08001740.align 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001741_key_expansion_192b:
1742 pshufd $0b01010101, %xmm1, %xmm1
1743 shufps $0b00010000, %xmm0, %xmm4
1744 pxor %xmm4, %xmm0
1745 shufps $0b10001100, %xmm0, %xmm4
1746 pxor %xmm4, %xmm0
1747 pxor %xmm1, %xmm0
1748
1749 movaps %xmm2, %xmm5
1750 pslldq $4, %xmm5
1751 pshufd $0b11111111, %xmm0, %xmm3
1752 pxor %xmm3, %xmm2
1753 pxor %xmm5, %xmm2
1754
Mathias Krause0d258ef2010-11-27 16:34:46 +08001755 movaps %xmm0, (TKEYP)
1756 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001757 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001758ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001759
Mathias Krause0d258ef2010-11-27 16:34:46 +08001760.align 4
# _key_expansion_256b: internal helper for aesni_set_key (256-bit keys).
# Updates the second 128-bit half of a 256-bit key (%xmm2) — the
# counterpart of _key_expansion_256a, which updates %xmm0 — using
# keygen-assist word 2 (pshufd $0b10101010).
# In:  %xmm2 = previous half, %xmm1 = AESKEYGENASSIST result, %xmm4 = 0
# Out: %xmm2 = next round key, stored at (TKEYP); TKEYP += 0x10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001761_key_expansion_256b:
1762 pshufd $0b10101010, %xmm1, %xmm1
1763 shufps $0b00010000, %xmm2, %xmm4
1764 pxor %xmm4, %xmm2
1765 shufps $0b10001100, %xmm2, %xmm4
1766 pxor %xmm4, %xmm2
1767 pxor %xmm1, %xmm2
Mathias Krause0d258ef2010-11-27 16:34:46 +08001768 movaps %xmm2, (TKEYP)
1769 add $0x10, TKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001770 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001771ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001772
1773/*
1774 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1775 * unsigned int key_len)
1776 */
# aesni_set_key: expand a 128/192/256-bit user key into the encryption
# key schedule at ctx+0 and the decryption (Equivalent Inverse Cipher)
# schedule stored via offset 240.  The raw key length in bytes is cached
# at ctx+480 for the enc/dec entry points.  Returns 0 (AREG cleared).
# On i386 the arguments are read from the stack (KEYP pushed first as
# scratch); on x86_64 they arrive in registers per the KEYP/UKEYP/TKEYP
# macro definitions earlier in this file.
1777ENTRY(aesni_set_key)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001778 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001779#ifndef __x86_64__
1780 pushl KEYP
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001781 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1782 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1783 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
Mathias Krause0d258ef2010-11-27 16:34:46 +08001784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# Dispatch on key length in bytes: below 24 -> AES-128, exactly 24 ->
# AES-192, otherwise fall through to the AES-256 expansion.
1790 cmp $24, %dl
1791 jb .Lenc_key128
1792 je .Lenc_key192
# AES-256: second 16 bytes of the user key seed the schedule; rounds
# alternate between the 256a (low half) and 256b (high half) helpers.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001793 movups 0x10(UKEYP), %xmm2 # other user key
1794 movaps %xmm2, (TKEYP)
1795 add $0x10, TKEYP
Huang Yingb369e522009-11-23 19:54:06 +08001796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001797 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001799 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001800 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001801 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001802 AESKEYGENASSIST 0x2 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001803 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001804 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001805 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001806 AESKEYGENASSIST 0x4 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001807 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001808 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001809 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001810 AESKEYGENASSIST 0x8 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001811 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001812 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001813 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001814 AESKEYGENASSIST 0x10 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001815 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001816 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001817 call _key_expansion_256a
Huang Yingb369e522009-11-23 19:54:06 +08001818 AESKEYGENASSIST 0x20 %xmm0 %xmm1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001819 call _key_expansion_256b
Huang Yingb369e522009-11-23 19:54:06 +08001820 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001821 call _key_expansion_256a
1822 jmp .Ldec_key
# AES-192: 8 additional bytes of user key; helpers alternate because a
# 192-bit round does not align with 128-bit schedule entries.
1823.Lenc_key192:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001824 movq 0x10(UKEYP), %xmm2 # other user key
Huang Yingb369e522009-11-23 19:54:06 +08001825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001826 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001828 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001829 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001830 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001831 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001832 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001833 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001834 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001835 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001836 call _key_expansion_192b
Huang Yingb369e522009-11-23 19:54:06 +08001837 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001838 call _key_expansion_192a
Huang Yingb369e522009-11-23 19:54:06 +08001839 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001840 call _key_expansion_192b
1841 jmp .Ldec_key
# AES-128: ten rounds, round constants 0x01..0x36 per the AES spec.
1842.Lenc_key128:
Huang Yingb369e522009-11-23 19:54:06 +08001843 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
Huang Ying54b6a1b2009-01-18 16:28:34 +11001844 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001845 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
Huang Ying54b6a1b2009-01-18 16:28:34 +11001846 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001847 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
Huang Ying54b6a1b2009-01-18 16:28:34 +11001848 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001849 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
Huang Ying54b6a1b2009-01-18 16:28:34 +11001850 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001851 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
Huang Ying54b6a1b2009-01-18 16:28:34 +11001852 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001853 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
Huang Ying54b6a1b2009-01-18 16:28:34 +11001854 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001855 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
Huang Ying54b6a1b2009-01-18 16:28:34 +11001856 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001857 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
Huang Ying54b6a1b2009-01-18 16:28:34 +11001858 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001859 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
Huang Ying54b6a1b2009-01-18 16:28:34 +11001860 call _key_expansion_128
Huang Yingb369e522009-11-23 19:54:06 +08001861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
Huang Ying54b6a1b2009-01-18 16:28:34 +11001862 call _key_expansion_128
# Derive the decryption schedule: swap first/last round keys across the
# 240-byte boundary, then walk the remaining encryption round keys
# applying AESIMC (InvMixColumns) and storing them in reverse order.
1863.Ldec_key:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001864 sub $0x10, TKEYP
1865 movaps (KEYP), %xmm0
1866 movaps (TKEYP), %xmm1
1867 movaps %xmm0, 240(TKEYP)
1868 movaps %xmm1, 240(KEYP)
1869 add $0x10, KEYP
1870 lea 240-16(TKEYP), UKEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001871.align 4
1872.Ldec_key_loop:
Mathias Krause0d258ef2010-11-27 16:34:46 +08001873 movaps (KEYP), %xmm0
Huang Yingb369e522009-11-23 19:54:06 +08001874 AESIMC %xmm0 %xmm1
Mathias Krause0d258ef2010-11-27 16:34:46 +08001875 movaps %xmm1, (UKEYP)
1876 add $0x10, KEYP
1877 sub $0x10, UKEYP
1878 cmp TKEYP, KEYP
Huang Ying54b6a1b2009-01-18 16:28:34 +11001879 jb .Ldec_key_loop
# Success: return 0.
Mathias Krause0d258ef2010-11-27 16:34:46 +08001880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001884 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001885 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001886ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001887
1888/*
1889 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1890 */
# aesni_enc: encrypt a single 16-byte block.  Loads the key length
# cached at ctx+480 by aesni_set_key into KLEN, then defers to the
# internal single-block routine _aesni_enc1.  On i386, KEYP/KLEN are
# pushed as scratch and arguments come from the stack; on x86_64 they
# arrive in registers.  Unaligned src/dst are fine (movups).
1891ENTRY(aesni_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001892 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08001893#ifndef __x86_64__
1894 pushl KEYP
1895 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001896 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1897 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1898 movl (FRAME_OFFSET+20)(%esp), INP # src
Mathias Krause0d258ef2010-11-27 16:34:46 +08001899#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11001900 movl 480(KEYP), KLEN # key length
1901 movups (INP), STATE # input
1902 call _aesni_enc1
1903 movups STATE, (OUTP) # output
Mathias Krause0d258ef2010-11-27 16:34:46 +08001904#ifndef __x86_64__
1905 popl KLEN
1906 popl KEYP
1907#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06001908 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11001909 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001910ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001911
1912/*
1913 * _aesni_enc1: internal ABI
1914 * input:
1915 * KEYP: key struct pointer
1916 * KLEN: key length (in bytes, as cached at ctx+480)
1917 * STATE: initial state (input)
1918 * output:
1919 * STATE: final state (output)
1920 * changed:
1921 * KEY
1922 * TKEYP (T1)
1923 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08001924.align 4
# _aesni_enc1: encrypt one block held in STATE (internal ABI; see the
# header comment above).  KLEN is the key length in bytes cached at
# ctx+480: below 24 -> AES-128 (10 rounds), 24 -> AES-192 (12 rounds),
# otherwise AES-256 (14 rounds).  TKEYP is pre-biased per key size so
# the three entry paths fall through into one shared tail of rounds,
# ending with AESENCLAST at schedule offset 0x70 from the biased base.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001925_aesni_enc1:
1926 movaps (KEYP), KEY # key
1927 mov KEYP, TKEYP
1928 pxor KEY, STATE # round 0
1929 add $0x30, TKEYP
1930 cmp $24, KLEN
1931 jb .Lenc128
1932 lea 0x20(TKEYP), TKEYP
1933 je .Lenc192
# AES-256 only: two extra rounds before joining the 192 path.
1934 add $0x20, TKEYP
1935 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001936 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001937 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001938 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001939.align 4
# Rounds shared by AES-192 and AES-256.
1940.Lenc192:
1941 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001942 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001943 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001944 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001945.align 4
# Final ten rounds shared by all key sizes.
1946.Lenc128:
1947 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001948 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001949 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001950 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001951 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001952 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001953 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001954 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001955 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001956 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001957 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001958 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001959 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001960 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001961 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001962 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001963 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001964 AESENC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001965 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08001966 AESENCLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11001967 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02001968ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001969
1970/*
1971 * _aesni_enc4: internal ABI
1972 * input:
1973 * KEYP: key struct pointer
1974 * KLEN: key length (in bytes, as cached at ctx+480)
1975 * STATE1: initial state (input)
1976 * STATE2
1977 * STATE3
1978 * STATE4
1979 * output:
1980 * STATE1: final state (output)
1981 * STATE2
1982 * STATE3
1983 * STATE4
1984 * changed:
1985 * KEY
1986 * TKEYP (T1)
1987 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08001988.align 4
# _aesni_enc4: encrypt four independent blocks (STATE1..STATE4) with the
# same key schedule (internal ABI; see header comment above).  Identical
# round/dispatch structure to _aesni_enc1 but each round key is applied
# to all four states, letting the pipelined AESENC units overlap work.
Huang Ying54b6a1b2009-01-18 16:28:34 +11001989_aesni_enc4:
1990 movaps (KEYP), KEY # key
1991 mov KEYP, TKEYP
1992 pxor KEY, STATE1 # round 0
1993 pxor KEY, STATE2
1994 pxor KEY, STATE3
1995 pxor KEY, STATE4
1996 add $0x30, TKEYP
# Key-size dispatch, as in _aesni_enc1: <24 -> 128, ==24 -> 192, else 256.
1997 cmp $24, KLEN
1998 jb .L4enc128
1999 lea 0x20(TKEYP), TKEYP
2000 je .L4enc192
# AES-256 only: two extra rounds.
2001 add $0x20, TKEYP
2002 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002003 AESENC KEY STATE1
2004 AESENC KEY STATE2
2005 AESENC KEY STATE3
2006 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002007 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002008 AESENC KEY STATE1
2009 AESENC KEY STATE2
2010 AESENC KEY STATE3
2011 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002012#.align 4
# Rounds shared by AES-192 and AES-256.
2013.L4enc192:
2014 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002015 AESENC KEY STATE1
2016 AESENC KEY STATE2
2017 AESENC KEY STATE3
2018 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002019 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002020 AESENC KEY STATE1
2021 AESENC KEY STATE2
2022 AESENC KEY STATE3
2023 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002024#.align 4
# Final ten rounds shared by all key sizes.
2025.L4enc128:
2026 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002027 AESENC KEY STATE1
2028 AESENC KEY STATE2
2029 AESENC KEY STATE3
2030 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002031 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002032 AESENC KEY STATE1
2033 AESENC KEY STATE2
2034 AESENC KEY STATE3
2035 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002036 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002037 AESENC KEY STATE1
2038 AESENC KEY STATE2
2039 AESENC KEY STATE3
2040 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002041 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002042 AESENC KEY STATE1
2043 AESENC KEY STATE2
2044 AESENC KEY STATE3
2045 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002046 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002047 AESENC KEY STATE1
2048 AESENC KEY STATE2
2049 AESENC KEY STATE3
2050 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002051 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002052 AESENC KEY STATE1
2053 AESENC KEY STATE2
2054 AESENC KEY STATE3
2055 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002056 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002057 AESENC KEY STATE1
2058 AESENC KEY STATE2
2059 AESENC KEY STATE3
2060 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002061 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002062 AESENC KEY STATE1
2063 AESENC KEY STATE2
2064 AESENC KEY STATE3
2065 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002066 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002067 AESENC KEY STATE1
2068 AESENC KEY STATE2
2069 AESENC KEY STATE3
2070 AESENC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002071 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002072 AESENCLAST KEY STATE1 # last round
2073 AESENCLAST KEY STATE2
2074 AESENCLAST KEY STATE3
2075 AESENCLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002076 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002077ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002078
2079/*
2080 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2081 */
# aesni_dec: decrypt a single 16-byte block.  Mirrors aesni_enc, except
# KEYP is advanced by 240 so that _aesni_dec1 walks the decryption
# (inverse) key schedule that aesni_set_key stored in the second half
# of the context.
2082ENTRY(aesni_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002083 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002084#ifndef __x86_64__
2085 pushl KEYP
2086 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002087 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2088 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2089 movl (FRAME_OFFSET+20)(%esp), INP # src
Mathias Krause0d258ef2010-11-27 16:34:46 +08002090#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002091 mov 480(KEYP), KLEN # key length
2092 add $240, KEYP
2093 movups (INP), STATE # input
2094 call _aesni_dec1
2095 movups STATE, (OUTP) #output
Mathias Krause0d258ef2010-11-27 16:34:46 +08002096#ifndef __x86_64__
2097 popl KLEN
2098 popl KEYP
2099#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002100 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002101 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002102ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002103
2104/*
2105 * _aesni_dec1: internal ABI
2106 * input:
2107 * KEYP: key struct pointer
2108 * KLEN: key length
2109 * STATE: initial state (input)
2110 * output:
2111 * STATE: final state (output)
2112 * changed:
2113 * KEY
2114 * TKEYP (T1)
2115 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002116.align 4
# _aesni_dec1: decrypt one block held in STATE (internal ABI; see header
# comment above).  Exact mirror of _aesni_enc1 using AESDEC/AESDECLAST
# over the inverse key schedule (caller points KEYP at ctx+240).  KLEN
# dispatch: below 24 -> AES-128, 24 -> AES-192, otherwise AES-256.
Huang Ying54b6a1b2009-01-18 16:28:34 +11002117_aesni_dec1:
2118 movaps (KEYP), KEY # key
2119 mov KEYP, TKEYP
2120 pxor KEY, STATE # round 0
2121 add $0x30, TKEYP
2122 cmp $24, KLEN
2123 jb .Ldec128
2124 lea 0x20(TKEYP), TKEYP
2125 je .Ldec192
# AES-256 only: two extra rounds.
2126 add $0x20, TKEYP
2127 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002128 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002129 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002130 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002131.align 4
# Rounds shared by AES-192 and AES-256.
2132.Ldec192:
2133 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002134 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002135 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002136 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002137.align 4
# Final ten rounds shared by all key sizes.
2138.Ldec128:
2139 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002140 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002141 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002142 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002143 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002144 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002145 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002146 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002147 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002148 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002149 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002150 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002151 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002152 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002153 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002154 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002155 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002156 AESDEC KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002157 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002158 AESDECLAST KEY STATE
Huang Ying54b6a1b2009-01-18 16:28:34 +11002159 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002160ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002161
2162/*
2163 * _aesni_dec4: internal ABI
2164 * input:
2165 * KEYP: key struct pointer
2166 * KLEN: key length
2167 * STATE1: initial state (input)
2168 * STATE2
2169 * STATE3
2170 * STATE4
2171 * output:
2172 * STATE1: final state (output)
2173 * STATE2
2174 * STATE3
2175 * STATE4
2176 * changed:
2177 * KEY
2178 * TKEYP (T1)
2179 */
Mathias Krause0d258ef2010-11-27 16:34:46 +08002180.align 4
# _aesni_dec4: decrypt four independent blocks (STATE1..STATE4) with the
# same inverse key schedule (internal ABI; see header comment above).
# Mirrors _aesni_enc4 with AESDEC/AESDECLAST; each round key is applied
# to all four states so the AES units can pipeline the work.
Huang Ying54b6a1b2009-01-18 16:28:34 +11002181_aesni_dec4:
2182 movaps (KEYP), KEY # key
2183 mov KEYP, TKEYP
2184 pxor KEY, STATE1 # round 0
2185 pxor KEY, STATE2
2186 pxor KEY, STATE3
2187 pxor KEY, STATE4
2188 add $0x30, TKEYP
# Key-size dispatch: <24 -> 128, ==24 -> 192, else 256.
2189 cmp $24, KLEN
2190 jb .L4dec128
2191 lea 0x20(TKEYP), TKEYP
2192 je .L4dec192
# AES-256 only: two extra rounds.
2193 add $0x20, TKEYP
2194 movaps -0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002195 AESDEC KEY STATE1
2196 AESDEC KEY STATE2
2197 AESDEC KEY STATE3
2198 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002199 movaps -0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002200 AESDEC KEY STATE1
2201 AESDEC KEY STATE2
2202 AESDEC KEY STATE3
2203 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002204.align 4
# Rounds shared by AES-192 and AES-256.
2205.L4dec192:
2206 movaps -0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002207 AESDEC KEY STATE1
2208 AESDEC KEY STATE2
2209 AESDEC KEY STATE3
2210 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002211 movaps -0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002212 AESDEC KEY STATE1
2213 AESDEC KEY STATE2
2214 AESDEC KEY STATE3
2215 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002216.align 4
# Final ten rounds shared by all key sizes.
2217.L4dec128:
2218 movaps -0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002219 AESDEC KEY STATE1
2220 AESDEC KEY STATE2
2221 AESDEC KEY STATE3
2222 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002223 movaps -0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002224 AESDEC KEY STATE1
2225 AESDEC KEY STATE2
2226 AESDEC KEY STATE3
2227 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002228 movaps (TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002229 AESDEC KEY STATE1
2230 AESDEC KEY STATE2
2231 AESDEC KEY STATE3
2232 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002233 movaps 0x10(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002234 AESDEC KEY STATE1
2235 AESDEC KEY STATE2
2236 AESDEC KEY STATE3
2237 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002238 movaps 0x20(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002239 AESDEC KEY STATE1
2240 AESDEC KEY STATE2
2241 AESDEC KEY STATE3
2242 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002243 movaps 0x30(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002244 AESDEC KEY STATE1
2245 AESDEC KEY STATE2
2246 AESDEC KEY STATE3
2247 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002248 movaps 0x40(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002249 AESDEC KEY STATE1
2250 AESDEC KEY STATE2
2251 AESDEC KEY STATE3
2252 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002253 movaps 0x50(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002254 AESDEC KEY STATE1
2255 AESDEC KEY STATE2
2256 AESDEC KEY STATE3
2257 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002258 movaps 0x60(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002259 AESDEC KEY STATE1
2260 AESDEC KEY STATE2
2261 AESDEC KEY STATE3
2262 AESDEC KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002263 movaps 0x70(TKEYP), KEY
Huang Yingb369e522009-11-23 19:54:06 +08002264 AESDECLAST KEY STATE1 # last round
2265 AESDECLAST KEY STATE2
2266 AESDECLAST KEY STATE3
2267 AESDECLAST KEY STATE4
Huang Ying54b6a1b2009-01-18 16:28:34 +11002268 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002269ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002270
2271/*
2272 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2273 * size_t len)
2274 */
# aesni_ecb_enc: ECB-encrypt len bytes from src to dst.  Processes four
# blocks per iteration via _aesni_enc4 while at least 64 bytes remain,
# then single blocks via _aesni_enc1.  A trailing partial block
# (len % 16 bytes) is silently ignored, as is len < 16.  Unaligned
# buffers and in-place operation are fine (movups, loads before stores).
2275ENTRY(aesni_ecb_enc)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002276 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002277#ifndef __x86_64__
2278 pushl LEN
2279 pushl KEYP
2280 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002281 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2282 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2283 movl (FRAME_OFFSET+24)(%esp), INP # src
2284 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002285#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002286 test LEN, LEN # check length
2287 jz .Lecb_enc_ret
2288 mov 480(KEYP), KLEN
2289 cmp $16, LEN
2290 jb .Lecb_enc_ret
2291 cmp $64, LEN
2292 jb .Lecb_enc_loop1
# Bulk path: four blocks per iteration.
2293.align 4
2294.Lecb_enc_loop4:
2295 movups (INP), STATE1
2296 movups 0x10(INP), STATE2
2297 movups 0x20(INP), STATE3
2298 movups 0x30(INP), STATE4
2299 call _aesni_enc4
2300 movups STATE1, (OUTP)
2301 movups STATE2, 0x10(OUTP)
2302 movups STATE3, 0x20(OUTP)
2303 movups STATE4, 0x30(OUTP)
2304 sub $64, LEN
2305 add $64, INP
2306 add $64, OUTP
2307 cmp $64, LEN
2308 jge .Lecb_enc_loop4
2309 cmp $16, LEN
2310 jb .Lecb_enc_ret
# Tail path: one block per iteration.
2311.align 4
2312.Lecb_enc_loop1:
2313 movups (INP), STATE1
2314 call _aesni_enc1
2315 movups STATE1, (OUTP)
2316 sub $16, LEN
2317 add $16, INP
2318 add $16, OUTP
2319 cmp $16, LEN
2320 jge .Lecb_enc_loop1
2321.Lecb_enc_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002322#ifndef __x86_64__
2323 popl KLEN
2324 popl KEYP
2325 popl LEN
2326#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002327 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002328 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002329ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002330
2331/*
2332 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2333 * size_t len);
2334 */
# aesni_ecb_dec: ECB-decrypt len bytes from src to dst.  Mirrors
# aesni_ecb_enc (4-block bulk loop, then 1-block tail loop; a trailing
# partial block is ignored), with KEYP advanced by 240 so the _aesni_dec
# helpers walk the inverse key schedule built by aesni_set_key.
2335ENTRY(aesni_ecb_dec)
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002336 FRAME_BEGIN
Mathias Krause0d258ef2010-11-27 16:34:46 +08002337#ifndef __x86_64__
2338 pushl LEN
2339 pushl KEYP
2340 pushl KLEN
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002341 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2342 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2343 movl (FRAME_OFFSET+24)(%esp), INP # src
2344 movl (FRAME_OFFSET+28)(%esp), LEN # len
Mathias Krause0d258ef2010-11-27 16:34:46 +08002345#endif
Huang Ying54b6a1b2009-01-18 16:28:34 +11002346 test LEN, LEN
2347 jz .Lecb_dec_ret
2348 mov 480(KEYP), KLEN
2349 add $240, KEYP
2350 cmp $16, LEN
2351 jb .Lecb_dec_ret
2352 cmp $64, LEN
2353 jb .Lecb_dec_loop1
# Bulk path: four blocks per iteration.
2354.align 4
2355.Lecb_dec_loop4:
2356 movups (INP), STATE1
2357 movups 0x10(INP), STATE2
2358 movups 0x20(INP), STATE3
2359 movups 0x30(INP), STATE4
2360 call _aesni_dec4
2361 movups STATE1, (OUTP)
2362 movups STATE2, 0x10(OUTP)
2363 movups STATE3, 0x20(OUTP)
2364 movups STATE4, 0x30(OUTP)
2365 sub $64, LEN
2366 add $64, INP
2367 add $64, OUTP
2368 cmp $64, LEN
2369 jge .Lecb_dec_loop4
2370 cmp $16, LEN
2371 jb .Lecb_dec_ret
# Tail path: one block per iteration.
2372.align 4
2373.Lecb_dec_loop1:
2374 movups (INP), STATE1
2375 call _aesni_dec1
2376 movups STATE1, (OUTP)
2377 sub $16, LEN
2378 add $16, INP
2379 add $16, OUTP
2380 cmp $16, LEN
2381 jge .Lecb_dec_loop1
2382.Lecb_dec_ret:
Mathias Krause0d258ef2010-11-27 16:34:46 +08002383#ifndef __x86_64__
2384 popl KLEN
2385 popl KEYP
2386 popl LEN
2387#endif
Josh Poimboeuf8691ccd2016-01-21 16:49:19 -06002388 FRAME_END
Huang Ying54b6a1b2009-01-18 16:28:34 +11002389 ret
Jussi Kivilinna8309b742013-01-19 13:38:55 +02002390ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002391
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-mode encryption.  Inherently serial: each plaintext block is XORed
 * with the previous ciphertext block (the IV for the first block) before
 * encryption, so only one block is processed per iteration.  The final
 * ciphertext block is written back to *iv for chaining across calls.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* save clobbered aliases and load stack-passed args (x86-32 only) */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN			# need at least one full block
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN		# key length field at ctx offset 480
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE			# chain: state ^= plaintext
	call _aesni_enc1		# state = AES-enc(state)
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)		# persist last ciphertext as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002435
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-mode decryption.  Unlike encryption this parallelizes: blocks are
 * decrypted four at a time, then each result is XORed with the previous
 * ciphertext block (IV for the first).  The last ciphertext block is
 * stored back to *iv for chaining.  Trailing len % 16 bytes are ignored.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* save clobbered aliases and load stack-passed args (x86-32 only) */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN			# no full block: return without
	jb .Lcbc_dec_just_ret		# touching *iv
	mov 480(KEYP), KLEN		# key length field at ctx offset 480
	add $240, KEYP			# advance to the decryption schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	/* keep ciphertext copies (INx) for the post-decrypt XOR chain */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * x86-32 has too few XMM registers for IN3/IN4; reuse IN1/IN2
	 * for blocks 2 and 3 and re-read blocks 0 and 1 from memory
	 * after _aesni_dec4 below.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# block0 ^= previous IV
#ifdef __x86_64__
	pxor IN1, STATE2		# block_i ^= ciphertext_{i-1}
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext becomes next IV
#else
	pxor IN1, STATE4		# IN1 currently holds ciphertext 2
	movaps IN2, IV			# IN2 holds ciphertext 3 = next IV
	movups (INP), IN1		# reload ciphertext 0
	pxor IN1, STATE2
	movups 0x10(INP), IN2		# reload ciphertext 1
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	/* single-block tail */
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE			# undo the CBC chaining XOR
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# persist IV only if work was done
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002528
#ifdef __x86_64__
/*
 * PSHUFB shuffle mask that reverses all 16 bytes of an XMM register,
 * i.e. converts the CTR counter between big- and little-endian.
 * Placed in .rodata (read-only) and 16-byte aligned for movaps.
 */
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
Huang Ying12387a42010-03-10 18:28:55 +08002535
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * Called once per aesni_ctr_enc invocation; _aesni_inc then increments
 * the counter cheaply without re-deriving these values each time.
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# CTR = byte-reversed (LE) copy of IV
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror CTR's low qword in a GPR
					# for cheap carry detection
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002557
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq only adds within the low qword; the GPR mirror TCTR_LOW is
 * incremented in parallel so the carry flag tells us when the low
 * 64 bits wrapped and the high qword must be bumped too.
 */
.align 4
_aesni_inc:
	paddq INC, CTR			# CTR.low64 += 1 (no cross-qword carry)
	add $1, TCTR_LOW		# mirror increment; sets CF on wrap
	jnc .Linc_low			# common case: no carry-out
	pslldq $8, INC			# move the 1 into the high qword...
	paddq INC, CTR			# ...and propagate the carry
	psrldq $8, INC			# restore INC == 1 (low qword)
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big-endian for the caller
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002586
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encryption (x86_64 only — inside the #ifdef __x86_64__
 * section, so arguments already arrive in the registers the aliases
 * name; no stack loads are needed).  Encrypts successive counter
 * values and XORs them into the data, four blocks at a time, then
 * singly.  The advanced counter is written back to *iv.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret		# < 1 block: leave *iv untouched
	mov 480(KEYP), KLEN		# key length field at ctx offset 480
	movups (IVP), IV
	call _aesni_inc_init		# set up CTR/INC/BSWAP_MASK/TCTR_LOW
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* interleave counter increments with input loads to hide latency */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# encrypt the four counter blocks
	pxor IN1, STATE1		# keystream XOR plaintext
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	/* single-block tail */
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# persist advanced counter
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
Jussi Kivilinnac456a9c2013-04-08 21:51:16 +03002649
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * "ble" = block little-endian, the byte order XTS tweaks use.
 * Doubles IV (paddq shifts each qword left by 1) and, via the
 * pshufd/psrad sign-bit broadcast, conditionally XORs in the
 * reduction constants from GF128MUL_MASK where a bit was carried
 * out of a qword.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2667
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * XTS en/decryption of eight consecutive 16-byte blocks (x86_64 only).
 * Each block is XORed with its tweak (IV), passed through AES, and
 * XORed with the tweak again; the tweak is advanced between blocks by
 * GF(2^128) doubling.  Processes blocks 0-3 and 4-7 as two _aesni_enc4/
 * _aesni_dec4 batches, using OUTP as scratch to stash the tweaks while
 * the batch runs.  The final tweak is stored back to *iv.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	/*
	 * Branch-free enc/dec selection: if enc (%cl) is zero, pick the
	 * decryption path — KEYP += 240 (decryption key schedule) and
	 * %r11 = _aesni_dec4; otherwise KEYP += 0 and %r11 = _aesni_enc4.
	 */
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length field at ctx offset 480
	addq %rcx, KEYP			# 0 (enc) or 240 (dec)

	/*
	 * First batch: tweak-XOR blocks 0-3 into STATE1-4 and park each
	 * block's tweak in dst (OUTP) — it is re-read after the AES pass.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()		# advance tweak to next block
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11		# _aesni_enc4 or _aesni_dec4
					# (retpoline-safe indirect call)

	/*
	 * Finish blocks 0-3 (second tweak XOR, overwriting the stashed
	 * tweaks) while interleaving the tweak-XOR setup of blocks 4-7.
	 */
	movdqu 0x00(OUTP), INC		# reload block 0's tweak
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)	# final ciphertext/plaintext block 0

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)		# stash tweak for block 4

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)		# tweak for the caller's next 8 blocks

	CALL_NOSPEC %r11		# second AES batch (blocks 4-7)

	/* finish blocks 4-7: second tweak XOR and final stores */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2777
Mathias Krause0d258ef2010-11-27 16:34:46 +08002778#endif