crypto: aesni - AVX and AVX2 version of AESNI-GCM encode and decode

We have added AVX and AVX2 routines that optimize AESNI-GCM
encode/decode.  These routines are optimized for encrypting and
decrypting large buffers.  In tests we have seen up to a 6% speedup
for 1K buffers, 11% for 2K buffers and 18% for 8K buffers over the
existing SSE version.  These routines should provide an even better
speedup on future Intel x86_64 CPUs.
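
The glue code below picks an implementation per request based on the
buffer size: below AVX_GEN2_OPTSIZE (640 bytes) the existing SSE
routines are used, below AVX_GEN4_OPTSIZE (4096 bytes) the AVX gen2
routines, and above that the AVX2 gen4 routines.  As a rough,
hypothetical user-space sketch of that dispatch (the enc_sse/enc_avx/
enc_avx2 stubs and gcm_enc_dispatch() stand in for the real
aesni_gcm_enc* routines and are not part of the patch):

	#include <stdio.h>

	#define AVX_GEN2_OPTSIZE 640	/* below this, SSE path wins */
	#define AVX_GEN4_OPTSIZE 4096	/* below this, AVX gen2; else AVX2 gen4 */

	/* Stand-in stubs for aesni_gcm_enc(), aesni_gcm_enc_avx_gen2()
	 * and aesni_gcm_enc_avx_gen4().
	 */
	static void enc_sse(void)  { puts("SSE path");  }
	static void enc_avx(void)  { puts("AVX path");  }
	static void enc_avx2(void) { puts("AVX2 path"); }

	static void gcm_enc_dispatch(unsigned long plaintext_len)
	{
		if (plaintext_len < AVX_GEN2_OPTSIZE)
			enc_sse();
		else if (plaintext_len < AVX_GEN4_OPTSIZE)
			enc_avx();
		else
			enc_avx2();
	}

	int main(void)
	{
		gcm_enc_dispatch(512);	/* SSE  */
		gcm_enc_dispatch(2048);	/* AVX  */
		gcm_enc_dispatch(8192);	/* AVX2 */
		return 0;
	}

The same size thresholds are applied on the decrypt path, and the SSE
routines remain the fallback when the CPU lacks AVX/AVX2.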

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index aba34b8..3ae311d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,9 @@
 int crypto_fpu_init(void);
 void crypto_fpu_exit(void);
 
+#define AVX_GEN2_OPTSIZE 640
+#define AVX_GEN4_OPTSIZE 4096
+
 #ifdef CONFIG_X86_64
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
@@ -150,6 +153,123 @@
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
+
+#ifdef CONFIG_AS_AVX
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+
+static void aesni_gcm_dec_avx(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+#endif
+
+#ifdef CONFIG_AS_AVX2
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+
+static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
+				aad, aad_len, auth_tag, auth_tag_len);
+	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+#endif
+
+static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
 {
@@ -915,7 +1035,7 @@
 		dst = src;
 	}
 
-	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+	aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
 		+ ((unsigned long)req->cryptlen), auth_tag_len);
 
@@ -996,7 +1116,7 @@
 		dst = src;
 	}
 
-	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+	aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
 		authTag, auth_tag_len);
 
@@ -1353,6 +1473,25 @@
 
 	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
+#ifdef CONFIG_AS_AVX2
+	if (boot_cpu_has(X86_FEATURE_AVX2)) {
+		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
+		aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+	} else
+#endif
+#ifdef CONFIG_AS_AVX
+	if (boot_cpu_has(X86_FEATURE_AVX)) {
+		pr_info("AVX version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
+		aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+	} else
+#endif
+	{
+		pr_info("SSE version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc;
+		aesni_gcm_dec_tfm = aesni_gcm_dec;
+	}
 
 	err = crypto_fpu_init();
 	if (err)