[CRYPTO] salsa20: Add x86-64 assembly version
This is the x86-64 version of the Salsa20 stream cipher algorithm. The
original assembly code came from
<http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>. It has been
reformatted for clarity.
Signed-off-by: Tan Swee Heng <thesweeheng@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 25cc844..09200e1 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
@@ -15,3 +16,4 @@
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 0000000..6214a9b
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+ mov %rsp,%r11
+ and $31,%r11
+ add $256,%r11
+ sub %r11,%rsp
+ # x = arg1
+ mov %rdi,%r8
+ # m = arg2
+ mov %rsi,%rsi
+ # out = arg3
+ mov %rdx,%rdi
+ # bytes = arg4
+ mov %rcx,%rdx
+ # unsigned>? bytes - 0
+ cmp $0,%rdx
+ # comment:fp stack unchanged by jump
+ # goto done if !unsigned>
+ jbe ._done
+ # comment:fp stack unchanged by fallthrough
+# start:
+._start:
+ # r11_stack = r11
+ movq %r11,0(%rsp)
+ # r12_stack = r12
+ movq %r12,8(%rsp)
+ # r13_stack = r13
+ movq %r13,16(%rsp)
+ # r14_stack = r14
+ movq %r14,24(%rsp)
+ # r15_stack = r15
+ movq %r15,32(%rsp)
+ # rbx_stack = rbx
+ movq %rbx,40(%rsp)
+ # rbp_stack = rbp
+ movq %rbp,48(%rsp)
+ # in0 = *(uint64 *) (x + 0)
+ movq 0(%r8),%rcx
+ # in2 = *(uint64 *) (x + 8)
+ movq 8(%r8),%r9
+ # in4 = *(uint64 *) (x + 16)
+ movq 16(%r8),%rax
+ # in6 = *(uint64 *) (x + 24)
+ movq 24(%r8),%r10
+ # in8 = *(uint64 *) (x + 32)
+ movq 32(%r8),%r11
+ # in10 = *(uint64 *) (x + 40)
+ movq 40(%r8),%r12
+ # in12 = *(uint64 *) (x + 48)
+ movq 48(%r8),%r13
+ # in14 = *(uint64 *) (x + 56)
+ movq 56(%r8),%r14
+ # j0 = in0
+ movq %rcx,56(%rsp)
+ # j2 = in2
+ movq %r9,64(%rsp)
+ # j4 = in4
+ movq %rax,72(%rsp)
+ # j6 = in6
+ movq %r10,80(%rsp)
+ # j8 = in8
+ movq %r11,88(%rsp)
+ # j10 = in10
+ movq %r12,96(%rsp)
+ # j12 = in12
+ movq %r13,104(%rsp)
+ # j14 = in14
+ movq %r14,112(%rsp)
+ # x_backup = x
+ movq %r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+ # unsigned<? bytes - 64
+ cmp $64,%rdx
+ # comment:fp stack unchanged by jump
+ # goto nocopy if !unsigned<
+ jae ._nocopy
+ # ctarget = out
+ movq %rdi,128(%rsp)
+ # out = &tmp
+ leaq 192(%rsp),%rdi
+ # i = bytes
+ mov %rdx,%rcx
+ # while (i) { *out++ = *m++; --i }
+ rep movsb
+ # out = &tmp
+ leaq 192(%rsp),%rdi
+ # m = &tmp
+ leaq 192(%rsp),%rsi
+ # comment:fp stack unchanged by fallthrough
+# nocopy:
+._nocopy:
+ # out_backup = out
+ movq %rdi,136(%rsp)
+ # m_backup = m
+ movq %rsi,144(%rsp)
+ # bytes_backup = bytes
+ movq %rdx,152(%rsp)
+ # x1 = j0
+ movq 56(%rsp),%rdi
+ # x0 = x1
+ mov %rdi,%rdx
+ # (uint64) x1 >>= 32
+ shr $32,%rdi
+ # x3 = j2
+ movq 64(%rsp),%rsi
+ # x2 = x3
+ mov %rsi,%rcx
+ # (uint64) x3 >>= 32
+ shr $32,%rsi
+ # x5 = j4
+ movq 72(%rsp),%r8
+ # x4 = x5
+ mov %r8,%r9
+ # (uint64) x5 >>= 32
+ shr $32,%r8
+ # x5_stack = x5
+ movq %r8,160(%rsp)
+ # x7 = j6
+ movq 80(%rsp),%r8
+ # x6 = x7
+ mov %r8,%rax
+ # (uint64) x7 >>= 32
+ shr $32,%r8
+ # x9 = j8
+ movq 88(%rsp),%r10
+ # x8 = x9
+ mov %r10,%r11
+ # (uint64) x9 >>= 32
+ shr $32,%r10
+ # x11 = j10
+ movq 96(%rsp),%r12
+ # x10 = x11
+ mov %r12,%r13
+ # x10_stack = x10
+ movq %r13,168(%rsp)
+ # (uint64) x11 >>= 32
+ shr $32,%r12
+ # x13 = j12
+ movq 104(%rsp),%r13
+ # x12 = x13
+ mov %r13,%r14
+ # (uint64) x13 >>= 32
+ shr $32,%r13
+ # x15 = j14
+ movq 112(%rsp),%r15
+ # x14 = x15
+ mov %r15,%rbx
+ # (uint64) x15 >>= 32
+ shr $32,%r15
+ # x15_stack = x15
+ movq %r15,176(%rsp)
+ # i = 20
+ mov $20,%r15
+# mainloop:
+._mainloop:
+ # i_backup = i
+ movq %r15,184(%rsp)
+ # x5 = x5_stack
+ movq 160(%rsp),%r15
+ # a = x12 + x0
+ lea (%r14,%rdx),%rbp
+ # (uint32) a <<<= 7
+ rol $7,%ebp
+ # x4 ^= a
+ xor %rbp,%r9
+ # b = x1 + x5
+ lea (%rdi,%r15),%rbp
+ # (uint32) b <<<= 7
+ rol $7,%ebp
+ # x9 ^= b
+ xor %rbp,%r10
+ # a = x0 + x4
+ lea (%rdx,%r9),%rbp
+ # (uint32) a <<<= 9
+ rol $9,%ebp
+ # x8 ^= a
+ xor %rbp,%r11
+ # b = x5 + x9
+ lea (%r15,%r10),%rbp
+ # (uint32) b <<<= 9
+ rol $9,%ebp
+ # x13 ^= b
+ xor %rbp,%r13
+ # a = x4 + x8
+ lea (%r9,%r11),%rbp
+ # (uint32) a <<<= 13
+ rol $13,%ebp
+ # x12 ^= a
+ xor %rbp,%r14
+ # b = x9 + x13
+ lea (%r10,%r13),%rbp
+ # (uint32) b <<<= 13
+ rol $13,%ebp
+ # x1 ^= b
+ xor %rbp,%rdi
+ # a = x8 + x12
+ lea (%r11,%r14),%rbp
+ # (uint32) a <<<= 18
+ rol $18,%ebp
+ # x0 ^= a
+ xor %rbp,%rdx
+ # b = x13 + x1
+ lea (%r13,%rdi),%rbp
+ # (uint32) b <<<= 18
+ rol $18,%ebp
+ # x5 ^= b
+ xor %rbp,%r15
+ # x10 = x10_stack
+ movq 168(%rsp),%rbp
+ # x5_stack = x5
+ movq %r15,160(%rsp)
+ # c = x6 + x10
+ lea (%rax,%rbp),%r15
+ # (uint32) c <<<= 7
+ rol $7,%r15d
+ # x14 ^= c
+ xor %r15,%rbx
+ # c = x10 + x14
+ lea (%rbp,%rbx),%r15
+ # (uint32) c <<<= 9
+ rol $9,%r15d
+ # x2 ^= c
+ xor %r15,%rcx
+ # c = x14 + x2
+ lea (%rbx,%rcx),%r15
+ # (uint32) c <<<= 13
+ rol $13,%r15d
+ # x6 ^= c
+ xor %r15,%rax
+ # c = x2 + x6
+ lea (%rcx,%rax),%r15
+ # (uint32) c <<<= 18
+ rol $18,%r15d
+ # x10 ^= c
+ xor %r15,%rbp
+ # x15 = x15_stack
+ movq 176(%rsp),%r15
+ # x10_stack = x10
+ movq %rbp,168(%rsp)
+ # d = x11 + x15
+ lea (%r12,%r15),%rbp
+ # (uint32) d <<<= 7
+ rol $7,%ebp
+ # x3 ^= d
+ xor %rbp,%rsi
+ # d = x15 + x3
+ lea (%r15,%rsi),%rbp
+ # (uint32) d <<<= 9
+ rol $9,%ebp
+ # x7 ^= d
+ xor %rbp,%r8
+ # d = x3 + x7
+ lea (%rsi,%r8),%rbp
+ # (uint32) d <<<= 13
+ rol $13,%ebp
+ # x11 ^= d
+ xor %rbp,%r12
+ # d = x7 + x11
+ lea (%r8,%r12),%rbp
+ # (uint32) d <<<= 18
+ rol $18,%ebp
+ # x15 ^= d
+ xor %rbp,%r15
+ # x15_stack = x15
+ movq %r15,176(%rsp)
+ # x5 = x5_stack
+ movq 160(%rsp),%r15
+ # a = x3 + x0
+ lea (%rsi,%rdx),%rbp
+ # (uint32) a <<<= 7
+ rol $7,%ebp
+ # x1 ^= a
+ xor %rbp,%rdi
+ # b = x4 + x5
+ lea (%r9,%r15),%rbp
+ # (uint32) b <<<= 7
+ rol $7,%ebp
+ # x6 ^= b
+ xor %rbp,%rax
+ # a = x0 + x1
+ lea (%rdx,%rdi),%rbp
+ # (uint32) a <<<= 9
+ rol $9,%ebp
+ # x2 ^= a
+ xor %rbp,%rcx
+ # b = x5 + x6
+ lea (%r15,%rax),%rbp
+ # (uint32) b <<<= 9
+ rol $9,%ebp
+ # x7 ^= b
+ xor %rbp,%r8
+ # a = x1 + x2
+ lea (%rdi,%rcx),%rbp
+ # (uint32) a <<<= 13
+ rol $13,%ebp
+ # x3 ^= a
+ xor %rbp,%rsi
+ # b = x6 + x7
+ lea (%rax,%r8),%rbp
+ # (uint32) b <<<= 13
+ rol $13,%ebp
+ # x4 ^= b
+ xor %rbp,%r9
+ # a = x2 + x3
+ lea (%rcx,%rsi),%rbp
+ # (uint32) a <<<= 18
+ rol $18,%ebp
+ # x0 ^= a
+ xor %rbp,%rdx
+ # b = x7 + x4
+ lea (%r8,%r9),%rbp
+ # (uint32) b <<<= 18
+ rol $18,%ebp
+ # x5 ^= b
+ xor %rbp,%r15
+ # x10 = x10_stack
+ movq 168(%rsp),%rbp
+ # x5_stack = x5
+ movq %r15,160(%rsp)
+ # c = x9 + x10
+ lea (%r10,%rbp),%r15
+ # (uint32) c <<<= 7
+ rol $7,%r15d
+ # x11 ^= c
+ xor %r15,%r12
+ # c = x10 + x11
+ lea (%rbp,%r12),%r15
+ # (uint32) c <<<= 9
+ rol $9,%r15d
+ # x8 ^= c
+ xor %r15,%r11
+ # c = x11 + x8
+ lea (%r12,%r11),%r15
+ # (uint32) c <<<= 13
+ rol $13,%r15d
+ # x9 ^= c
+ xor %r15,%r10
+ # c = x8 + x9
+ lea (%r11,%r10),%r15
+ # (uint32) c <<<= 18
+ rol $18,%r15d
+ # x10 ^= c
+ xor %r15,%rbp
+ # x15 = x15_stack
+ movq 176(%rsp),%r15
+ # x10_stack = x10
+ movq %rbp,168(%rsp)
+ # d = x14 + x15
+ lea (%rbx,%r15),%rbp
+ # (uint32) d <<<= 7
+ rol $7,%ebp
+ # x12 ^= d
+ xor %rbp,%r14
+ # d = x15 + x12
+ lea (%r15,%r14),%rbp
+ # (uint32) d <<<= 9
+ rol $9,%ebp
+ # x13 ^= d
+ xor %rbp,%r13
+ # d = x12 + x13
+ lea (%r14,%r13),%rbp
+ # (uint32) d <<<= 13
+ rol $13,%ebp
+ # x14 ^= d
+ xor %rbp,%rbx
+ # d = x13 + x14
+ lea (%r13,%rbx),%rbp
+ # (uint32) d <<<= 18
+ rol $18,%ebp
+ # x15 ^= d
+ xor %rbp,%r15
+ # x15_stack = x15
+ movq %r15,176(%rsp)
+ # x5 = x5_stack
+ movq 160(%rsp),%r15
+ # a = x12 + x0
+ lea (%r14,%rdx),%rbp
+ # (uint32) a <<<= 7
+ rol $7,%ebp
+ # x4 ^= a
+ xor %rbp,%r9
+ # b = x1 + x5
+ lea (%rdi,%r15),%rbp
+ # (uint32) b <<<= 7
+ rol $7,%ebp
+ # x9 ^= b
+ xor %rbp,%r10
+ # a = x0 + x4
+ lea (%rdx,%r9),%rbp
+ # (uint32) a <<<= 9
+ rol $9,%ebp
+ # x8 ^= a
+ xor %rbp,%r11
+ # b = x5 + x9
+ lea (%r15,%r10),%rbp
+ # (uint32) b <<<= 9
+ rol $9,%ebp
+ # x13 ^= b
+ xor %rbp,%r13
+ # a = x4 + x8
+ lea (%r9,%r11),%rbp
+ # (uint32) a <<<= 13
+ rol $13,%ebp
+ # x12 ^= a
+ xor %rbp,%r14
+ # b = x9 + x13
+ lea (%r10,%r13),%rbp
+ # (uint32) b <<<= 13
+ rol $13,%ebp
+ # x1 ^= b
+ xor %rbp,%rdi
+ # a = x8 + x12
+ lea (%r11,%r14),%rbp
+ # (uint32) a <<<= 18
+ rol $18,%ebp
+ # x0 ^= a
+ xor %rbp,%rdx
+ # b = x13 + x1
+ lea (%r13,%rdi),%rbp
+ # (uint32) b <<<= 18
+ rol $18,%ebp
+ # x5 ^= b
+ xor %rbp,%r15
+ # x10 = x10_stack
+ movq 168(%rsp),%rbp
+ # x5_stack = x5
+ movq %r15,160(%rsp)
+ # c = x6 + x10
+ lea (%rax,%rbp),%r15
+ # (uint32) c <<<= 7
+ rol $7,%r15d
+ # x14 ^= c
+ xor %r15,%rbx
+ # c = x10 + x14
+ lea (%rbp,%rbx),%r15
+ # (uint32) c <<<= 9
+ rol $9,%r15d
+ # x2 ^= c
+ xor %r15,%rcx
+ # c = x14 + x2
+ lea (%rbx,%rcx),%r15
+ # (uint32) c <<<= 13
+ rol $13,%r15d
+ # x6 ^= c
+ xor %r15,%rax
+ # c = x2 + x6
+ lea (%rcx,%rax),%r15
+ # (uint32) c <<<= 18
+ rol $18,%r15d
+ # x10 ^= c
+ xor %r15,%rbp
+ # x15 = x15_stack
+ movq 176(%rsp),%r15
+ # x10_stack = x10
+ movq %rbp,168(%rsp)
+ # d = x11 + x15
+ lea (%r12,%r15),%rbp
+ # (uint32) d <<<= 7
+ rol $7,%ebp
+ # x3 ^= d
+ xor %rbp,%rsi
+ # d = x15 + x3
+ lea (%r15,%rsi),%rbp
+ # (uint32) d <<<= 9
+ rol $9,%ebp
+ # x7 ^= d
+ xor %rbp,%r8
+ # d = x3 + x7
+ lea (%rsi,%r8),%rbp
+ # (uint32) d <<<= 13
+ rol $13,%ebp
+ # x11 ^= d
+ xor %rbp,%r12
+ # d = x7 + x11
+ lea (%r8,%r12),%rbp
+ # (uint32) d <<<= 18
+ rol $18,%ebp
+ # x15 ^= d
+ xor %rbp,%r15
+ # x15_stack = x15
+ movq %r15,176(%rsp)
+ # x5 = x5_stack
+ movq 160(%rsp),%r15
+ # a = x3 + x0
+ lea (%rsi,%rdx),%rbp
+ # (uint32) a <<<= 7
+ rol $7,%ebp
+ # x1 ^= a
+ xor %rbp,%rdi
+ # b = x4 + x5
+ lea (%r9,%r15),%rbp
+ # (uint32) b <<<= 7
+ rol $7,%ebp
+ # x6 ^= b
+ xor %rbp,%rax
+ # a = x0 + x1
+ lea (%rdx,%rdi),%rbp
+ # (uint32) a <<<= 9
+ rol $9,%ebp
+ # x2 ^= a
+ xor %rbp,%rcx
+ # b = x5 + x6
+ lea (%r15,%rax),%rbp
+ # (uint32) b <<<= 9
+ rol $9,%ebp
+ # x7 ^= b
+ xor %rbp,%r8
+ # a = x1 + x2
+ lea (%rdi,%rcx),%rbp
+ # (uint32) a <<<= 13
+ rol $13,%ebp
+ # x3 ^= a
+ xor %rbp,%rsi
+ # b = x6 + x7
+ lea (%rax,%r8),%rbp
+ # (uint32) b <<<= 13
+ rol $13,%ebp
+ # x4 ^= b
+ xor %rbp,%r9
+ # a = x2 + x3
+ lea (%rcx,%rsi),%rbp
+ # (uint32) a <<<= 18
+ rol $18,%ebp
+ # x0 ^= a
+ xor %rbp,%rdx
+ # b = x7 + x4
+ lea (%r8,%r9),%rbp
+ # (uint32) b <<<= 18
+ rol $18,%ebp
+ # x5 ^= b
+ xor %rbp,%r15
+ # x10 = x10_stack
+ movq 168(%rsp),%rbp
+ # x5_stack = x5
+ movq %r15,160(%rsp)
+ # c = x9 + x10
+ lea (%r10,%rbp),%r15
+ # (uint32) c <<<= 7
+ rol $7,%r15d
+ # x11 ^= c
+ xor %r15,%r12
+ # c = x10 + x11
+ lea (%rbp,%r12),%r15
+ # (uint32) c <<<= 9
+ rol $9,%r15d
+ # x8 ^= c
+ xor %r15,%r11
+ # c = x11 + x8
+ lea (%r12,%r11),%r15
+ # (uint32) c <<<= 13
+ rol $13,%r15d
+ # x9 ^= c
+ xor %r15,%r10
+ # c = x8 + x9
+ lea (%r11,%r10),%r15
+ # (uint32) c <<<= 18
+ rol $18,%r15d
+ # x10 ^= c
+ xor %r15,%rbp
+ # x15 = x15_stack
+ movq 176(%rsp),%r15
+ # x10_stack = x10
+ movq %rbp,168(%rsp)
+ # d = x14 + x15
+ lea (%rbx,%r15),%rbp
+ # (uint32) d <<<= 7
+ rol $7,%ebp
+ # x12 ^= d
+ xor %rbp,%r14
+ # d = x15 + x12
+ lea (%r15,%r14),%rbp
+ # (uint32) d <<<= 9
+ rol $9,%ebp
+ # x13 ^= d
+ xor %rbp,%r13
+ # d = x12 + x13
+ lea (%r14,%r13),%rbp
+ # (uint32) d <<<= 13
+ rol $13,%ebp
+ # x14 ^= d
+ xor %rbp,%rbx
+ # d = x13 + x14
+ lea (%r13,%rbx),%rbp
+ # (uint32) d <<<= 18
+ rol $18,%ebp
+ # x15 ^= d
+ xor %rbp,%r15
+ # x15_stack = x15
+ movq %r15,176(%rsp)
+ # i = i_backup
+ movq 184(%rsp),%r15
+ # unsigned>? i -= 4
+ sub $4,%r15
+ # comment:fp stack unchanged by jump
+ # goto mainloop if unsigned>
+ ja ._mainloop
+ # (uint32) x2 += j2
+ addl 64(%rsp),%ecx
+ # x3 <<= 32
+ shl $32,%rsi
+ # x3 += j2
+ addq 64(%rsp),%rsi
+ # (uint64) x3 >>= 32
+ shr $32,%rsi
+ # x3 <<= 32
+ shl $32,%rsi
+ # x2 += x3
+ add %rsi,%rcx
+ # (uint32) x6 += j6
+ addl 80(%rsp),%eax
+ # x7 <<= 32
+ shl $32,%r8
+ # x7 += j6
+ addq 80(%rsp),%r8
+ # (uint64) x7 >>= 32
+ shr $32,%r8
+ # x7 <<= 32
+ shl $32,%r8
+ # x6 += x7
+ add %r8,%rax
+ # (uint32) x8 += j8
+ addl 88(%rsp),%r11d
+ # x9 <<= 32
+ shl $32,%r10
+ # x9 += j8
+ addq 88(%rsp),%r10
+ # (uint64) x9 >>= 32
+ shr $32,%r10
+ # x9 <<= 32
+ shl $32,%r10
+ # x8 += x9
+ add %r10,%r11
+ # (uint32) x12 += j12
+ addl 104(%rsp),%r14d
+ # x13 <<= 32
+ shl $32,%r13
+ # x13 += j12
+ addq 104(%rsp),%r13
+ # (uint64) x13 >>= 32
+ shr $32,%r13
+ # x13 <<= 32
+ shl $32,%r13
+ # x12 += x13
+ add %r13,%r14
+ # (uint32) x0 += j0
+ addl 56(%rsp),%edx
+ # x1 <<= 32
+ shl $32,%rdi
+ # x1 += j0
+ addq 56(%rsp),%rdi
+ # (uint64) x1 >>= 32
+ shr $32,%rdi
+ # x1 <<= 32
+ shl $32,%rdi
+ # x0 += x1
+ add %rdi,%rdx
+ # x5 = x5_stack
+ movq 160(%rsp),%rdi
+ # (uint32) x4 += j4
+ addl 72(%rsp),%r9d
+ # x5 <<= 32
+ shl $32,%rdi
+ # x5 += j4
+ addq 72(%rsp),%rdi
+ # (uint64) x5 >>= 32
+ shr $32,%rdi
+ # x5 <<= 32
+ shl $32,%rdi
+ # x4 += x5
+ add %rdi,%r9
+ # x10 = x10_stack
+ movq 168(%rsp),%r8
+ # (uint32) x10 += j10
+ addl 96(%rsp),%r8d
+ # x11 <<= 32
+ shl $32,%r12
+ # x11 += j10
+ addq 96(%rsp),%r12
+ # (uint64) x11 >>= 32
+ shr $32,%r12
+ # x11 <<= 32
+ shl $32,%r12
+ # x10 += x11
+ add %r12,%r8
+ # x15 = x15_stack
+ movq 176(%rsp),%rdi
+ # (uint32) x14 += j14
+ addl 112(%rsp),%ebx
+ # x15 <<= 32
+ shl $32,%rdi
+ # x15 += j14
+ addq 112(%rsp),%rdi
+ # (uint64) x15 >>= 32
+ shr $32,%rdi
+ # x15 <<= 32
+ shl $32,%rdi
+ # x14 += x15
+ add %rdi,%rbx
+ # out = out_backup
+ movq 136(%rsp),%rdi
+ # m = m_backup
+ movq 144(%rsp),%rsi
+ # x0 ^= *(uint64 *) (m + 0)
+ xorq 0(%rsi),%rdx
+ # *(uint64 *) (out + 0) = x0
+ movq %rdx,0(%rdi)
+ # x2 ^= *(uint64 *) (m + 8)
+ xorq 8(%rsi),%rcx
+ # *(uint64 *) (out + 8) = x2
+ movq %rcx,8(%rdi)
+ # x4 ^= *(uint64 *) (m + 16)
+ xorq 16(%rsi),%r9
+ # *(uint64 *) (out + 16) = x4
+ movq %r9,16(%rdi)
+ # x6 ^= *(uint64 *) (m + 24)
+ xorq 24(%rsi),%rax
+ # *(uint64 *) (out + 24) = x6
+ movq %rax,24(%rdi)
+ # x8 ^= *(uint64 *) (m + 32)
+ xorq 32(%rsi),%r11
+ # *(uint64 *) (out + 32) = x8
+ movq %r11,32(%rdi)
+ # x10 ^= *(uint64 *) (m + 40)
+ xorq 40(%rsi),%r8
+ # *(uint64 *) (out + 40) = x10
+ movq %r8,40(%rdi)
+ # x12 ^= *(uint64 *) (m + 48)
+ xorq 48(%rsi),%r14
+ # *(uint64 *) (out + 48) = x12
+ movq %r14,48(%rdi)
+ # x14 ^= *(uint64 *) (m + 56)
+ xorq 56(%rsi),%rbx
+ # *(uint64 *) (out + 56) = x14
+ movq %rbx,56(%rdi)
+ # bytes = bytes_backup
+ movq 152(%rsp),%rdx
+ # in8 = j8
+ movq 88(%rsp),%rcx
+ # in8 += 1
+ add $1,%rcx
+ # j8 = in8
+ movq %rcx,88(%rsp)
+ # unsigned>? unsigned<? bytes - 64
+ cmp $64,%rdx
+ # comment:fp stack unchanged by jump
+ # goto bytesatleast65 if unsigned>
+ ja ._bytesatleast65
+ # comment:fp stack unchanged by jump
+ # goto bytesatleast64 if !unsigned<
+ jae ._bytesatleast64
+ # m = out
+ mov %rdi,%rsi
+ # out = ctarget
+ movq 128(%rsp),%rdi
+ # i = bytes
+ mov %rdx,%rcx
+ # while (i) { *out++ = *m++; --i }
+ rep movsb
+ # comment:fp stack unchanged by fallthrough
+# bytesatleast64:
+._bytesatleast64:
+ # x = x_backup
+ movq 120(%rsp),%rdi
+ # in8 = j8
+ movq 88(%rsp),%rsi
+ # *(uint64 *) (x + 32) = in8
+ movq %rsi,32(%rdi)
+ # r11 = r11_stack
+ movq 0(%rsp),%r11
+ # r12 = r12_stack
+ movq 8(%rsp),%r12
+ # r13 = r13_stack
+ movq 16(%rsp),%r13
+ # r14 = r14_stack
+ movq 24(%rsp),%r14
+ # r15 = r15_stack
+ movq 32(%rsp),%r15
+ # rbx = rbx_stack
+ movq 40(%rsp),%rbx
+ # rbp = rbp_stack
+ movq 48(%rsp),%rbp
+ # comment:fp stack unchanged by fallthrough
+# done:
+._done:
+ # leave
+ add %r11,%rsp
+ mov %rdi,%rax
+ mov %rsi,%rdx
+ ret
+# bytesatleast65:
+._bytesatleast65:
+ # bytes -= 64
+ sub $64,%rdx
+ # out += 64
+ add $64,%rdi
+ # m += 64
+ add $64,%rsi
+ # comment:fp stack unchanged by jump
+ # goto bytesatleast1
+ jmp ._bytesatleast1
+# enter ECRYPT_keysetup
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+ mov %rsp,%r11
+ and $31,%r11
+ add $256,%r11
+ sub %r11,%rsp
+ # k = arg2
+ mov %rsi,%rsi
+ # kbits = arg3
+ mov %rdx,%rdx
+ # x = arg1
+ mov %rdi,%rdi
+ # in0 = *(uint64 *) (k + 0)
+ movq 0(%rsi),%r8
+ # in2 = *(uint64 *) (k + 8)
+ movq 8(%rsi),%r9
+ # *(uint64 *) (x + 4) = in0
+ movq %r8,4(%rdi)
+ # *(uint64 *) (x + 12) = in2
+ movq %r9,12(%rdi)
+ # unsigned<? kbits - 256
+ cmp $256,%rdx
+ # comment:fp stack unchanged by jump
+ # goto kbits128 if unsigned<
+ jb ._kbits128
+# kbits256:
+._kbits256:
+ # in10 = *(uint64 *) (k + 16)
+ movq 16(%rsi),%rdx
+ # in12 = *(uint64 *) (k + 24)
+ movq 24(%rsi),%rsi
+ # *(uint64 *) (x + 44) = in10
+ movq %rdx,44(%rdi)
+ # *(uint64 *) (x + 52) = in12
+ movq %rsi,52(%rdi)
+ # in0 = 1634760805
+ mov $1634760805,%rsi
+ # in4 = 857760878
+ mov $857760878,%rdx
+ # in10 = 2036477234
+ mov $2036477234,%rcx
+ # in14 = 1797285236
+ mov $1797285236,%r8
+ # *(uint32 *) (x + 0) = in0
+ movl %esi,0(%rdi)
+ # *(uint32 *) (x + 20) = in4
+ movl %edx,20(%rdi)
+ # *(uint32 *) (x + 40) = in10
+ movl %ecx,40(%rdi)
+ # *(uint32 *) (x + 60) = in14
+ movl %r8d,60(%rdi)
+ # comment:fp stack unchanged by jump
+ # goto keysetupdone
+ jmp ._keysetupdone
+# kbits128:
+._kbits128:
+ # in10 = *(uint64 *) (k + 0)
+ movq 0(%rsi),%rdx
+ # in12 = *(uint64 *) (k + 8)
+ movq 8(%rsi),%rsi
+ # *(uint64 *) (x + 44) = in10
+ movq %rdx,44(%rdi)
+ # *(uint64 *) (x + 52) = in12
+ movq %rsi,52(%rdi)
+ # in0 = 1634760805
+ mov $1634760805,%rsi
+ # in4 = 824206446
+ mov $824206446,%rdx
+ # in10 = 2036477238
+ mov $2036477238,%rcx
+ # in14 = 1797285236
+ mov $1797285236,%r8
+ # *(uint32 *) (x + 0) = in0
+ movl %esi,0(%rdi)
+ # *(uint32 *) (x + 20) = in4
+ movl %edx,20(%rdi)
+ # *(uint32 *) (x + 40) = in10
+ movl %ecx,40(%rdi)
+ # *(uint32 *) (x + 60) = in14
+ movl %r8d,60(%rdi)
+# keysetupdone:
+._keysetupdone:
+ # leave
+ add %r11,%rsp
+ mov %rdi,%rax
+ mov %rsi,%rdx
+ ret
+# enter ECRYPT_ivsetup
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+ mov %rsp,%r11
+ and $31,%r11
+ add $256,%r11
+ sub %r11,%rsp
+ # iv = arg2
+ mov %rsi,%rsi
+ # x = arg1
+ mov %rdi,%rdi
+ # in6 = *(uint64 *) (iv + 0)
+ movq 0(%rsi),%rsi
+ # in8 = 0
+ mov $0,%r8
+ # *(uint64 *) (x + 24) = in6
+ movq %rsi,24(%rdi)
+ # *(uint64 *) (x + 32) = in8
+ movq %r8,32(%rdi)
+ # leave
+ add %r11,%rsp
+ mov %rdi,%rax
+ mov %rsi,%rdx
+ ret
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 3be4439..bccb76d 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -8,6 +8,8 @@
* and to remove extraneous comments and functions that are not needed.
* - i586 version, renamed as salsa20-i586-asm_32.S
* available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 221356b..b0481f7 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -504,6 +504,21 @@
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
+config CRYPTO_SALSA20_X86_64
+ tristate "Salsa20 stream cipher algorithm (x86_64) (EXPERIMENTAL)"
+ depends on (X86 || UML_X86) && 64BIT
+ depends on EXPERIMENTAL
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_SALSA20
+ help
+ Salsa20 stream cipher algorithm.
+
+ Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
+ Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
+
+ The Salsa20 stream cipher algorithm is designed by Daniel J.
+ Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
+
config CRYPTO_DEFLATE
tristate "Deflate compression algorithm"
select CRYPTO_ALGAPI