/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
| 9 | |
#include <linux/linkage.h>

/*
 * Register aliases (AT&T syntax, expanded by the C preprocessor).
 *
 * PASSn_SUMS: 64-bit accumulators for NH pass n.  Each register carries two
 *	       partial sums that are only added together at the end of
 *	       nh_sse2().
 */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

/* K0-K3: 16-byte key strides; their roles rotate from stride to stride. */
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

/* T0-T7: scratch registers. */
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/*
 * Arguments of nh_sse2(), in the first four integer argument registers of
 * the System V AMD64 calling convention.
 */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx
| 32 | |
/*
 * _nh_stride - feed one 16-byte message stride into all four NH passes
 *
 * \k0, \k1, \k2: registers already holding the key strides used by passes
 *		  0, 1 and 2 for this message stride
 * \k3:		  register that receives the key stride loaded from
 *		  \offset(KEY); used by pass 3
 * \offset:	  byte offset applied to both MESSAGE and KEY
 *
 * For each pass the four 32-bit message words are added to the matching key
 * words, then word 0 is multiplied by word 2 and word 1 by word 3
 * (32x32 => 64) and the two products are added to that pass's accumulator.
 *
 * On exit \k0 is destroyed, \k1-\k3 still hold their key strides, and T1-T7
 * are clobbered.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// pmuludq only multiplies dwords 0 and 2 of its operands.  The $0x10
	// shuffle puts source words (0, 1) there and the $0x32 shuffle puts
	// source words (2, 3) there, so each pmuludq yields
	// (word0 * word2, word1 * word3).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm
| 67 | |
/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:     KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:    32 bytes written to HASH: one 64-bit sum per pass, passes 0-3
 * Clobb:  KEY, MESSAGE, MESSAGE_LEN, %xmm0-%xmm15, flags
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides; each _nh_stride loads the
	// fourth one it needs.  KEY is advanced past the preloaded strides so
	// that the same offset addresses both the message stride and the key
	// stride that gets loaded alongside it.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// MESSAGE_LEN is kept biased by -0x40 while looping so that the sign
	// of the subtraction result doubles as the loop condition.
	// NOTE(review): jl/jge are signed compares, so this presumably relies
	// on message_len being well below 2^63 -- confirm against callers.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// Four strides per iteration.  The key registers rotate by one each
	// stride, so after four strides K0-K2 again hold the next three key
	// strides.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is now (bytes remaining - 0x40).  Its low 6 bits equal
	// the bytes remaining, which is 0x00, 0x10, 0x20 or 0x30 given that
	// message_len % 16 == 0.  Handle up to three trailing strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)