blob: 34c567bbcb4faa9f570b648330cb3593f67efcd1 [file] [log] [blame]
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/* Per-pass accumulators; each register holds two 64-bit partial sums. */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

/* Rotating window of four 16-byte key strides. */
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

/* Scratch registers. */
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/* The four arguments of nh_sse2(), in the order of its C prototype. */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride for all four passes.
 *
 *   k0, k1, k2	registers holding key strides that were loaded earlier
 *   k3		register that receives the key stride at offset(KEY)
 *   offset	byte displacement applied to both MESSAGE and KEY
 *
 * For pass i the message stride is added to key stride ki as four 32-bit
 * words.  Word 0 is then multiplied by word 2 and word 1 by word 3
 * (32x32 => 64), and the two products are added to the two 64-bit lanes of
 * PASSi_SUMS.
 *
 * Clobbers: k0 (left holding the pass 0 products), k3 (overwritten with the
 * newly loaded key stride) and T1-T7.  k1 and k2 are preserved, which is why
 * callers rotate the K registers by one position per invocation.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate.
	// pshufd $0x10 places words 0 and 1 in the even dword positions and
	// pshufd $0x32 places words 2 and 3 there; pmuludq multiplies only
	// the even dwords, giving word0*word2 and word1*word3.
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:       KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:      four 64-bit sums, for passes 0-3 in that order, stored to the
 *           32 bytes at HASH
 * Clobbers: %rdi, %rsi, %rdx, %xmm0-%xmm15, flags
 *
 * Key usage: 0x30 bytes are read up front, then 0x10 more bytes for every
 * 16-byte message stride that is processed.
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides.  KEY is advanced past them so
	// that the offsets passed to _nh_stride address the fourth key stride
	// onwards.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Main loop: four strides (0x40 message bytes) per iteration.
	// MESSAGE_LEN is biased by -0x40 so the sign of the result doubles as
	// the loop condition.  jl/jge are signed branches, so this relies on
	// message_len being below 2^63.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// After four strides the K register rotation is back where it began.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Only multiples of 0x40 were subtracted, so the low six bits still
	// hold the leftover byte count.  Given message_len % 16 == 0 that is
	// 0x00, 0x10, 0x20 or 0x30, i.e. at most three more strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)