########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## The input buffer is not assumed to be 16-byte aligned, so every load
## from it goes through the unaligned move form.
#define	MOVDQ movdqu

################################ Define Macros

# addm mem, reg
# Accumulate a 32-bit memory word into a register and write the sum back:
#	reg += mem
#	mem  = reg
# After the macro both operands hold the sum.  Flags are clobbered by add.
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

# COPY_XMM_AND_BSWAP xmm, mem, byte_flip_mask
# Load 16 bytes from mem into xmm with MOVDQ (unaligned load), then
# permute the bytes with pshufb using byte_flip_mask.  With the
# PSHUFFLE_BYTE_FLIP_MASK constant this reverses the byte order inside
# each of the four dwords.
#	p1 = destination xmm register
#	p2 = source memory operand
#	p3 = xmm register holding the pshufb control mask
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ	\p2, \p1
	pshufb	\p3, \p1
.endm

################################
# Register and stack-frame assignments.
#
# All names below are assembler symbols, not fixed registers: the
# rotate_Xs and ROTATE_ARGS macros reassign X0..X3 and a..h at assembly
# time, so the bindings listed here are only the initial ones.

# Message schedule: sixteen 32-bit words, four per register.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# SIMD scratch used while computing the next four schedule words.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9		# K + W staging value, spilled to _XFER(%rsp)

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP (INP is spilled to _INP(%rsp) first)
c = %ecx
d = %r8d
e = %edx		# low half of NUM_BLKS, which is consumed before e is loaded
TBL = %r12		# walking pointer into K256
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

# Integer scratch for the round function.
y0 = %r13d
y1 = %r14d
y2 = %r15d



# Stack frame layout, as offsets from the 16-byte aligned %rsp.
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0				# pointer to the end of the input data
_INP = _INP_END + _INP_END_SIZE		# saved INP for the current block
_XFER = _INP + _INP_SIZE		# four K + W dwords, 16-byte aligned
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Assembly-time renaming only, no instructions are emitted.  The register
# that was X0 (just refilled with the newest four schedule words) becomes
# X3, and the others each move down one name.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Assembly-time renaming only, no instructions are emitted.  The register
# named h (which the round code has just updated in place) becomes the
# new a, and every other working variable moves one name along
# (a -> b, b -> c, ... g -> h), so no data has to be moved between rounds.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Perform four SHA-256 rounds and, interleaved with them, compute the
# next four message schedule words.
#
# In:	a..h		working variables
#	X0..X3		W[-16..-13], W[-12..-9], W[-8..-5], W[-4..-1]
#	_XFER(%rsp)	four dwords of K + W for these four rounds
#	SHUF_00BA, SHUF_DC00	pshufb masks
# Out:	a..h renamed four times by ROTATE_ARGS; the register that was X0
#	holds the four new schedule words and is renamed to X3 by
#	rotate_Xs.
# Clobbers: y0, y1, y2, XTMP0..XTMP4, flags.
#
# Notation: in the comments on the integer instructions, ">>" stands for
# a 32-bit rotate right (the instruction is ror).  In the comments on the
# SIMD instructions, "ror" is a rotate and ">>" is a logical shift.
# The scalar and SIMD instruction streams are independent of each other
# and are interleaved by hand; keep the instruction order as it is.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa	X1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1		# XTMP1 = W[-15] << (32-7)
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2		# XTMP2 = W[-15] >> 7
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(22-13), y1		# y1 = a >> (22-13)
	pslld	$(32-18), XTMP3		# XTMP3 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	psrld	$18, XTMP2		# XTMP2 = W[-15] >> 18
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP1		# fold in the high part of W[-15] ror 18
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0		# y0 = e >> (25-11)
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	# Each dword is duplicated within its qword, so a 64-bit logical
	# shift leaves the 32-bit rotate in the low dword of each qword.
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0    = W[-2] {DDCC}
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## DO_ROUND round
## Perform one SHA-256 round with no message scheduling.
## input is the K + W dword at [rsp + _XFER + round * 4], round = 0..3
## In/out:   a..h (renamed once by ROTATE_ARGS at the end)
## Clobbers: y0, y1, y2, flags, and the assembler symbol "offset"
## In the comments ">>" stands for a 32-bit rotate right (ror).
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Processes "blocks" consecutive 64-byte blocks from data and updates
## the eight state words in place.  Returns immediately, leaving the
## state untouched, when the block count is zero.
##
## Saves and restores: rbx, r12, r13, r14, r15, rbp.
## Clobbers: rax, rcx, rdx, rsi, r8-r11, xmm0-xmm12, flags.
## Stack: STACK_SIZE bytes below a 16-byte aligned rsp; rbp keeps the
## entry rsp so the frame can be dropped regardless of the realignment.
########################################################################
.text
SYM_FUNC_START(sha256_transform_ssse3)
	# No alignment directive here: padding placed after the entry label
	# would be executed on every call and would not align the entry.
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$~15, %rsp		# movdqa to _XFER(%rsp) needs 16-byte alignment

	# NOTE(review): the prototype above declares the count as int, but
	# the full 64-bit register is shifted and tested here.  Confirm that
	# callers never leave stale upper 32 bits in the third argument.
	shl	$6, NUM_BLKS		# convert to bytes
	jz	.Ldone_hash
	add	INP, NUM_BLKS
	mov	NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	## (e lives in the low half of the NUM_BLKS register, which has
	## already been spilled to _INP_END above)
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa	_SHUF_00BA(%rip), SHUF_00BA
	movdqa	_SHUF_DC00(%rip), SHUF_DC00

	## one iteration per 64-byte input block
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# SRND reuses the INP register below

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	movdqa	(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	1*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	2*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	3*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## final 16 rounds: the schedule is complete, so only add K and
	## run the plain rounds, eight per iteration
	mov	$2, SRND
.Lloop2:
	paddd	(TBL), X0
	movdqa	X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd	1*16(TBL), X1
	movdqa	X1, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa	X2, X0
	movdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## add the working variables back into the state
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

# SHA-256 round constants, 64 dwords.  Read with movdqa/paddd memory
# operands, so the table must stay at least 16-byte aligned.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# pshufb control that reverses the byte order inside each dword.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
# Moves dwords 0 and 2 into dwords 0 and 1; the 0xFF control bytes make
# pshufb zero the upper two dwords.
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
# Moves dwords 0 and 2 into dwords 2 and 3; the 0xFF control bytes make
# pshufb zero the lower two dwords.
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF