blob: d9b734d0c8cc78ecdc3293ca117546eb538a12fc [file] [log] [blame]
/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
45
Sandy Wu57ae1b02013-03-28 17:05:44 -070046#include <asm/inst.h>
Jussi Kivilinna698a5ab2013-01-19 13:39:21 +020047#include <linux/linkage.h>
David Woodhouse9697fa32018-01-11 21:46:27 +000048#include <asm/nospec-branch.h>
Jussi Kivilinna698a5ab2013-01-19 13:39:21 +020049
Tim Chen6a8ce1e2012-09-27 15:44:22 -070050## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
51
## LABEL prefix n
## Emit the label "<prefix><n>:".  Invoked under .altmacro as
## "LABEL crc_ %i" so that the current value of an assembler symbol
## is pasted into the label name.
.macro LABEL prefix n
\prefix\n\():
.endm
55
## JMPTBL_ENTRY i
## Emit one 16-bit jump-table entry: the offset of label crc_<i>
## from the start of crc_array.
.macro JMPTBL_ENTRY i
.word crc_\i - crc_array
.endm
59
## JNC_LESS_THAN j
## Emit "jnc less_than_<j>": branch to the next smaller size class when
## the length bit just shifted into the carry flag is clear.
.macro JNC_LESS_THAN j
        jnc     less_than_\j
.endm
63
## Threshold below which buffers are considered "small" and routed to the
## more efficient "by-1" code.  That code shifts the length left by 32-8
## bits and therefore only sees the low 8 bits of it, so it handles at most
## 255 bytes and SMALL_SIZE can be no larger than 255.

#define SMALL_SIZE 200

.if (SMALL_SIZE > 255)
.error "SMALL_SIZE must be < 256"
.endif
73
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
#
## Arguments as read by the code below:
##   %rdi = buffer, %esi = len (32-bit), %edx = crc_init
## Result:
##   %rax = updated CRC (taken from crc_init / %r8)
## Registers:
##   %rbx, %rdi and %rsi are pushed on entry and restored before ret.
##   %rax, %rcx, %rdx, %r8-%r11, %xmm0-%xmm2 and the flags are overwritten.
##
## Outline:
##   1) CRC the leading bytes one at a time until the pointer is 8-byte
##      aligned.
##   2) While at least SMALL_SIZE bytes remain, split up to 128*24 bytes
##      into three equal blocks and run three independent crc32q chains
##      over them (entered through jump_table into crc_array).
##   3) Merge the three partial CRCs with PCLMULQDQ and constants from
##      K_table.
##   4) Finish the tail by peeling off power-of-two sized pieces, driven
##      by shifting the length bits out of len_dw one at a time.

.text
ENTRY(crc_pcl)
#define    bufp         %rdi
#define    bufp_dw      %edi
#define    bufp_w       %di
#define    bufp_b       %dil
#define    bufptmp      %rcx
#define    block_0      %rcx
#define    block_1      %rdx
#define    block_2      %r11
#define    len          %rsi
#define    len_dw       %esi
#define    len_w        %si
#define    len_b        %sil
#define    crc_init_arg %rdx
#define    tmp          %rbx
#define    crc_init     %r8
#define    crc_init_dw  %r8d
#define    crc1         %r9
#define    crc2         %r10

        pushq   %rbx
        pushq   %rdi
        pushq   %rsi

        ## len is a 32-bit argument, so the upper half of the 64-bit
        ## register is not guaranteed to be zero on entry.  Zero-extend it
        ## once here, because the compares, the subtraction and the copy
        ## into tmp below all operate on the full 64-bit register.
        mov     len_dw, len_dw

        ## Move crc_init to a different register: the argument register
        ## doubles as block_1 and is also overwritten by the mul below.
        mov     crc_init_arg, crc_init

        ################################################################
        ## 1) ALIGN:
        ################################################################

        mov     bufp, bufptmp           # bufptmp = buffer pointer
        neg     bufp
        and     $7, bufp                # bufp = number of bytes needed to
                                        # reach 8-byte alignment
        je      proc_block              # Skip if aligned

        ## When len is less than 8 and the buffer is unaligned, jump to
        ## the byte-granular tail code so that the 8-byte load in do_align
        ## cannot read beyond the end of the buffer.
        cmp     $8, len
        jae     do_align
        # less_than_8 expects length in upper 3 bits of len_dw
        # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
        shl     $32-3+1, len_dw
        jmp     less_than_8_post_shl1

do_align:
        #### Calculate CRC of unaligned bytes of the buffer (if any)
        movq    (bufptmp), tmp          # load a quadword from the buffer
        add     bufp, bufptmp           # align buffer pointer for quadword
                                        # processing
        sub     bufp, len               # update buffer length
align_loop:
        crc32b  %bl, crc_init_dw        # compute crc32 of 1-byte
        shr     $8, tmp                 # get next byte
        dec     bufp
        jne     align_loop

proc_block:

        ################################################################
        ## 2) PROCESS BLOCKS:
        ################################################################

        ## compute num of bytes to be processed
        movq    len, tmp                # save num bytes in tmp

        cmpq    $128*24, len
        jae     full_block

continue_block:
        cmpq    $SMALL_SIZE, len
        jb      small

        ## len < 128*24
        movq    $2731, %rax             # 2731 = ceil(2^16 / 24)
        mul     len_dw                  # 32-bit mul: also overwrites %edx
        shrq    $16, %rax

        ## eax contains floor(bytes / 24) = num 24-byte chunks to do

        ## process rax 24-byte chunks (128 >= rax >= 0)

        ## compute end address of each block
        ## block 0 (base addr + RAX * 8)
        ## block 1 (base addr + RAX * 16)
        ## block 2 (base addr + RAX * 24)
        lea     (bufptmp, %rax, 8), block_0
        lea     (block_0, %rax, 8), block_1
        lea     (block_1, %rax, 8), block_2

        xor     crc1, crc1
        xor     crc2, crc2

        ## branch into array: jump_table[rax] holds the 16-bit offset of
        ## crc_<rax> from crc_array.  bufp and len are reused as scratch;
        ## len is reloaded from tmp at crc_0.
        lea     jump_table(%rip), bufp
        movzxw  (bufp, %rax, 2), len
        lea     crc_array(%rip), bufp
        lea     (bufp, len, 1), bufp
        JMP_NOSPEC bufp

        ################################################################
        ## 2a) PROCESS FULL BLOCKS:
        ################################################################
full_block:
        movl    $128,%eax
        lea     128*8*2(block_0), block_1
        lea     128*8*3(block_0), block_2
        add     $128*8*1, block_0

        xor     crc1,crc1
        xor     crc2,crc2

        # Fall through into top of crc array (crc_128)

        ################################################################
        ## 3) CRC Array:
        ################################################################

        ## Labels crc_128 .. crc_2, each followed by one crc32q per block.
        ## The block pointers address the END of each block, hence the
        ## negative displacements.
crc_array:
        i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
        crc32q   -i*8(block_0), crc_init
        crc32q   -i*8(block_1), crc1
        crc32q   -i*8(block_2), crc2
        i=(i-1)
.endr

        ## Last step (crc_1): the final quadword of block 2 is not fed to
        ## crc2 here; it is xor-ed into the combined value in step 4.
.altmacro
LABEL crc_ %i
.noaltmacro
        crc32q   -i*8(block_0), crc_init
        crc32q   -i*8(block_1), crc1
# SKIP  crc32q   -i*8(block_2), crc2 ; deferred to the combine step

        mov     block_2, block_0        # bufptmp = end of processed data

        ################################################################
        ## 4) Combine three results:
        ################################################################

        lea     (K_table-8)(%rip), bufp         # first entry is for idx 1
        shlq    $3, %rax                        # rax *= 8
        pmovzxdq (bufp,%rax), %xmm0             # 2 consts: K1:K2
        leal    (%eax,%eax,2), %eax             # rax *= 3 (total *24)
        subq    %rax, tmp                       # tmp -= rax*24

        movq    crc_init, %xmm1                 # CRC for block 1
        PCLMULQDQ 0x00,%xmm0,%xmm1              # Multiply by K2

        movq    crc1, %xmm2                     # CRC for block 2
        PCLMULQDQ 0x10, %xmm0, %xmm2            # Multiply by K1

        pxor    %xmm2,%xmm1
        movq    %xmm1, %rax
        xor     -i*8(block_2), %rax             # fold in the deferred quadword
        mov     crc2, crc_init
        crc32   %rax, crc_init

        ################################################################
        ## 5) Check for end:
        ################################################################

LABEL crc_ 0
        mov     tmp, len
        cmp     $128*24, tmp
        jae     full_block
        cmp     $24, tmp
        jae     continue_block

less_than_24:
        shl     $32-4, len_dw                   # less_than_16 expects length
                                                # in upper 4 bits of len_dw
        jnc     less_than_16
        crc32q  (bufptmp), crc_init
        crc32q  8(bufptmp), crc_init
        jz      do_return                       # ZF still comes from the shl
        add     $16, bufptmp
        # len is less than 8 if we got here
        # less_than_8 expects length in upper 3 bits of len_dw
        # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
        shl     $2, len_dw
        jmp     less_than_8_post_shl1

        #######################################################################
        ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
        #######################################################################

        ## Each stage shifts the next length bit into CF.  When the bit is
        ## set, j bytes are consumed with crc32q.  The jz after the crc32q
        ## run tests ZF as left by the shl (no instruction in between
        ## writes it in this code), i.e. no length bits remain.
small:
        shl     $32-8, len_dw                   # Prepare len_dw for less_than_256
        j=256
.rept 5                                         # j = {256, 128, 64, 32, 16}
.altmacro
LABEL less_than_ %j                             # less_than_j: Length should be in
                                                # upper lg(j) bits of len_dw
        j=(j/2)
        shl     $1, len_dw                      # Get next MSB
        JNC_LESS_THAN %j
.noaltmacro
        i=0
.rept (j/8)
        crc32q  i(bufptmp), crc_init            # Compute crc32 of 8-byte data
        i=i+8
.endr
        jz      do_return                       # Return if remaining length is zero
        add     $j, bufptmp                     # Advance buf
.endr

less_than_8:                                    # Length should be stored in
                                                # upper 3 bits of len_dw
        shl     $1, len_dw
less_than_8_post_shl1:
        jnc     less_than_4
        crc32l  (bufptmp), crc_init_dw          # CRC of 4 bytes
        jz      do_return                       # return if remaining data is zero
        add     $4, bufptmp
less_than_4:                                    # Length should be stored in
                                                # upper 2 bits of len_dw
        shl     $1, len_dw
        jnc     less_than_2
        crc32w  (bufptmp), crc_init_dw          # CRC of 2 bytes
        jz      do_return                       # return if remaining data is zero
        add     $2, bufptmp
less_than_2:                                    # Length should be stored in the MSB
                                                # of len_dw
        shl     $1, len_dw
        jnc     less_than_1
        crc32b  (bufptmp), crc_init_dw          # CRC of 1 byte
less_than_1:                                    # Length should be zero
do_return:
        movq    crc_init, %rax
        popq    %rsi
        popq    %rdi
        popq    %rbx
        ret
ENDPROC(crc_pcl)
Tim Chen6a8ce1e2012-09-27 15:44:22 -0700315
.section        .rodata, "a", @progbits
        ################################################################
        ## Jump table: 129 entries x 2 bytes each.
        ## Entry i is the 16-bit offset of label crc_<i> from crc_array,
        ## for i = 0 .. 128.  crc_pcl indexes it with the number of
        ## 24-byte chunks to process.
        ################################################################
.align 4
jump_table:
        i=0
.rept 129
.altmacro
JMPTBL_ENTRY %i
.noaltmacro
        i=i+1
.endr
Jussi Kivilinna698a5ab2013-01-19 13:39:21 +0200329
Jussi Kivilinna698a5ab2013-01-19 13:39:21 +0200330
Tim Chen6a8ce1e2012-09-27 15:44:22 -0700331 ################################################################
332 ## PCLMULQDQ tables
George Spelvin473946e2014-06-06 23:08:58 -0400333 ## Table is 128 entries x 2 words (8 bytes) each
Tim Chen6a8ce1e2012-09-27 15:44:22 -0700334 ################################################################
George Spelvin473946e2014-06-06 23:08:58 -0400335.align 8
Tim Chen6a8ce1e2012-09-27 15:44:22 -0700336K_table:
George Spelvin473946e2014-06-06 23:08:58 -0400337 .long 0x493c7d27, 0x00000001
338 .long 0xba4fc28e, 0x493c7d27
339 .long 0xddc0152b, 0xf20c0dfe
340 .long 0x9e4addf8, 0xba4fc28e
341 .long 0x39d3b296, 0x3da6d0cb
342 .long 0x0715ce53, 0xddc0152b
343 .long 0x47db8317, 0x1c291d04
344 .long 0x0d3b6092, 0x9e4addf8
345 .long 0xc96cfdc0, 0x740eef02
346 .long 0x878a92a7, 0x39d3b296
347 .long 0xdaece73e, 0x083a6eec
348 .long 0xab7aff2a, 0x0715ce53
349 .long 0x2162d385, 0xc49f4f67
350 .long 0x83348832, 0x47db8317
351 .long 0x299847d5, 0x2ad91c30
352 .long 0xb9e02b86, 0x0d3b6092
353 .long 0x18b33a4e, 0x6992cea2
354 .long 0xb6dd949b, 0xc96cfdc0
355 .long 0x78d9ccb7, 0x7e908048
356 .long 0xbac2fd7b, 0x878a92a7
357 .long 0xa60ce07b, 0x1b3d8f29
358 .long 0xce7f39f4, 0xdaece73e
359 .long 0x61d82e56, 0xf1d0f55e
360 .long 0xd270f1a2, 0xab7aff2a
361 .long 0xc619809d, 0xa87ab8a8
362 .long 0x2b3cac5d, 0x2162d385
363 .long 0x65863b64, 0x8462d800
364 .long 0x1b03397f, 0x83348832
365 .long 0xebb883bd, 0x71d111a8
366 .long 0xb3e32c28, 0x299847d5
367 .long 0x064f7f26, 0xffd852c6
368 .long 0xdd7e3b0c, 0xb9e02b86
369 .long 0xf285651c, 0xdcb17aa4
370 .long 0x10746f3c, 0x18b33a4e
371 .long 0xc7a68855, 0xf37c5aee
372 .long 0x271d9844, 0xb6dd949b
373 .long 0x8e766a0c, 0x6051d5a2
374 .long 0x93a5f730, 0x78d9ccb7
375 .long 0x6cb08e5c, 0x18b0d4ff
376 .long 0x6b749fb2, 0xbac2fd7b
377 .long 0x1393e203, 0x21f3d99c
378 .long 0xcec3662e, 0xa60ce07b
379 .long 0x96c515bb, 0x8f158014
380 .long 0xe6fc4e6a, 0xce7f39f4
381 .long 0x8227bb8a, 0xa00457f7
382 .long 0xb0cd4768, 0x61d82e56
383 .long 0x39c7ff35, 0x8d6d2c43
384 .long 0xd7a4825c, 0xd270f1a2
385 .long 0x0ab3844b, 0x00ac29cf
386 .long 0x0167d312, 0xc619809d
387 .long 0xf6076544, 0xe9adf796
388 .long 0x26f6a60a, 0x2b3cac5d
389 .long 0xa741c1bf, 0x96638b34
390 .long 0x98d8d9cb, 0x65863b64
391 .long 0x49c3cc9c, 0xe0e9f351
392 .long 0x68bce87a, 0x1b03397f
393 .long 0x57a3d037, 0x9af01f2d
394 .long 0x6956fc3b, 0xebb883bd
395 .long 0x42d98888, 0x2cff42cf
396 .long 0x3771e98f, 0xb3e32c28
397 .long 0xb42ae3d9, 0x88f25a3a
398 .long 0x2178513a, 0x064f7f26
399 .long 0xe0ac139e, 0x4e36f0b0
400 .long 0x170076fa, 0xdd7e3b0c
401 .long 0x444dd413, 0xbd6f81f8
402 .long 0x6f345e45, 0xf285651c
403 .long 0x41d17b64, 0x91c9bd4b
404 .long 0xff0dba97, 0x10746f3c
405 .long 0xa2b73df1, 0x885f087b
406 .long 0xf872e54c, 0xc7a68855
407 .long 0x1e41e9fc, 0x4c144932
408 .long 0x86d8e4d2, 0x271d9844
409 .long 0x651bd98b, 0x52148f02
410 .long 0x5bb8f1bc, 0x8e766a0c
411 .long 0xa90fd27a, 0xa3c6f37a
412 .long 0xb3af077a, 0x93a5f730
413 .long 0x4984d782, 0xd7c0557f
414 .long 0xca6ef3ac, 0x6cb08e5c
415 .long 0x234e0b26, 0x63ded06a
416 .long 0xdd66cbbb, 0x6b749fb2
417 .long 0x4597456a, 0x4d56973c
418 .long 0xe9e28eb4, 0x1393e203
419 .long 0x7b3ff57a, 0x9669c9df
420 .long 0xc9c8b782, 0xcec3662e
421 .long 0x3f70cc6f, 0xe417f38a
422 .long 0x93e106a4, 0x96c515bb
423 .long 0x62ec6c6d, 0x4b9e0f71
424 .long 0xd813b325, 0xe6fc4e6a
425 .long 0x0df04680, 0xd104b8fc
426 .long 0x2342001e, 0x8227bb8a
427 .long 0x0a2a8d7e, 0x5b397730
428 .long 0x6d9a4957, 0xb0cd4768
429 .long 0xe8b6368b, 0xe78eb416
430 .long 0xd2c3ed1a, 0x39c7ff35
431 .long 0x995a5724, 0x61ff0e01
432 .long 0x9ef68d35, 0xd7a4825c
433 .long 0x0c139b31, 0x8d96551c
434 .long 0xf2271e60, 0x0ab3844b
435 .long 0x0b0bf8ca, 0x0bf80dd2
436 .long 0x2664fd8b, 0x0167d312
437 .long 0xed64812d, 0x8821abed
438 .long 0x02ee03b2, 0xf6076544
439 .long 0x8604ae0f, 0x6a45d2b2
440 .long 0x363bd6b3, 0x26f6a60a
441 .long 0x135c83fd, 0xd8d26619
442 .long 0x5fabe670, 0xa741c1bf
443 .long 0x35ec3279, 0xde87806c
444 .long 0x00bcf5f6, 0x98d8d9cb
445 .long 0x8ae00689, 0x14338754
446 .long 0x17f27698, 0x49c3cc9c
447 .long 0x58ca5f00, 0x5bd2011f
448 .long 0xaa7c7ad5, 0x68bce87a
449 .long 0xb5cfca28, 0xdd07448e
450 .long 0xded288f8, 0x57a3d037
451 .long 0x59f229bc, 0xdde8f5b9
452 .long 0x6d390dec, 0x6956fc3b
453 .long 0x37170390, 0xa3e3e02c
454 .long 0x6353c1cc, 0x42d98888
455 .long 0xc4584f5c, 0xd73c7bea
456 .long 0xf48642e9, 0x3771e98f
457 .long 0x531377e2, 0x80ff0093
458 .long 0xdd35bc8d, 0xb42ae3d9
459 .long 0xb25b29f2, 0x8fe4c34d
460 .long 0x9a5ede41, 0x2178513a
461 .long 0xa563905d, 0xdf99fc11
462 .long 0x45cddf4e, 0xe0ac139e
463 .long 0xacfa3103, 0x6c23e841
464 .long 0xa51b6135, 0x170076fa