/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
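/*
 * For reference, the C-level prototype (as declared in asm/checksum.h)
 * is roughly:
 *
 *	__wsum __csum_partial(const void *buff, int len, __wsum sum);
 *
 * The running sum is accumulated in 64 bits and folded back down to
 * 32 bits before returning.
 */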
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
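	/*
	 * To that end, each adde below is interleaved with the loads for
	 * the following iteration, so the serialised carry chain rather
	 * than the memory accesses sets the pace.
	 */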
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

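	/*
	 * Reduce the 64-bit accumulator to a 32-bit checksum.  In rough C,
	 * the three instructions after the addze below amount to:
	 *
	 *	tmp = rotl64(sum, 32) + sum;
	 *	return tmp >> 32;	(hi + lo, with end-around carry)
	 */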
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


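/*
 * Exception table helpers for the copying variant below.  source/dest
 * mark loads/stores made while the stack frame of the unrolled loop is
 * live, so a fault has to unwind it via .Lerror; the "nr" variants mark
 * accesses made outside that frame and branch straight to .Lerror_nr.
 */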
	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
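/*
 * For reference, the C-level view (as declared in asm/checksum.h) is
 * roughly:
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst, int len);
 *
 * There is no incoming sum argument: seeding the accumulator with
 * 0xffffffff means a successful copy can never legitimately return 0,
 * leaving 0 free to signal a faulting access.
 */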
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

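/*
 * Fault handlers: .Lerror unwinds the stack frame set up for the
 * unrolled loop and falls through to .Lerror_nr, which returns 0 to
 * report the access fault (see the function comment above).
 */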
.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

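/*
 * The two 128-bit addresses are summed as four 64-bit loads; len, proto
 * and sum are accumulated via r5 (byte-rotated on little-endian to match
 * the pseudo-header byte order); the 64-bit total is then folded down to
 * 16 bits and complemented to give the final __sum16 in r3.
 */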
_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32	/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16	/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)