/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
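/*
 * For reference, a minimal C sketch of what this routine computes
 * (illustrative only; variable names are ours and the kernel's C
 * fallback differs):
 *
 *	u64 s = sum;
 *	const u64 *p = buff;
 *	size_t n = len;
 *	while (n >= 8) {
 *		u64 w = *p++;
 *		s += w;
 *		if (s < w)	// carry out of bit 63:
 *			s++;	// wrap it back in (end-around carry)
 *		n -= 8;
 *	}
 *	// the <8 tail bytes are added via the word/halfword/byte
 *	// tails below, then s is folded to 32 bits at .Lcsum_finish
 */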
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a doubleword. Since
         * odd-aligned addresses should be rare and would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         */
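        /*
         * Worked example (assuming at least halfword alignment): for
         * buff 6 bytes past a doubleword boundary, r6 = (6 >> 1) & 0x3
         * = 3, so the loop below runs 4 - 3 = 1 time, consuming one
         * halfword and leaving r3 doubleword aligned.
         */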
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
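        /*
         * Example: len = 256 gives ctr = 256/64 - 1 = 3; three loop
         * iterations plus the 64-byte exit limb cover all 256 bytes,
         * and the "andi. r4,r4,63" afterwards leaves no tail.
         */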
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7, back-to-back adde instructions take
         * 2 cycles because of the XER dependency. This means the fastest
         * this loop can go is 16 cycles per iteration. The scheduling of
         * the loop below has been shown to hit this on both POWER6 and
         * POWER7.
         */
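        /*
         * Concretely, each adde below is separated from the next by
         * independent loads so the XER bubble is hidden, e.g.:
         *	adde	r0,r0,r6	<- serialised on XER (carry)
         *	ld	r12,32(r3)	<- independent, overlaps the bubble
         * Eight addes at 2 cycles each bounds one 64-byte iteration
         * at the 16 cycles quoted above.
         */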
        .align  5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
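        /*
         * A final odd byte occupies the most-significant half of its
         * 16-bit checksum word on big-endian, hence the shift below;
         * on little-endian it is already in the low byte position.
         */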
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

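        /*
         * The fold below is, in C terms (illustrative):
         *	u64 t = (r0 << 32) | (r0 >> 32);  // swap 32-bit halves
         *	return (u32)((t + r0) >> 32);	  // hi + lo, carry folded in
         */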
.Lcsum_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)


        .macro srcnr
100:
        EX_TABLE(100b,.Lsrc_error_nr)
        .endm

        .macro source
150:
        EX_TABLE(150b,.Lsrc_error)
        .endm

        .macro dstnr
200:
        EX_TABLE(200b,.Ldest_error_nr)
        .endm

        .macro dest
250:
        EX_TABLE(250b,.Ldest_error)
        .endm
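/*
 * The macros above attach exception-table entries to the access that
 * follows them. Faults on "source"/"dest" accesses land in handlers
 * that restore r14-r16 and pop the stack frame first; the "nr"
 * (no-restore) variants are for accesses made outside the unrolled
 * loop, where no frame is active.
 */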

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any
 * action required in this case (zeroing memory, recalculating the
 * partial checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
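/*
 * In C terms the contract is roughly (a sketch, not the generic
 * kernel implementation):
 *
 *	memcpy(dst, src, len);			// with fault checks
 *	return csum_partial(dst, len, sum);
 *	// on a source fault: if (src_err) *src_err = -EFAULT;
 *	// on a destination fault: if (dst_err) *dst_err = -EFAULT;
 */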
_GLOBAL(csum_partial_copy_generic)
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a doubleword. Since
         * odd-aligned addresses should be rare and would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         *
         * If the source and destination are misaligned relative to each
         * other, we align only the source. This keeps things simple.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7, back-to-back adde instructions take
         * 2 cycles because of the XER dependency. This means the fastest
         * this loop can go is 16 cycles per iteration. The scheduling of
         * the loop below has been shown to hit this on both POWER6 and
         * POWER7.
         */
        .align  5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

.Lsrc_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
        cmpdi   0,r7,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
        blr

.Ldest_error:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
        cmpdi   0,r8,0
        beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
        blr
EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
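/*
 * In C terms (a sketch; treating each in6_addr as two 64-bit words
 * w0/w1, names ours): accumulate the IPv6 pseudo-header with
 * end-around carries,
 *
 *	s = saddr.w0 + saddr.w1 + daddr.w0 + daddr.w1
 *	  + (u64)len + proto + sum;	// carries folded back in
 *
 * then fold s to 32 and then 16 bits and return its 1's complement
 * (~s) as a __sum16.
 */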

_GLOBAL(csum_ipv6_magic)
        ld      r8, 0(r3)
        ld      r9, 8(r3)
        add     r5, r5, r6              /* len + proto */
        addc    r0, r8, r9
        ld      r10, 0(r4)
        ld      r11, 8(r4)
        adde    r0, r0, r10
        add     r5, r5, r7              /* + sum */
        adde    r0, r0, r11
        adde    r0, r0, r5
        addze   r0, r0
        rotldi  r3, r0, 32              /* fold two 32 bit halves together */
        add     r3, r0, r3
        srdi    r0, r3, 32
        rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
        add     r3, r0, r3
        not     r3, r3
        rlwinm  r3, r3, 16, 16, 31
        blr
EXPORT_SYMBOL(csum_ipv6_magic)