/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
16
/*
 * Copy 16 bytes as four words from 4(r4)..19(r4) to 4(r6)..19(r6).
 * The final lwzu/stwu advance r4 and r6 by 16, so the macro can be
 * repeated back to back.  Uses r7-r10 as scratch.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
26
/*
 * Same 16-byte copy as COPY_16_BYTES, but every load and store gets
 * its own numeric label so that COPY_16_BYTES_EXCODE(n) can emit an
 * exception-table entry for it.  For group n the four loads are
 * labelled 8n0..8n3 and the four stores 8n4..8n7.
 * Uses r7-r10 as scratch; advances r4 and r6 by 16.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)
44
/*
 * Fixup code and exception-table entries for COPY_16_BYTES_WITHEX(n).
 *
 * 9n0 handles a fault on one of the four loads (8n0..8n3) and 9n1 a
 * fault on one of the four stores (8n4..8n7).  Both subtract the
 * 16 * n bytes of the current cache line that earlier groups already
 * copied from r5, then branch to the common read-fault (104f) or
 * write-fault (105f) handler, which must follow the expansion.
 * The macro leaves the assembler in the .text section.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
63
/* Source-file debug records (N_SO stabs), both pointing at label 0 below. */
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy32.S",N_SO,0,0,0f
0:

/* Cache-line geometry used by the routines in this file. */
CACHELINE_BYTES = L1_CACHE_LINE_SIZE
LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE
CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1)
72
/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * In:	 r3 = destination, r4 = length in bytes
 * Uses: r0, r4-r9, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4		/* r5 = byte count */
	li	r4,0		/* r4 = the zero value that gets stored */
	addi	r6,r3,-4	/* bias the pointer for stwu ...,4(r6) */
	cmplwi	0,r5,4
	blt	7f		/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)	/* first word, possibly unaligned */
	beqlr			/* exactly 4 bytes (cr0 from the cmplwi) */
	andi.	r0,r6,3		/* r0 = misalignment of that first word */
	add	r5,r0,r5	/* r5 = bytes from the rounded-down r6 to the end */
	subf	r6,r0,r6	/* round r6 down to a word boundary */
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
	add	r8,r7,r5	/* r8 = bytes from the start of that line to the end */
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f		/* none: plain word loop */
	xori	r0,r7,CACHELINE_MASK & ~3	/* bytes left in this line after the word at r6 */
	srwi.	r0,r0,2		/* ... as a word count */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)	/* zero up to the next cache-line boundary */
	bdnz	4b
3:	mtctr	r9
	li	r7,4		/* dcbz index: r6 + 4 is the start of the next line */
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
	/* No dcbz here on 8xx: zero the line with ordinary word stores. */
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
	/*
	 * The loop advances by CACHELINE_BYTES (= L1_CACHE_LINE_SIZE), so
	 * the number of stores has to be keyed on that same constant.
	 */
#if L1_CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* L1_CACHE_LINE_SIZE */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left in the last, partial line */
	addi	r5,r5,4		/* +4: the bdz below skips one word */
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f		/* the word at r6 is already done */
1:	stwu	r4,4(r6)	/* remaining whole words */
	bdnz	1b
6:	andi.	r5,r5,3		/* trailing bytes */
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3		/* re-bias the pointer for stbu ...,1(r6) */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
133
/*
 * memset: fill a region with a byte value.
 *
 * In:	 r3 = destination, r4 = fill value (low byte), r5 = length in bytes
 * Uses: r0, r4-r6, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23	/* copy the low byte into bits 16-23 */
	rlwimi	r4,r4,16,0,15	/* copy the low halfword into the high halfword */
	addi	r6,r3,-4	/* bias the pointer for stwu ...,4(r6) */
	cmplwi	0,r5,4
	blt	7f		/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)	/* first word, possibly unaligned */
	beqlr			/* exactly 4 bytes (cr0 from the cmplwi) */
	andi.	r0,r6,3		/* r0 = misalignment of that first word */
	add	r5,r0,r5	/* r5 = bytes from the rounded-down r6 to the end */
	subf	r6,r0,r6	/* round r6 down to a word boundary */
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f		/* the word at r6 is already done */
1:	stwu	r4,4(r6)	/* remaining whole words */
	bdnz	1b
6:	andi.	r5,r5,3		/* trailing bytes */
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3		/* re-bias the pointer for stbu ...,1(r6) */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
158
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap;
 * overlapping regions are handed to memcpy instead.
 * -- paulus.
 *
 * In:	 r3 = destination, r4 = source, r5 = length in bytes
 * Uses: r0, r4-r11, ctr, cr0, cr1.  r3 is not modified.
 */
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7			/* src < dst + len ? */
	cmplw	1,r3,r8			/* dst < src + len ? */
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4		/* bias both pointers for the */
	addi	r6,r3,-4		/* lwzu/stwu ...,4(rX) idiom */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5		/* r5 = bytes left once dest is line-aligned */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes after the last whole line */
	li	r11,4			/* dcbz index: r6 + 4 is the line start */
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6			/* establish the dest line without reading it */
#endif
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr
239
/*
 * memmove: copy that tolerates overlapping regions.
 *
 * In:	 r3 = destination, r4 = source, r5 = length in bytes
 *
 * If the destination is above the source the copy runs from the end
 * (backwards_memcpy); otherwise it falls straight through into memcpy,
 * which must therefore stay immediately below.
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

/*
 * memcpy: forward copy, two words per iteration once the destination
 * is word aligned.
 *
 * In:	 r3 = destination, r4 = source, r5 = length in bytes
 * Uses: r0, r4-r8, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(memcpy)
	srwi.	r7,r5,3		/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4	/* bias both pointers for the */
	addi	r4,r4,-4	/* lwzu/stwu ...,4(rX) idiom */
	beq	2f		/* if less than 8 bytes to do */
	andi.	r0,r6,3		/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7		/* r5 = bytes left after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)	/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3		/* re-bias the pointers for lbzu/stbu ...,1(rX) */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)	/* trailing bytes */
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4		/* r0 = bytes needed to word-align the dest */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5	/* account for the alignment bytes */
	rlwinm.	r7,r5,32-3,3,31	/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b
285
/*
 * backwards_memcpy: copy from the end of the region towards the start,
 * two words per iteration once the destination end is word aligned.
 * memmove branches here when the destination is above the source.
 *
 * In:	 r3 = destination, r4 = source, r5 = length in bytes
 * Uses: r0, r4-r8, ctr, cr0.  r3 is not modified.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5		/* r6 = one past the end of the dest */
	add	r4,r4,r5		/* r4 = one past the end of the source */
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* r0 = bytes needed to word-align the dest end */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* r5 = bytes left after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)		/* leading bytes, copied last */
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)		/* byte copy until the dest is word aligned */
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5		/* account for the alignment bytes */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b
321
/*
 * __copy_tofrom_user: copy where every load and store has an
 * exception-table entry, so a fault ends up in the fixup code below
 * instead of being fatal.
 *
 * In:	 r3 = destination, r4 = source, r5 = length in bytes
 * Out:	 r3 = number of bytes not copied (0 if everything was copied)
 * Uses: r0, r3-r11, ctr, cr0.
 *
 * Fault handling, as implemented by the fixup code:
 *  - write fault (or fault on dcbz): return the count of bytes not copied.
 *  - read fault: retry the remainder one byte at a time, then zero
 *    whatever part of the destination could not be filled, and return
 *    the count of bytes that were zeroed rather than copied.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4		/* bias both pointers for the */
	addi	r6,r3,-4		/* lwzu/stwu ...,4(rX) idiom */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
	/*
	 * r5 is only reduced after the byte loop; the fixup at 90 relies
	 * on r5 still holding the full count while that loop runs.
	 */
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes after the last whole line */
	li	r11,4			/* dcbz index: r6 + 4 is the line start */
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0			/* the fixup at 92 adds r0 to ctr */
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4			/* r3 = dcbt offset from r4 */
	cmpwi	r0,1
	li	r7,0			/* r7 = number of lines prefetched ahead */
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * First pass: copy (lines - r7) lines while prefetching ahead.
	 * r0 keeps the r7 lines that are left for a second pass, which
	 * runs with r0 = 0 and therefore ends the loop.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* any lines held back for the second pass? */
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* success: nothing left uncopied */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5		/* r8 = size of the byte loop; ctr adds back what is left of it */
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* lines left = this pass (ctr) + held back (r0) */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: just return the count */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
543 .text