Greg Kroah-Hartman | b244131 | 2017-11-01 15:07:57 +0100 | [diff] [blame] | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 2 | /* NGmemcpy.S: Niagara optimized memcpy. |
| 3 | * |
David S. Miller | 25e5566ed | 2007-10-02 01:03:09 -0700 | [diff] [blame] | 4 | * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) |
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 5 | */ |
| 6 | |
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
/* Kernel build: spare global used as a temporary by the shift/mask loops. */
#define GLOBAL_SPARE	%g7
/*
 * Put %asi back after the block stores: reload it from the byte found at
 * [%g6 + TI_CURRENT_DS], using TMP as scratch.
 */
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
/* Non-kernel build: different spare global, and %asi is reset to ASI_PNF. */
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif
| 20 | |
/* Stack space reserved by the "save" at function entry. */
#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/*
 * Each macro below only receives a default when it has not been defined
 * already, so a wrapper that defines it first overrides the default.
 */

/* ASI loaded into %asi for the cache-line initializing stores. */
#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/*
 * EX_LD/EX_ST wrap every load/store; the second argument names a fixup
 * routine.  The defaults simply emit the instruction and drop the fixup.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

/* Plain load; with MEMCPY_DEBUG the alternate-space form with ASI 0x80. */
#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

/*
 * Twin (16-byte) load.  Only dest0 appears in the emitted ldda; dest1 is
 * not used by the expansion and just documents the second register.
 */
#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * Store used inside the block loops: goes through %asi, unless
 * SIMULATE_NIAGARA_ON_NON_NIAGARA selects an ordinary stx instead.
 */
#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

/* Optional instructions emitted right at function entry; empty by default. */
#ifndef PREAMBLE
#define PREAMBLE
#endif

/* Condition-code set used by the length compares. */
#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
/*
 * Common tail of the NG_ret_* fixup routines below.  Each of them branches
 * here with its result already placed in %i0 (computed in the branch delay
 * slot).  %asi is set to ASI_AIUS and the function then returns.
 *
 * The "restore" has to sit in the delay slot of "ret": the delay slot is
 * the only instruction executed after "ret" before control reaches the
 * caller.  With the "wr" in that slot the "restore" would never run and
 * the caller would resume in this function's register window, so the
 * "wr" is issued first.
 */
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore
/*
 * Fixup for the destination-alignment byte loop: returns %i2 + %i4 + 1.
 *
 * At the fault site %i4 has already been decremented for the byte being
 * copied and %i2 has already had the alignment bytes taken off, so the
 * outstanding count is %i2 + %i4 + 1.  (%i5 is not set up on that path and
 * must not be used here.)
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
/* Fixup: result = %i2 + %g1 (tail bytes plus block-loop bytes outstanding). */
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot: build the result in %i0
ENDPROC(NG_ret_i2_plus_g1)
/* Fixup: result = %i2 + %g1 - 8 (8 bytes of the current block already stored). */
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_8)
/* Fixup: result = %i2 + %g1 - 16. */
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_16)
/* Fixup: result = %i2 + %g1 - 24. */
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_24)
/* Fixup: result = %i2 + %g1 - 32. */
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_32)
/* Fixup: result = %i2 + %g1 - 40. */
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_40)
/* Fixup: result = %i2 + %g1 - 48. */
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_48)
/* Fixup: result = %i2 + %g1 - 56. */
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_56)
/* Fixup: result = %i2 + %i4. */
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_i4)
/* Fixup: result = %i2 + %i4 - 8. */
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_i4_minus_8)
/* Fixup: result = %i2 + 8. */
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_8)
/* Fixup: result = %i2 + 4. */
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_4)
/* Fixup: result = %i2 + 1. */
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_1)
/* Fixup: result = %i2 + %g1 + 1. */
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_plus_1)
/* Fixup: result = %i2 unchanged. */
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0		! delay slot
ENDPROC(NG_ret_i2)
/* Fixup: result = (%i2 & 7) + %i4. */
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0		! delay slot
ENDPROC(NG_ret_i2_and_7_plus_i4)
| 164 | #endif |
| 165 | |
/*
 * FUNC_NAME: entry, size dispatch and destination alignment.
 *
 * After the "save": %i0 = dst, %i1 = src, %i2 = len.  The working
 * destination pointer lives in %o0; %i0 itself is left alone so that it is
 * still available for the final "restore".
 */
	.align		64

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2		! anything set at bit 31 or above ...
	cmp		%g2, 0
	tne		%xcc, 5			! ... raises software trap 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f		! len == 0: straight to the exit
	 or		%o0, %i1, %i3		! %i3 = dst | src
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f		! len < 16: short-copy path
	 or		%i3, %i2, %i3		! (annulled unless taken) fold len in

	/*
	 * The block-copy loops must run at least once after the destination
	 * has been moved up to a 64-byte boundary, and that alignment can
	 * consume as many as 63 bytes.  Lengths below two blocks (128 bytes)
	 * are therefore sent to the medium-size code at 70:.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0		! cc = (dst | src) & 7, tested at 70:

	/*
	 * Large copy from here on:
	 *   %o0 = dst, %i1 = src, %i2 = len (>= 128).
	 * The block loops use %i4/%i5 and %g2/%g3 as temporaries.
	 */
	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Copy single bytes until the destination is 64-byte aligned. */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f		! already aligned
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4		! %i4 = 64 - (dst & 63)
	sub		%i2, %i4, %i2		! charge them to the length up front
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0
| 218 | |
/*
 * The destination is 64-byte aligned at this point; the block loop is
 * picked by source alignment:
 *
 *   - 16-byte aligned: direct block copy (50:).
 *   - 8-byte aligned:  the 16-byte loads are issued 8 bytes early and the
 *     init stores are offset by one register (10:).
 *   - otherwise:       neighbouring words are merged with shifts and ORs,
 *     essentially an integer faligndata (8: and 9: below).
 *
 * Care is needed with init stores: once any part of a cache line has been
 * stored to, the whole line must be stored, or the L2 line contents can
 * end up corrupt.  Each loop iteration writes exactly one 64-byte aligned
 * line in full, which makes this easy to guarantee.
 */
2:
	andcc		%i1, (16 - 1), %i4	! %i4 = src & 15
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1		! src rounded down to 16 bytes

	/* Neither 8-byte nor 16-byte aligned: shift and mask. */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE	! left shift, in bits
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5	! right shift = 64 - left shift
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f		! cc from "cmp %i4, 8": src & 15 > 8
	 nop

/*
 * WORD1 = (WORD1 << POST_SHIFT) | (WORD2 >> PRE_SHIFT)
 * WORD2 = (WORD2 << POST_SHIFT) | (WORD3 >> PRE_SHIFT)
 * WORD3 is only read; TMP is clobbered.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

	/* src & 15 in 1..7: output starts in the first word of each pair. */
8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1		! put the src misalignment back

	/* src & 15 in 9..15: output starts in the second word of each pair. */
9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1		! put the src misalignment back
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 330 | |
10:	/*
	 * Destination 64-byte aligned, source only 8-byte aligned.  The
	 * source pointer has already been moved back by 8, so each twin
	 * load is performed one step ahead and the stores are skewed by one
	 * register.  The 8 is added back to the source after the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1		! undo the earlier 8-byte step back
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 361 | |
50:	/*
	 * Destination 64-byte aligned and source 16-byte aligned: four twin
	 * loads feed eight init stores per 64-byte line, no merging needed.
	 */
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */
| 387 | |
60:
	membar		#Sync

	/*
	 * All block loops end up here.  %i2 holds the bytes that did not
	 * fill a whole 64-byte block; whatever is left is copied one byte
	 * at a time by the loop at 90:.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f		! nothing left: return
	 sub		%o0, %i1, %i3		! %i3 = dst - src, as 90: expects
	ba,a,pt		%XCC, 90f
	 nop
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 399 | |
/*
 * Medium copies: 16 <= len < 128.
 *
 * On arrival at 70: the condition codes still hold (dst | src) & 7, set in
 * the delay slot of the branch that led here.  If both pointers are 8-byte
 * aligned the code falls into 72:, otherwise it goes to 75:.
 *
 * 72: moves 16 bytes per iteration, 73: mops up an 8-byte and a 4-byte
 * piece, and any remaining bytes go to the byte loop at 90:.  Stores
 * address the destination as %i1 + %i3, with %i3 = dst - src.
 */
	.align		64
70:
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3		! %i3 = dst - src

72:
	andn		%i2, 0xf, %i4		! %i4 = bytes moved in 16-byte chunks
	and		%i2, 0xf, %i2		! %i2 = tail (< 16 bytes)
	/*
	 * %i4 is decremented only after the second store.  That way the
	 * fixups named below see a count that still includes the current
	 * chunk: %i2 + %i4 while nothing of it has been stored, and
	 * %i2 + %i4 - 8 once the first half is out.  None of the
	 * instructions between the subcc and the bgu touches the condition
	 * codes, so loop control is unaffected by its position.
	 */
1:	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	subcc		%i4, 0x10, %i4
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0		! an 8-byte piece in the tail?
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0		! a 4-byte piece in the tail?
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f		! done
	 nop
	ba,pt		%xcc, 90f		! 1..3 bytes left: byte loop
	 nop
| 437 | |
75:	/*
	 * Medium copy where dst and src are not both 8-byte aligned.
	 * On entry %i3 = dst - src.
	 *
	 * First the destination is brought to an 8-byte boundary with byte
	 * copies.  If the source is then 8-byte aligned as well, the aligned
	 * code at 72:/73: takes over.  Otherwise (8:) aligned 8-byte source
	 * words are combined in pairs with shifts to form each destination
	 * word.
	 */
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f		! dst already aligned (cc from andcc)
	 sub		%g0, %g1, %g1		! %g1 = 8 - (dst & 7)
	sub		%i2, %g1, %i2		! charge the alignment bytes to len

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0		! %o0 = current dst
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f		! src still misaligned
	 sll		%g1, 3, %g1		! %g1 = src misalignment in bits

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %i3
	andn		%i1, 0x7, %i1		! round src down to 8 bytes
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3		! %i3 = 64 - left shift
	andn		%i2, 0x7, %i4		! %i4 = bytes moved as whole words
	sllx		%g2, %g1, %g2
	/*
	 * %i4 is decremented only after the store, so that the fixup
	 * NG_ret_i2_and_7_plus_i4 still counts the word in flight whether
	 * the load or the store faults.  The "add" between the subcc and
	 * the bgu leaves the condition codes alone.
	 */
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5		! high part of old word | low part of new
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2		! carry the rest into the next word

	srl		%g1, 3, %g1		! misalignment back in bytes
	andcc		%i2, 0x7, %i2		! tail bytes
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1		! src back to its true position
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3		! %i3 = dst - src for the byte loop
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 483 | |
/*
 * Short copies: 0 < len < 16.  %i3 arrives as dst | src | len.  When all
 * three are multiples of 4 the data is moved a word at a time; otherwise
 * the byte loop at 90: does the work.  85: is the common return.
 */
	.align		64
80:
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3		! %i3 = dst - src

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0
David S. Miller | 398d108 | 2006-03-05 16:41:56 -0800 | [diff] [blame] | 499 | |
/*
 * Byte-at-a-time copy of the %i2 bytes at %i1 to %i1 + %i3
 * (%i3 = dst - src), followed by the function return.
 */
	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME