/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK_PI(memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
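	/* 16..32 bytes: the 16-byte accesses at [src] and [srcend - 16] overlap
	   when count < 32 but together always cover the whole buffer, and both
	   loads are issued before either store. */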
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
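	/* tmp1 = count >> 1 is 0 or 1; byte loads from [src], [src + tmp1] and
	   [srcend - 1] cover every count in 1..3 (for count == 1 all three hit
	   the same byte), so no further branching is needed. */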
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
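	/* Every source byte is loaded into registers before the first store,
	   so the 33..128 byte paths are also safe for overlapping memmove. */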
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
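	/* Forward copying is only unsafe when dst lies inside the source
	   buffer, i.e. when (dstin - src) as an unsigned value is below count.
	   dst == src needs no copy at all and returns via L(copy0). */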
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */
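	/* dst is rounded down to a 16-byte boundary and src is biased down by
	   the same amount, so matching offsets keep src and dst in step.  The
	   first 16 destination bytes are written from D up front; the loop may
	   rewrite some of them with identical data, which is harmless. */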

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

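	/* Software pipelined: each iteration stores the four pairs loaded on
	   the previous iteration while loading the next four, keeping 64 bytes
	   in flight.  Whatever the loop leaves over is finished by the
	   unconditional 64-byte tail copy below. */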
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
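	/* The final 64 destination bytes are always written from the last 64
	   source bytes, regardless of how much the loop left over, so no
	   variable-length tail handling is needed. */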
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
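	/* Mirror of the forward path: dstend is rounded down to a 16-byte
	   boundary and srcend is biased down by the same amount, after the
	   final 16 bytes have been captured in D.  Walking from high to low
	   addresses guarantees each source byte is read before any store can
	   overwrite it when the destination starts inside the source buffer. */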
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

SYM_FUNC_END_PI(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_END_ALIAS_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)