blob: 515f81b002021cb1f7a69fc6b544dbc7015e0e50 [file] [log] [blame]
Greg Kroah-Hartmanb2441312017-11-01 15:07:57 +01001! SPDX-License-Identifier: GPL-2.0
Linus Torvalds1da177e2005-04-16 15:20:36 -07002!
3! Fast SH memcpy
4!
5! by Toshiyasu Morita (tm@netcom.com)
6! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
7! SH5 code Copyright 2002 SuperH Ltd.
8!
9! Entry: ARG0: destination pointer
10! ARG1: source pointer
11! ARG2: byte count
12!
13! Exit: RESULT: destination pointer
14! any other registers in the range r0-r7: trashed
15!
16! Notes: Usually one wants to do small reads and write a longword, but
17! unfortunately it is difficult in some cases to concatenate bytes
18! into a longword on the SH, so this does a longword read and small
19! writes.
20!
21! This implementation makes two assumptions about how it is called:
22!
23! 1.: If the byte count is nonzero, the address of the last byte to be
24! copied is unsigned greater than the address of the first byte to
25! be copied. This could be easily swapped for a signed comparison,
26! but the algorithm used needs some comparison.
27!
28! 2.: When there are two or three bytes in the last word of an 11-or-more
29! bytes memory chunk to be copied, the rest of the word can be read
30! without side effects.
Lucas De Marchi25985ed2011-03-30 22:57:33 -030031! This could be easily changed by increasing the minimum size of
Linus Torvalds1da177e2005-04-16 15:20:36 -070032! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
33! however, this would cost a few extra cycles on average.
34! For SHmedia, the assumption is that any quadword can be read in its
35! entirety if at least one byte is included in the copy.
36
37/* Imported into Linux kernel by Richard Curnow. This is used to implement the
38 __copy_user function in the general case, so it has to be a distinct
39 function from intra-kernel memcpy to allow for exception fix-ups in the
40 event that the user pointer is bad somewhere in the copy (e.g. due to
41 running off the end of the vma).
42
43 Note, this algorithm will be slightly wasteful in the case where the source
44 and destination pointers are equally aligned, because the stlo/sthi pairs
45 could then be merged back into single stores. If there are a lot of cache
46 misses, this is probably offset by the stall lengths on the preloads.
47
48*/
49
50/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
51 * erratum. The first two prefetches are nop-ed out to avoid upsetting the
52 * instruction counts used in the jump address calculation.
53 * */
54
/* copy_user_memcpy: copy r4 bytes from the address in r3 to the address in r2. */
/* r2 is only ever used as a base address and is never written, so it still */
/* holds the destination pointer when the routine returns. */
/* The labels copy_user_memcpy and copy_user_memcpy_end are both exported. */
55 .section .text..SHmedia32,"ax"
56 .little
57 .balign 32
58 .global copy_user_memcpy
59 .global copy_user_memcpy_end
60copy_user_memcpy:
61
/* Unaligned access helpers: each expands to a "lo" access at offset O plus a */
/* "hi" access at the last byte of the item (O+7 for quadwords, O+3 for */
/* longwords). For loads the two partial results are OR-ed together by the */
/* caller. Only LDUAQ and LDUAL are used in the code visible in this file. */
62#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
63#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
64#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
65#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
66
/* Counts of 25 bytes or more (unsigned compare) branch to Large. */
/* The leading nop stands in for a removed prefetch (see the TAKum03020 note). */
67 nop ! ld.b r3,0,r63 ! TAKum03020
68 pta/l Large,tr0
69 movi 25,r0
70 bgeu/u r4,r0,tr0
/* Small counts use a computed branch into a table of 32-byte handler slots: */
/*   r0 = (L1 - L0) + (63 - nsb(r4)) * 32 + 1 */
/* nsb(r4) is 63 for a zero count, which is why the bias is 63*32 and why a */
/* zero count lands on L1. Each doubling of the count lowers nsb by one and */
/* therefore selects the next slot: 0, 1, 2-3, 4-7, 8-15, 16-24 bytes. */
/* The constant is expressed relative to L0 because ptrel at L0 adds r0 to */
/* its own address. NOTE(review): the +1 is presumably the SHmedia mode bit */
/* of the branch target - confirm against the SH-5 manual. */
71 nsb r4,r0
72 shlli r0,5,r0
73 movi (L1-L0+63*32 + 1) & 0xffff,r1
74 sub r1, r0, r0
75L0: ptrel r0,tr0
/* Values shared by all small handlers: */
/*   r5  = r2 + r4 (one past the last destination byte) */
/*   r6  = r3 + r4 (one past the last source byte) */
/*   tr1 = target loaded from r18; every handler finishes with blink tr1,r63. */
76 add r2,r4,r5
77 ptabs r18,tr1
78 add r3,r4,r6
79 blink tr0,r63
80
/* Small-size handlers. On entry to every slot: r2 = destination, r3 = source, */
/* r5 = destination end, r6 = source end, tr1 = exit target (set by the */
/* dispatch code before the computed branch). */
/* Starting at L1 the code is a jump table: the dispatcher scales its index by */
/* 32 bytes, and each slot below contains exactly eight instructions. Adding, */
/* removing or resizing an instruction between L1 and the 16..24 byte handler */
/* shifts every later slot and breaks the dispatch. */
81/* Rearranged to make cut2 safe */
82 .balign 8
/* L4_7 lies before L1, so it is not a table slot; it is reached through */
/* pta/blink from the 4..7 byte slot. It expects r0 = first four source bytes */
/* and r7/r6 = the lo/hi halves of the last four source bytes. It writes the */
/* first four bytes at r2 and the (possibly overlapping) last four at r5-4. */
83L4_7: /* 4..7 byte memcpy cntd. */
84 stlo.l r2, 0, r0
85 or r6, r7, r6
86 sthi.l r5, -1, r6
87 stlo.l r5, -4, r6
88 blink tr1,r63
89
90 .balign 8
/* Slot 0: nothing to copy. The four trailing nops pad the handler to six */
/* instructions so that, together with the two instructions of L2_3, the slot */
/* is exactly eight instructions long. */
91L1: /* 0 byte memcpy */
92 nop
93 blink tr1,r63
94 nop
95 nop
96 nop
97 nop
98
/* L2_3 fills the rest of slot 0. It is the tail of the 2-or-3 byte handler: */
/* it stores the last source byte (already loaded into r6) at r5-1. */
99L2_3: /* 2 or 3 byte memcpy cntd. */
100 st.b r5,-1,r6
101 blink tr1,r63
102
/* Slot 1: copy a single byte (three instructions; L8_15 fills the rest). */
103 /* 1 byte memcpy */
104 ld.b r3,0,r0
105 st.b r2,0,r0
106 blink tr1,r63
107
/* L8_15 fills the rest of slot 1. It is the tail of the 8..15 byte handler: */
/* r0 = first eight source bytes, r7/r6 = lo/hi halves of the last eight. */
/* The two eight-byte stores overlap when the count is below 16. */
108L8_15: /* 8..15 byte memcpy cntd. */
109 stlo.q r2, 0, r0
110 or r6, r7, r6
111 sthi.q r5, -1, r6
112 stlo.q r5, -8, r6
113 blink tr1,r63
114
/* Slot 2: bytes 0 and 1 are copied here; the last byte (source end - 1) is */
/* loaded into r6 and stored by L2_3. For a count of 2 that last byte is byte */
/* 1 again, so it is simply written twice. The nop replaces a removed */
/* prefetch so the slot keeps its eight-instruction length. */
115 /* 2 or 3 byte memcpy */
116 ld.b r3,0,r0
117 nop ! ld.b r2,0,r63 ! TAKum03020
118 ld.b r3,1,r1
119 st.b r2,0,r0
120 pta/l L2_3,tr0
121 ld.b r6,-1,r6
122 st.b r2,1,r1
123 blink tr0, r63
124
/* Slot 3: unaligned load of the first longword (r0|r1) and of the last */
/* longword (lo half in r7, hi half in r6); the stores are finished in L4_7. */
125 /* 4 .. 7 byte memcpy */
126 LDUAL (r3, 0, r0, r1)
127 pta L4_7, tr0
128 ldlo.l r6, -4, r7
129 or r0, r1, r0
130 sthi.l r2, 3, r0
131 ldhi.l r6, -1, r6
132 blink tr0, r63
133
/* Slot 4: same scheme as slot 3 with quadwords; finished in L8_15. */
134 /* 8 .. 15 byte memcpy */
135 LDUAQ (r3, 0, r0, r1)
136 pta L8_15, tr0
137 ldlo.q r6, -8, r7
138 or r0, r1, r0
139 sthi.q r2, 7, r0
140 ldhi.q r6, -1, r6
141 blink tr0, r63
142
/* Slot 5 (last slot, so its length is not constrained by a following slot): */
/* copies the first 16 bytes as two unaligned quadwords (r0 and r8) and the */
/* last eight bytes (r6|r7) to r5-8; the ranges overlap for counts below 24. */
/* Besides r0, r1, r6 and r7 this handler also writes r8 and r9. */
143 /* 16 .. 24 byte memcpy */
144 LDUAQ (r3, 0, r0, r1)
145 LDUAQ (r3, 8, r8, r9)
146 or r0, r1, r0
147 sthi.q r2, 7, r0
148 or r8, r9, r8
149 sthi.q r2, 15, r8
150 ldlo.q r6, -8, r7
151 ldhi.q r6, -1, r6
152 stlo.q r2, 8, r8
153 stlo.q r2, 0, r0
154 or r6, r7, r6
155 sthi.q r5, -1, r6
156 stlo.q r5, -8, r6
157 blink tr1,r63
158
/* Large: copy path for counts of 25 bytes or more (the entry code branches */
/* here when r4 >= 25). r2 = destination, r3 = source, r4 = byte count. */
/* Register roles established below: */
/*   r7  = r3 | -8, i.e. (r3 & 7) - 8, a value in the range -8..-1 */
/*   r22 = r2 - r7, the destination cursor, advanced by 8 - (r3 & 7) bytes */
/*   r6  = r3 - r2, so r22 + r6 is always an 8-byte-aligned source address */
/*   r5  = r2 + r4 - 16 (destination end minus 16) */
/*   r0  = the source quadword waiting to be stored at the cursor */
/* The first ldlo.q/stlo.q/sthi.q group writes eight bytes at the start of */
/* the destination from a lo-only load; the bytes from r22 onwards are */
/* written again by the stores that follow. */
159Large:
160 ! ld.b r2, 0, r63 ! TAKum03020
161 pta/l Loop_ua, tr1
162 ori r3, -8, r7
163 sub r2, r7, r22
164 sub r3, r2, r6
165 add r2, r4, r5
166 ldlo.q r3, 0, r0
167 addi r5, -16, r5
168 movi 64+8, r27 ! could subtract r7 from that.
169 stlo.q r2, 0, r0
170 sthi.q r2, 7, r0
171 ldx.q r22, r6, r0
/* Counts below 72 (64+8) skip the 32-byte loop and go straight to Loop_ua. */
172 bgtu/l r27, r4, tr1
173
/* Set-up for Loop_line: */
/*   r27 = r5 - 48 (destination end minus 64), the loop bound */
/*   r19/r20/r21 = r6 - 24/-16/-8, source offsets for the three extra loads */
/* NOTE(review): r36 is only read by the commented-out prefetch in Loop_line, */
/* so this write appears to be dead - confirm before removing. */
174 addi r5, -48, r27
175 pta/l Loop_line, tr0
176 addi r6, 64, r36
177 addi r6, -24, r19
178 addi r6, -16, r20
179 addi r6, -8, r21
180
/* Loop_line: moves 32 bytes per iteration. The cursor r22 is advanced first; */
/* r0 (loaded in the previous step) is stored at r22-32, and r23/r24/r25 are */
/* loaded from aligned source quadwords and stored at r22-24, r22-16 and */
/* r22-8 with unaligned lo/hi store pairs. r0 is then reloaded for the next */
/* step. alloco is followed by synco as described in the TAKum03020 note at */
/* the top of the file. The loop repeats while r27 >= r22 (unsigned). */
181Loop_line:
182 ! ldx.q r22, r36, r63 ! TAKum03020
183 alloco r22, 32
184 synco
185 addi r22, 32, r22
186 ldx.q r22, r19, r23
187 sthi.q r22, -25, r0
188 ldx.q r22, r20, r24
189 ldx.q r22, r21, r25
190 stlo.q r22, -32, r0
191 ldx.q r22, r6, r0
192 sthi.q r22, -17, r23
193 sthi.q r22, -9, r24
194 sthi.q r22, -1, r25
195 stlo.q r22, -24, r23
196 stlo.q r22, -16, r24
197 stlo.q r22, -8, r25
198 bgeu r27, r22, tr0
199
/* Loop_ua: moves 8 bytes per iteration: advance r22, store the pending */
/* quadword r0 at r22-8, load the next aligned source quadword into r0. */
/* tr1 still points at Loop_ua here (set at Large); repeats while r5 > r22. */
200Loop_ua:
201 addi r22, 8, r22
202 sthi.q r22, -1, r0
203 stlo.q r22, -8, r0
204 ldx.q r22, r6, r0
205 bgtu/l r5, r22, tr1
206
/* Tail: store the pending quadword r0 at r22, then copy the last eight */
/* source bytes (unaligned load ending at r3 + r4, combined in r1) to */
/* r5+8..r5+15, which is the last eight bytes of the destination since */
/* r5 = destination end - 16. tr1 is reloaded from r18 for the exit branch. */
207 add r3, r4, r7
208 ldlo.q r7, -8, r1
209 sthi.q r22, 7, r0
210 ldhi.q r7, -1, r7
211 ptabs r18,tr1
212 stlo.q r22, 0, r0
213 or r1, r7, r1
214 sthi.q r5, 15, r1
215 stlo.q r5, 8, r1
216 blink tr1, r63
/* Exported end marker for the routine. NOTE(review): presumably used by the */
/* user-copy fault fix-up code to recognise addresses inside this routine - */
/* confirm against the code that references copy_user_memcpy_end. */
217copy_user_memcpy_end:
218 nop