Blame - arch/sh/lib64/copy_user_memcpy.S - SHIFTPHONES/mainline/linux

blob: 515f81b002021cb1f7a69fc6b544dbc7015e0e50 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	! SPDX-License-Identifier: GPL-2.0
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2	!
				3	! Fast SH memcpy
				4	!
				5	! by Toshiyasu Morita (tm@netcom.com)
				6	! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
				7	! SH5 code Copyright 2002 SuperH Ltd.
				8	!
				9	! Entry: ARG0: destination pointer
				10	! ARG1: source pointer
				11	! ARG2: byte count
				12	!
				13	! Exit: RESULT: destination pointer
				14	! any other registers in the range r0-r7: trashed
				15	!
				16	! Notes: Usually one wants to do small reads and write a longword, but
				17	! unfortunately it is difficult in some cases to concatenate bytes
				18	! into a longword on the SH, so this does a longword read and small
				19	! writes.
				20	!
				21	! This implementation makes two assumptions about how it is called:
				22	!
				23	! 1.: If the byte count is nonzero, the address of the last byte to be
				24	! copied is unsigned greater than the address of the first byte to
				25	! be copied. This could be easily swapped for a signed comparison,
				26	! but the algorithm used needs some comparison.
				27	!
				28	! 2.: When there are two or three bytes in the last word of an 11-or-more
				29	! bytes memory chunk to be copied, the rest of the word can be read
				30	! without side effects.
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	31	! This could be easily changed by increasing the minimum size of
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	32	! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
				33	! however, this would cost a few extra cycles on average.
				34	! For SHmedia, the assumption is that any quadword can be read in its
				35	! entirety if at least one byte is included in the copy.
				36
				37	/* Imported into Linux kernel by Richard Curnow. This is used to implement the
				38	__copy_user function in the general case, so it has to be a distinct
				39	function from intra-kernel memcpy to allow for exception fix-ups in the
				40	event that the user pointer is bad somewhere in the copy (e.g. due to
				41	running off the end of the vma).
				42
				43	Note, this algorithm will be slightly wasteful in the case where the source
				44	and destination pointers are equally aligned, because the stlo/sthi pairs
				45	could then be merged back into single stores. If there are a lot of cache
				46	misses, this is probably offset by the stall lengths on the preloads.
				47
				48	*/
				49
				50	/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
				51	* erratum. The first two prefetches are nop-ed out to avoid upsetting the
				52	* instruction counts used in the jump address calculation.
				53	* */
				54
				55	.section .text..SHmedia32,"ax"
				56	.little
				57	.balign 32
				58	.global copy_user_memcpy
				59	.global copy_user_memcpy_end
					/*
					 * copy_user_memcpy: entry point and size dispatch.
					 *
					 * Register roles as used by the code below (they line up with
					 * ARG0/ARG1/ARG2 in the file header): r2 is the base of every
					 * store (destination), r3 the base of every load (source) and
					 * r4 the byte count.  r18 is moved into tr1 and used as the
					 * return target.
					 *
					 * Counts >= 25 branch to Large.  Smaller counts go through a
					 * computed branch: nsb(r4) is scaled by 32 and subtracted from
					 * (L1 - L0 + 63*32 + 1), so the target is L1 plus a multiple of
					 * 32 bytes.  A count of 0 selects L1 itself; the following
					 * slots serve 1, 2..3, 4..7, 8..15 and 16..24 bytes.  Every
					 * handler slot starting at L1 must therefore stay exactly eight
					 * instructions (32 bytes) long; this is why padding nops and
					 * the nop-ed out prefetches must not be removed.
					 */
				60	copy_user_memcpy:
				61
					/*
					 * Unaligned access helpers.  Each pairs a "lo" access at offset O
					 * with a "hi" access at the last byte of the item (O+7 for a
					 * quadword, O+3 for a longword).  For loads the two destination
					 * registers are OR-ed together by the caller.
					 */
				62	#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
				63	#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
				64	#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
				65	#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
				66
				67	nop ! ld.b r3,0,r63 ! TAKum03020
				68	pta/l Large,tr0 /* tr0 = Large */
				69	movi 25,r0
				70	bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): take the Large path */
				71	nsb r4,r0 /* r0 = nsb(count); 63 for a count of 0, per the 63*32 term below */
				72	shlli r0,5,r0 /* scale to 32-byte handler slots */
				73	movi (L1-L0+63*32 + 1) & 0xffff,r1 /* NOTE(review): the +1 presumably marks an SHmedia-mode branch target - confirm */
				74	sub r1, r0, r0 /* r0 = L1 - L0 + (63 - nsb) * 32 + 1 */
				75	L0: ptrel r0,tr0 /* tr0 = L0 + r0, i.e. the selected handler slot */
				76	add r2,r4,r5 /* r5 = destination end (r2 + count) */
				77	ptabs r18,tr1 /* tr1 = return address taken from r18 */
				78	add r3,r4,r6 /* r6 = source end (r3 + count) */
				79	blink tr0,r63 /* jump to the selected handler */
				80
				81	/* Rearranged to make cut2 safe */
				82	.balign 8
					/*
					 * Second half of the 4..7 byte copy; reached through tr0 from the
					 * "4 .. 7 byte memcpy" handler.  On entry r0 holds the merged first
					 * source longword, r7 and r6 hold the ldlo.l / ldhi.l halves of the
					 * last source longword, r5 is the destination end and tr1 the
					 * return target.
					 */
				83	L4_7: /* 4..7 byte memcpy cntd. */
				84	stlo.l r2, 0, r0 /* complete the first longword (its sthi.l half was issued before the jump here) */
				85	or r6, r7, r6 /* merge the two halves of the last source longword */
				86	sthi.l r5, -1, r6
				87	stlo.l r5, -4, r6 /* with the sthi.l above: write the 4 bytes ending at the destination end */
				88	blink tr1,r63 /* return */
				89
				90	.balign 8
					/*
					 * Handler table for counts 0..24, indexed by the computed branch at
					 * L0.  Slots are 32 bytes (eight instructions) apart starting at L1,
					 * in the order 0, 1, 2..3, 4..7, 8..15, 16..24 bytes.  Spare room in
					 * the short slots holds the continuations L2_3 and L8_15.  Do not
					 * add or remove instructions in the first five slots: doing so
					 * moves every later handler off its computed address.
					 *
					 * Set up by the entry code for all handlers: r5 = destination end,
					 * r6 = source end, tr1 = return target.
					 */
				91	L1: /* 0 byte memcpy */
				92	nop
				93	blink tr1,r63 /* nothing to copy: return */
				94	nop /* this nop and the next three are padding that, with L2_3, fills the slot */
				95	nop
				96	nop
				97	nop
				98
				99	L2_3: /* 2 or 3 byte memcpy cntd. */
				100	st.b r5,-1,r6 /* store the last byte (loaded into r6 by the 2..3 handler) */
				101	blink tr1,r63 /* return */
				102
				103	/* 1 byte memcpy */
				104	ld.b r3,0,r0
				105	st.b r2,0,r0
				106	blink tr1,r63 /* return */
				107
				108	L8_15: /* 8..15 byte memcpy cntd. */
				109	stlo.q r2, 0, r0 /* complete the first quadword (sthi.q half already issued) */
				110	or r6, r7, r6 /* merge the two halves of the last source quadword */
				111	sthi.q r5, -1, r6
				112	stlo.q r5, -8, r6 /* with the sthi.q above: write the 8 bytes ending at the destination end */
				113	blink tr1,r63 /* return */
				114
				115	/* 2 or 3 byte memcpy */
				116	ld.b r3,0,r0 /* first source byte */
				117	nop ! ld.b r2,0,r63 ! TAKum03020
				118	ld.b r3,1,r1 /* second source byte */
				119	st.b r2,0,r0
				120	pta/l L2_3,tr0
				121	ld.b r6,-1,r6 /* last source byte (same as the second byte when count is 2) */
				122	st.b r2,1,r1
				123	blink tr0, r63 /* continue at L2_3 */
				124
				125	/* 4 .. 7 byte memcpy */
				126	LDUAL (r3, 0, r0, r1) /* two halves of the first source longword */
				127	pta L4_7, tr0
				128	ldlo.l r6, -4, r7 /* lo half of the last source longword */
				129	or r0, r1, r0 /* r0 = first source longword */
				130	sthi.l r2, 3, r0
				131	ldhi.l r6, -1, r6 /* hi half of the last source longword */
				132	blink tr0, r63 /* continue at L4_7 */
				133
				134	/* 8 .. 15 byte memcpy */
				135	LDUAQ (r3, 0, r0, r1) /* two halves of the first source quadword */
				136	pta L8_15, tr0
				137	ldlo.q r6, -8, r7 /* lo half of the last source quadword */
				138	or r0, r1, r0 /* r0 = first source quadword */
				139	sthi.q r2, 7, r0
				140	ldhi.q r6, -1, r6 /* hi half of the last source quadword */
				141	blink tr0, r63 /* continue at L8_15 */
				142
				143	/* 16 .. 24 byte memcpy */
				144	LDUAQ (r3, 0, r0, r1) /* source bytes 0..7 */
				145	LDUAQ (r3, 8, r8, r9) /* source bytes 8..15 */
				146	or r0, r1, r0
				147	sthi.q r2, 7, r0
				148	or r8, r9, r8
				149	sthi.q r2, 15, r8
				150	ldlo.q r6, -8, r7 /* last 8 source bytes, lo half */
				151	ldhi.q r6, -1, r6 /* last 8 source bytes, hi half */
				152	stlo.q r2, 8, r8
				153	stlo.q r2, 0, r0
				154	or r6, r7, r6
				155	sthi.q r5, -1, r6
				156	stlo.q r5, -8, r6 /* with the sthi.q above: write the 8 bytes ending at the destination end */
				157	blink tr1,r63 /* return */
				158
					/*
					 * Large: set-up for copies of 25 bytes or more (branched to from
					 * the entry code).  Establishes:
					 *   r22 = r2 + 8 - (r3 & 7), the running destination pointer;
					 *   r6  = r3 - r2, so that r22 + r6 is an 8-byte aligned source
					 *         address and ldx.q r22, r6 reads whole source quadwords;
					 *   r5  = destination end - 16, the bound tested in Loop_ua;
					 *   r0  = the next source quadword to be stored.
					 * tr1 is re-pointed at Loop_ua here, overwriting the return
					 * target; the tail code after Loop_ua reloads it from r18.
					 */
				159	Large:
				160	! ld.b r2, 0, r63 ! TAKum03020
				161	pta/l Loop_ua, tr1
				162	ori r3, -8, r7 /* r7 = (r3 & 7) - 8, a value in -8..-1 */
				163	sub r2, r7, r22 /* r22 = r2 + 8 - (r3 & 7) */
				164	sub r3, r2, r6 /* r6 = source - destination */
				165	add r2, r4, r5 /* r5 = destination end */
				166	ldlo.q r3, 0, r0 /* lo part of the first (possibly unaligned) source quadword */
				167	addi r5, -16, r5 /* r5 = destination end - 16 */
				168	movi 64+8, r27 ! could subtract r7 from that.
				169	stlo.q r2, 0, r0
				170	sthi.q r2, 7, r0 /* NOTE(review): r0 only holds the ldlo.q part here; the bytes from r22 onward are presumably rewritten by the next store at r22 - 8 - confirm against the ldlo/sthi definitions */
				171	ldx.q r22, r6, r0 /* r0 = aligned source quadword at r22 + r6 */
				172	bgtu/l r27, r4, tr1 /* count < 72: skip the line loop, go straight to Loop_ua */
				173
				174	addi r5, -48, r27 /* r27 = destination end - 64, the Loop_line bound */
				175	pta/l Loop_line, tr0
				176	addi r6, 64, r36 /* r36 is only referenced by the commented-out prefetch in Loop_line */
				177	addi r6, -24, r19 /* r19/r20/r21: source offsets for the three extra quadwords of each Loop_line pass */
				178	addi r6, -16, r20
				179	addi r6, -8, r21
				180
					/*
					 * Loop_line: copies 32 bytes per iteration.  On entry r0 already
					 * holds the source quadword destined for the address in r22.  After
					 * r22 is advanced by 32, r0 is stored at r22 - 32 and three further
					 * source quadwords (r23, r24, r25) at r22 - 24, r22 - 16 and
					 * r22 - 8; each store is an sthi.q/stlo.q pair because the
					 * destination may be unaligned.  r0 is reloaded for the next pass.
					 * Repeats while r27 (destination end - 64) >= r22.
					 */
				181	Loop_line:
				182	! ldx.q r22, r36, r63 ! TAKum03020
				183	alloco r22, 32 /* NOTE(review): presumably allocates the destination cache line at r22 + 32 - confirm */
				184	synco /* guards the alloco (TAKum03020 note at the top of the file) */
				185	addi r22, 32, r22
				186	ldx.q r22, r19, r23 /* source quadword for r22 - 24 */
				187	sthi.q r22, -25, r0
				188	ldx.q r22, r20, r24 /* source quadword for r22 - 16 */
				189	ldx.q r22, r21, r25 /* source quadword for r22 - 8 */
				190	stlo.q r22, -32, r0
				191	ldx.q r22, r6, r0 /* source quadword for r22: first store of the next pass */
				192	sthi.q r22, -17, r23
				193	sthi.q r22, -9, r24
				194	sthi.q r22, -1, r25
				195	stlo.q r22, -24, r23
				196	stlo.q r22, -16, r24
				197	stlo.q r22, -8, r25
				198	bgeu r27, r22, tr0 /* loop while r27 >= r22 (tr0 = Loop_line) */
				199
					/*
					 * Loop_ua: copies 8 bytes per iteration.  r0 holds the source
					 * quadword for the address in r22; it is stored with an
					 * sthi.q/stlo.q pair after r22 is advanced, then r0 is reloaded from
					 * r22 + r6.  Repeats while r5 (destination end - 16) > r22; tr1
					 * points back at Loop_ua (set in Large).
					 *
					 * The code after the loop stores the pending r0 at r22, then copies
					 * the final 8 source bytes, read unaligned from source end - 8, to
					 * the 8 bytes ending at the destination end, and returns through
					 * r18.
					 */
				200	Loop_ua:
				201	addi r22, 8, r22
				202	sthi.q r22, -1, r0
				203	stlo.q r22, -8, r0
				204	ldx.q r22, r6, r0 /* next aligned source quadword */
				205	bgtu/l r5, r22, tr1 /* loop while r5 > r22 */
				206
				207	add r3, r4, r7 /* r7 = source end */
				208	ldlo.q r7, -8, r1 /* last 8 source bytes, lo half */
				209	sthi.q r22, 7, r0
				210	ldhi.q r7, -1, r7 /* last 8 source bytes, hi half */
				211	ptabs r18,tr1 /* restore the return target (tr1 was Loop_ua) */
				212	stlo.q r22, 0, r0
				213	or r1, r7, r1 /* r1 = last 8 source bytes */
				214	sthi.q r5, 15, r1 /* r5 + 15 = destination end - 1 */
				215	stlo.q r5, 8, r1 /* r5 + 8 = destination end - 8 */
				216	blink tr1, r63 /* return */
				217	copy_user_memcpy_end: /* end-of-function marker, exported via .global above */
				218	nop