Blame - arch/arm64/lib/memcpy.S - SHIFTPHONES/mainline/linux

blob: 31073a8304fb6eae4a9217268cffbb7e42b727b4 [file] [log] [blame]

Thomas Gleixner	caab277	2019-06-03 07:44:50 +0200	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0-only */
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	2	/*
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	3	* Copyright (c) 2012-2020, Arm Limited.
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	4	*
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	5	* Adapted from the original at:
				6	* https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	7	*/
				8
				9	#include <linux/linkage.h>
				10	#include <asm/assembler.h>
				11
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	12	/* Assumptions:
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	13	*
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	14	* ARMv8-a, AArch64, unaligned accesses.
				15	*
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	16	*/
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	17
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	18	#define L(label) .L ## label /* L(x) pastes to .Lx, an assembler-local (non-exported) label */
Feng Kan	e5c88e3	2015-09-23 11:55:38 -0700	[diff] [blame]	19
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	20	#define dstin x0 /* destination pointer as passed in; used as a store base only */
				21	#define src x1 /* source pointer */
				22	#define count x2 /* number of bytes to copy */
				23	#define dst x3 /* 16-byte-aligned destination cursor used by the forward large-copy path */
				24	#define srcend x4 /* src + count */
				25	#define dstend x5 /* dstin + count */
				26	#define A_l x6 /* A..F: register pairs that carry 16 bytes of data per ldp/stp */
				27	#define A_lw w6 /* 32-bit view of A_l */
				28	#define A_h x7
				29	#define B_l x8
				30	#define B_lw w8 /* 32-bit view of B_l */
				31	#define B_h x9
				32	#define C_l x10
				33	#define C_lw w10 /* 32-bit view of C_l */
				34	#define C_h x11
				35	#define D_l x12
				36	#define D_h x13
				37	#define E_l x14 /* same register as tmp1 */
				38	#define E_h x15
				39	#define F_l x16
				40	#define F_h x17
				41	#define G_l count /* G and H reuse the length/pointer registers as data registers ... */
				42	#define G_h dst /* ... so loading G overwrites count and dst, */
				43	#define H_l src /* ... and loading H overwrites src and srcend */
				44	#define H_h srcend
				45	#define tmp1 x14 /* scratch; same register as E_l */
Feng Kan	e5c88e3	2015-09-23 11:55:38 -0700	[diff] [blame]	46
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	47	/* This implementation handles overlaps and supports both memcpy and memmove
				48	from a single entry point. It uses unaligned accesses and branchless
				49	sequences to keep the code small, simple and improve performance.
Feng Kan	e5c88e3	2015-09-23 11:55:38 -0700	[diff] [blame]	50
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	51	Copies are split into 3 main cases: small copies of up to 32 bytes, medium
				52	copies of up to 128 bytes, and large copies. The overhead of the overlap
				53	check is negligible since it is only required for large copies; small and medium copies load all of their source bytes before storing any.
Feng Kan	e5c88e3	2015-09-23 11:55:38 -0700	[diff] [blame]	54
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	55	Large copies use a software pipelined loop processing 64 bytes per iteration.
				56	The destination pointer is 16-byte aligned to minimize unaligned accesses.
				57	The loop tail is handled by always copying 64 bytes from the end.
				58	*/
Feng Kan	e5c88e3	2015-09-23 11:55:38 -0700	[diff] [blame]	59
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	60	SYM_FUNC_START_ALIAS(__memmove) /* all four symbols opened here start at the same instruction */
				61	SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
Mark Brown	3ac0f45	2020-01-06 19:58:17 +0000	[diff] [blame]	62	SYM_FUNC_START_ALIAS(__memcpy)
Fangrui Song	ec9d780	2020-10-29 11:19:51 -0700	[diff] [blame]	63	SYM_FUNC_START_WEAK_PI(memcpy)
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	64	add srcend, src, count /* srcend = src + count (one past the last source byte) */
				65	add dstend, dstin, count /* dstend = dstin + count */
				66	cmp count, 128
				67	b.hi L(copy_long) /* count > 128 (unsigned) */
				68	cmp count, 32
				69	b.hi L(copy32_128) /* 33 <= count <= 128 */
				70
				71	/* Small copies: 0..32 bytes. */
				72	cmp count, 16
				73	b.lo L(copy16) /* count < 16 */
				74	ldp A_l, A_h, [src] /* 16..32 bytes: first 16 and last 16 bytes; the two windows may overlap */
				75	ldp D_l, D_h, [srcend, -16]
				76	stp A_l, A_h, [dstin] /* both loads are issued before either store */
				77	stp D_l, D_h, [dstend, -16]
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	78	ret /* dstin (x0) is not written anywhere in this routine, so x0 still holds the destination */
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	79
				80	/* Copy 8-15 bytes. */
				81	L(copy16):
				82	tbz count, 3, L(copy8) /* count < 16 here, so bit 3 clear means count < 8 */
				83	ldr A_l, [src] /* first 8 bytes */
				84	ldr A_h, [srcend, -8] /* last 8 bytes; may overlap the first 8 */
				85	str A_l, [dstin]
				86	str A_h, [dstend, -8]
				87	ret
				88
				89	.p2align 3
				90	/* Copy 4-7 bytes. */
				91	L(copy8):
				92	tbz count, 2, L(copy4) /* count < 8 here, so bit 2 clear means count < 4 */
				93	ldr A_lw, [src] /* first 4 bytes */
				94	ldr B_lw, [srcend, -4] /* last 4 bytes; may overlap the first 4 */
				95	str A_lw, [dstin]
				96	str B_lw, [dstend, -4]
				97	ret
				98
				99	/* Copy 0..3 bytes using a branchless sequence. */
				100	L(copy4):
				101	cbz count, L(copy0) /* nothing to copy */
				102	lsr tmp1, count, 1 /* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3 */
				103	ldrb A_lw, [src] /* first byte */
				104	ldrb C_lw, [srcend, -1] /* last byte */
				105	ldrb B_lw, [src, tmp1] /* byte at index count / 2 */
				106	strb A_lw, [dstin] /* for count 1 or 2 some of these stores hit the same byte */
				107	strb B_lw, [dstin, tmp1]
				108	strb C_lw, [dstend, -1]
				109	L(copy0):
				110	ret
				111
				112	.p2align 4
				113	/* Medium copies: 33..128 bytes. */
				114	L(copy32_128):
				115	ldp A_l, A_h, [src] /* A, B = first 32 bytes */
				116	ldp B_l, B_h, [src, 16]
				117	ldp C_l, C_h, [srcend, -32] /* C, D = last 32 bytes */
				118	ldp D_l, D_h, [srcend, -16]
				119	cmp count, 64
				120	b.hi L(copy128) /* count > 64 */
				121	stp A_l, A_h, [dstin] /* 33..64 bytes: first 32 + last 32 bytes cover the whole range */
				122	stp B_l, B_h, [dstin, 16]
				123	stp C_l, C_h, [dstend, -32]
				124	stp D_l, D_h, [dstend, -16]
				125	ret
				126
				127	.p2align 4
				128	/* Copy 65..128 bytes. */
				129	L(copy128):
				130	ldp E_l, E_h, [src, 32] /* E, F = source bytes 32..63 */
				131	ldp F_l, F_h, [src, 48]
				132	cmp count, 96
				133	b.ls L(copy96) /* count <= 96: first 64 + last 32 bytes cover the whole range */
				134	ldp G_l, G_h, [srcend, -64] /* G overwrites count and dst, which are not read again on this path */
				135	ldp H_l, H_h, [srcend, -48] /* H overwrites src and srcend; this is the last load on this path */
				136	stp G_l, G_h, [dstend, -64]
				137	stp H_l, H_h, [dstend, -48]
				138	L(copy96):
				139	stp A_l, A_h, [dstin] /* store the first 64 bytes, then the last 32 */
				140	stp B_l, B_h, [dstin, 16]
				141	stp E_l, E_h, [dstin, 32]
				142	stp F_l, F_h, [dstin, 48]
				143	stp C_l, C_h, [dstend, -32]
				144	stp D_l, D_h, [dstend, -16]
				145	ret
				146
				147	.p2align 4
				148	/* Copy more than 128 bytes. */
				149	L(copy_long):
				150	/* Use backwards copy if there is an overlap. */
				151	sub tmp1, dstin, src /* tmp1 = dstin - src */
				152	cbz tmp1, L(copy0) /* source and destination are the same address: just return */
				153	cmp tmp1, count
				154	b.lo L(copy_long_backwards) /* unsigned (dstin - src) < count: dstin lies inside [src, src + count) */
				155
				156	/* Copy 16 bytes and then align dst to 16-byte alignment. */
				157
				158	ldp D_l, D_h, [src] /* first 16 source bytes */
				159	and tmp1, dstin, 15 /* tmp1 = dstin & 15 */
				160	bic dst, dstin, 15 /* dst = dstin rounded down to a 16-byte boundary */
				161	sub src, src, tmp1 /* move src back by the same amount so src and dst stay in step */
				162	add count, count, tmp1 /* Count is now 16 too large. */
				163	ldp A_l, A_h, [src, 16]
				164	stp D_l, D_h, [dstin] /* store the first 16 bytes at the original, possibly unaligned, start */
				165	ldp B_l, B_h, [src, 32]
				166	ldp C_l, C_h, [src, 48]
				167	ldp D_l, D_h, [src, 64]! /* pre-index writeback: src += 64 */
				168	subs count, count, 128 + 16 /* Test and readjust count. */
				169	b.ls L(copy64_from_end) /* adjusted count <= 128 + 16: skip the loop */
				170
				171	L(loop64): /* each iteration stores the 64 bytes held in A..D and loads the next 64 */
				172	stp A_l, A_h, [dst, 16]
				173	ldp A_l, A_h, [src, 16]
				174	stp B_l, B_h, [dst, 32]
				175	ldp B_l, B_h, [src, 32]
				176	stp C_l, C_h, [dst, 48]
				177	ldp C_l, C_h, [src, 48]
				178	stp D_l, D_h, [dst, 64]! /* pre-index writeback: dst += 64 */
				179	ldp D_l, D_h, [src, 64]! /* pre-index writeback: src += 64 */
				180	subs count, count, 64
				181	b.hi L(loop64)
				182
				183	/* Write the last iteration and copy 64 bytes from the end. */
				184	L(copy64_from_end):
				185	ldp E_l, E_h, [srcend, -64] /* E_l shares x14 with tmp1, which is no longer needed */
				186	stp A_l, A_h, [dst, 16]
				187	ldp A_l, A_h, [srcend, -48]
				188	stp B_l, B_h, [dst, 32]
				189	ldp B_l, B_h, [srcend, -32]
				190	stp C_l, C_h, [dst, 48]
				191	ldp C_l, C_h, [srcend, -16]
				192	stp D_l, D_h, [dst, 64]
				193	stp E_l, E_h, [dstend, -64] /* last 64 bytes, addressed relative to dstend */
				194	stp A_l, A_h, [dstend, -48]
				195	stp B_l, B_h, [dstend, -32]
				196	stp C_l, C_h, [dstend, -16]
				197	ret
				198
				199	.p2align 4
				200
				201	/* Large backwards copy for overlapping copies.
				202	Copy 16 bytes and then align dstend to 16-byte alignment. */
				203	L(copy_long_backwards):
				204	ldp D_l, D_h, [srcend, -16] /* last 16 source bytes */
				205	and tmp1, dstend, 15 /* tmp1 = dstend & 15 */
				206	sub srcend, srcend, tmp1 /* move srcend down by the same amount as dstend (adjusted below) */
				207	sub count, count, tmp1
				208	ldp A_l, A_h, [srcend, -16]
				209	stp D_l, D_h, [dstend, -16] /* store the last 16 bytes at the original, possibly unaligned, end */
				210	ldp B_l, B_h, [srcend, -32]
				211	ldp C_l, C_h, [srcend, -48]
				212	ldp D_l, D_h, [srcend, -64]! /* pre-index writeback: srcend -= 64 */
				213	sub dstend, dstend, tmp1 /* dstend is now 16-byte aligned */
				214	subs count, count, 128
				215	b.ls L(copy64_from_start) /* adjusted count <= 128: skip the loop */
				216
				217	L(loop64_backwards): /* mirror of loop64, walking down from the end */
				218	stp A_l, A_h, [dstend, -16]
				219	ldp A_l, A_h, [srcend, -16]
				220	stp B_l, B_h, [dstend, -32]
				221	ldp B_l, B_h, [srcend, -32]
				222	stp C_l, C_h, [dstend, -48]
				223	ldp C_l, C_h, [srcend, -48]
				224	stp D_l, D_h, [dstend, -64]! /* pre-index writeback: dstend -= 64 */
				225	ldp D_l, D_h, [srcend, -64]! /* pre-index writeback: srcend -= 64 */
				226	subs count, count, 64
				227	b.hi L(loop64_backwards)
				228
				229	/* Write the last iteration and copy 64 bytes from the start. */
				230	L(copy64_from_start):
				231	ldp G_l, G_h, [src, 48] /* G overwrites count and dst, which are not read again on this path */
				232	stp A_l, A_h, [dstend, -16]
				233	ldp A_l, A_h, [src, 32]
				234	stp B_l, B_h, [dstend, -32]
				235	ldp B_l, B_h, [src, 16]
				236	stp C_l, C_h, [dstend, -48]
				237	ldp C_l, C_h, [src]
				238	stp D_l, D_h, [dstend, -64]
				239	stp G_l, G_h, [dstin, 48] /* first 64 bytes, addressed relative to dstin */
				240	stp A_l, A_h, [dstin, 32]
				241	stp B_l, B_h, [dstin, 16]
				242	stp C_l, C_h, [dstin]
				243	ret
				244
Mark Brown	3ac0f45	2020-01-06 19:58:17 +0000	[diff] [blame]	245	SYM_FUNC_END_PI(memcpy)
Mark Rutland	ac0e8c7	2018-12-07 18:08:21 +0000	[diff] [blame]	246	EXPORT_SYMBOL(memcpy)
Mark Brown	3ac0f45	2020-01-06 19:58:17 +0000	[diff] [blame]	247	SYM_FUNC_END_ALIAS(__memcpy)
Mark Rutland	ac0e8c7	2018-12-07 18:08:21 +0000	[diff] [blame]	248	EXPORT_SYMBOL(__memcpy)
Robin Murphy	2851330	2021-05-27 16:34:46 +0100	[diff] [blame^]	249	SYM_FUNC_END_ALIAS_PI(memmove)
				250	EXPORT_SYMBOL(memmove)
				251	SYM_FUNC_END_ALIAS(__memmove)
				252	EXPORT_SYMBOL(__memmove)