Blame - lib/raid6/recov_neon_inner.c - SHIFTPHONES/kernel/shift/mainline

blob: 8cd20c9f834a1e8cc90e9a4f783f74a4244a5f7a [file] [log] [blame]

Ard Biesheuvel	6ec4e251	2017-07-13 18:16:01 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Intel Corporation
				3	* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License
				7	* as published by the Free Software Foundation; version 2
				8	* of the License.
				9	*/
				10
				11	#include <arm_neon.h>
				12
				13	static const uint8x16_t x0f = {
				14	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
				15	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
				16	};
				17
				18	#ifdef CONFIG_ARM
				19	/*
				20	* AArch32 does not provide this intrinsic natively because it does not
				21	* implement the underlying instruction. AArch32 only provides a 64-bit
				22	* wide vtbl.8 instruction, so use that instead.
				23	*/
				24	static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
				25	{
				26	union {
				27	uint8x16_t val;
				28	uint8x8x2_t pair;
				29	} __a = { a };
				30
				31	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
				32	vtbl2_u8(__a.pair, vget_high_u8(b)));
				33	}
				34	#endif
				35
				36	void __raid6_2data_recov_neon(int bytes, uint8_t p, uint8_t q, uint8_t *dp,
				37	uint8_t dq, const uint8_t pbmul,
				38	const uint8_t *qmul)
				39	{
				40	uint8x16_t pm0 = vld1q_u8(pbmul);
				41	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
				42	uint8x16_t qm0 = vld1q_u8(qmul);
				43	uint8x16_t qm1 = vld1q_u8(qmul + 16);
				44
				45	/*
				46	* while ( bytes-- ) {
				47	* uint8_t px, qx, db;
				48	*
				49	* px = p ^ dp;
				50	* qx = qmul[q ^ dq];
				51	* *dq++ = db = pbmul[px] ^ qx;
				52	* *dp++ = db ^ px;
				53	* p++; q++;
				54	* }
				55	*/
				56
				57	while (bytes) {
				58	uint8x16_t vx, vy, px, qx, db;
				59
				60	px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
				61	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				62
				63	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
				64	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				65	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
				66	qx = veorq_u8(vx, vy);
				67
				68	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
				69	vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
				70	vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
				71	vx = veorq_u8(vx, vy);
				72	db = veorq_u8(vx, qx);
				73
				74	vst1q_u8(dq, db);
				75	vst1q_u8(dp, veorq_u8(db, px));
				76
				77	bytes -= 16;
				78	p += 16;
				79	q += 16;
				80	dp += 16;
				81	dq += 16;
				82	}
				83	}
				84
				85	void __raid6_datap_recov_neon(int bytes, uint8_t p, uint8_t q, uint8_t *dq,
				86	const uint8_t *qmul)
				87	{
				88	uint8x16_t qm0 = vld1q_u8(qmul);
				89	uint8x16_t qm1 = vld1q_u8(qmul + 16);
				90
				91	/*
				92	* while (bytes--) {
				93	* p++ ^= dq = qmul[q ^ dq];
				94	* q++; dq++;
				95	* }
				96	*/
				97
				98	while (bytes) {
				99	uint8x16_t vx, vy;
				100
				101	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				102
				103	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
				104	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				105	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
				106	vx = veorq_u8(vx, vy);
				107	vy = veorq_u8(vx, vld1q_u8(p));
				108
				109	vst1q_u8(dq, vx);
				110	vst1q_u8(p, vy);
				111
				112	bytes -= 16;
				113	p += 16;
				114	q += 16;
				115	dq += 16;
				116	}
				117	}