Blame - arch/s390/crypto/crc32be-vx.S - SHIFTPHONES/android_kernel_shift_sdm845

blob: 8013989cd2e51d63a895a4d124d99a84952c09fa [file] [log] [blame]

Hendrik Brueckner	19c9378	2015-04-28 12:29:06 +0200	[diff] [blame]	1	/*
				2	* Hardware-accelerated CRC-32 variants for Linux on z Systems
				3	*
				4	* Use the z/Architecture Vector Extension Facility to accelerate the
				5	* computing of CRC-32 checksums.
				6	*
				7	* This CRC-32 implementation algorithm processes the most-significant
				8	* bit first (BE).
				9	*
				10	* Copyright IBM Corp. 2015
				11	* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
				12	*/
				13
				14	#include <linux/linkage.h>
				15	#include <asm/vx-insn.h>
				16
				17	/*
					* Vector register range containing CRC-32 constants.
					*
					* crc32_be_vgfm_16 fills all six registers with one VLM from
					* .Lconstants_CRC_32_BE, so the register numbers below must stay
					* consecutive and in the same order as the quadword pairs of that table:
					*
					* CONST_R1R2: R1, R2
					* CONST_R3R4: R3, R4
					* CONST_R5: R5, x^32
					* CONST_R6: R6, 1
					* CONST_RU_POLY: u, 0
					* CONST_CRC_POLY: P(x), 0
					*/
				18	#define CONST_R1R2 %v9
				19	#define CONST_R3R4 %v10
				20	#define CONST_R5 %v11
				21	#define CONST_R6 %v12
				22	#define CONST_RU_POLY %v13
				23	#define CONST_CRC_POLY %v14
				24
				25	.data
				26	.align 8
				27
				28	/*
				29	* The CRC-32 constant block contains reduction constants to fold and
				30	* process particular chunks of the input data stream in parallel.
				31	*
				32	* For the CRC-32 variants, the constants are precomputed according to
				33	* these definitions:
				34	*
				35	* R1 = x^(4*128+64) mod P(x)
				36	* R2 = x^(4*128) mod P(x)
				37	* R3 = x^(128+64) mod P(x)
				38	* R4 = x^128 mod P(x)
				39	* R5 = x^96 mod P(x)
				40	* R6 = x^64 mod P(x)
				41	*
				42	* The Barrett reduction constant, u, is defined as floor(x^64 / P(x)).
				43	*
				44	* where P(x) is the polynomial in the normal domain and P'(x) is the
				45	* polynomial in the reversed (bit-reflected) domain.
				46	*
				47	* Note that the constant definitions below are extended in order to compute
				48	* intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
				49	* The rightmost doubleword can be 0 to prevent contribution to the result or
				50	* can be multiplied by 1 to perform an XOR without the need for a separate
				51	* VECTOR EXCLUSIVE OR instruction.
				52	*
				53	* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
				54	*
				55	* P(x) = 0x04C11DB7
				56	* P'(x) = 0xEDB88320
					*
					* The P(x) entry stored in the table below is 0x104C11DB7, i.e. the
					* value above with the x^32 term included.
					*
					* Each line of the table is one 128-bit quadword pair; the pairs are
					* loaded in this order into CONST_R1R2..CONST_CRC_POLY by a single VLM.
					*
					* NOTE(review): the table is placed in the writable .data section although
					* the code in this file only reads it; a read-only section may be more
					* appropriate -- confirm before changing.
				57	*/
				58
				59	.Lconstants_CRC_32_BE:
				60	.quad 0x08833794c, 0x0e6228b11 # R1, R2
				61	.quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
				62	.quad 0x0f200aa66, 1 << 32 # R5, x32
				63	.quad 0x0490d678d, 1 # R6, 1
				64	.quad 0x104d101df, 0 # u
				65	.quad 0x104C11DB7, 0 # P(x)
				66
				67	.previous
				68
				69	.text
				70	/*
				71	* The CRC-32 function(s) use these calling conventions:
				72	*
				73	* Parameters:
				74	*
				75	* %r2: Initial CRC value, typically ~0; and final CRC (return) value.
				76	* %r3: Input buffer pointer, performance might be improved if the
				77	* buffer is on a doubleword boundary.
				78	* %r4: Length of the buffer, must be 64 bytes or greater.
					* The first 64 bytes are loaded unconditionally. After that only
					* whole 16-byte chunks are consumed; a trailing remainder of fewer
					* than 16 bytes is never loaded.
					* NOTE(review): presumably callers pass a multiple of 16 and handle
					* any tail themselves -- confirm against the caller.
				79	*
				80	* Register usage:
				81	*
				82	* %r5: CRC-32 constant pool base pointer.
				83	* V0: Initial CRC value (in the leftmost word, all other words zero).
				84	* V1..V4: Data for CRC computation.
					* V2 is reused for the 16-byte chunks and as the Barrett
					* reduction temporary that finally holds the result.
				85	* V5..V8: Next data chunks that are fetched from the input buffer.
				86	*
				87	* V9..V14: CRC-32 constants.
				88	*/
				89	ENTRY(crc32_be_vgfm_16)
				90	/* Load the six CRC-32 constant pairs into CONST_R1R2..CONST_CRC_POLY */
				91	larl %r5,.Lconstants_CRC_32_BE
				92	VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
				93
				94	/* Load the initial CRC value into the leftmost word of V0. */
				95	VZERO %v0
				96	VLVGF %v0,%r2,0
				97
				98	/* Load the first 64-byte data chunk and XOR the CRC into V1 */
				99	VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
				100	VX %v1,%v0,%v1 /* V1 ^= CRC */
				101	aghi %r3,64 /* BUF = BUF + 64 */
				102	aghi %r4,-64 /* LEN = LEN - 64 */
				103
				104	/* Skip the 64-byte folding loop if fewer than 64 bytes remain */
				105	cghi %r4,64
				106	jl .Lless_than_64bytes
				107
				108	.Lfold_64bytes_loop:
				109	/* Load the next 64-byte data chunk into V5 to V8 */
				110	VLM %v5,%v8,0,%r3
				111
				112	/*
				113	* Perform a GF(2) multiplication of the doublewords in V1 with
				114	* the reduction constants in CONST_R1R2. The intermediate result is
				115	* then folded (accumulated) with the next data chunk in V5 and
				116	* stored in V1. Repeat this step for the register contents
				117	* in V2, V3, and V4 respectively.
				118	*/
				119	VGFMAG %v1,CONST_R1R2,%v1,%v5
				120	VGFMAG %v2,CONST_R1R2,%v2,%v6
				121	VGFMAG %v3,CONST_R1R2,%v3,%v7
				122	VGFMAG %v4,CONST_R1R2,%v4,%v8
				123
				124	/* Adjust buffer pointer and length for next loop */
				125	aghi %r3,64 /* BUF = BUF + 64 */
				126	aghi %r4,-64 /* LEN = LEN - 64 */
				127
					/* Loop while at least another 64 bytes remain */
				128	cghi %r4,64
				129	jnl .Lfold_64bytes_loop
				130
				131	.Lless_than_64bytes:
				132	/* Fold V1 to V4 into a single 128-bit value in V1 */
				133	VGFMAG %v1,CONST_R3R4,%v1,%v2
				134	VGFMAG %v1,CONST_R3R4,%v1,%v3
				135	VGFMAG %v1,CONST_R3R4,%v1,%v4
				136
				137	/* Skip the 16-byte folding loop if fewer than 16 bytes remain */
				138	cghi %r4,16
				139	jl .Lfinal_fold
				140
				141	.Lfold_16bytes_loop:
				142
				143	VL %v2,0,,%r3 /* Load next data chunk */
				144	VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */
				145
				146	/* Adjust buffer pointer and size for folding next data chunk */
				147	aghi %r3,16
				148	aghi %r4,-16
				149
				150	/* Process remaining data chunks */
				151	cghi %r4,16
				152	jnl .Lfold_16bytes_loop
				153
				154	.Lfinal_fold:
				155	/*
				156	* The R5 constant is used to fold a 128-bit value into a 96-bit value
				157	* that is XORed with the next 96-bit input data chunk. To use a single
				158	* VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
				159	* form an intermediate 96-bit value (with appended zeros) which is then
				160	* XORed with the intermediate reduction result.
				161	*/
				162	VGFMG %v1,CONST_R5,%v1
				163
				164	/*
				165	* Further reduce the remaining 96-bit value to a 64-bit value using a
				166	* single VGFMG, the rightmost doubleword is multiplied with 0x1. The
				167	* intermediate result is then XORed with the product of the leftmost
				168	* doubleword with R6. The result is a 64-bit value and is subject to
				169	* the Barrett reduction.
				170	*/
				171	VGFMG %v1,CONST_R6,%v1
				172
				173	/*
				174	* The input values to the Barrett reduction are the degree-63 polynomial
				175	* in V1 (R(x)), degree-32 generator polynomial, and the reduction
				176	* constant u. The Barrett reduction result is the CRC value of R(x) mod
				177	* P(x).
				178	*
				179	* The Barrett reduction algorithm is defined as:
				180	*
				181	* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
				182	* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
				183	* 3. C(x) = R(x) XOR T2(x) mod x^32
				184	*
				185	* Note: To compensate the division by x^32, use the vector unpack
				186	* instruction to move the leftmost word into the leftmost doubleword
				187	* of the vector register. The rightmost doubleword is multiplied
				188	* with zero to not contribute to the intermediate results.
				189	*/
				190
				191	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
				192	VUPLLF %v2,%v1
				193	VGFMG %v2,CONST_RU_POLY,%v2
				194
				195	/*
				196	* Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
				197	* T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
				198	* The final result is in the rightmost word of V2.
				199	*/
				200	VUPLLF %v2,%v2
				201	VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
				202
				203	.Ldone:
					/* Return the rightmost word (word element 3) of V2 as the CRC in %r2 */
				204	VLGVF %r2,%v2,3
					/*
					* NOTE(review): this is a plain indirect branch through %r14; check
					* whether this tree requires an expoline-style return here -- confirm.
					*/
				205	br %r14
				206
				207	.previous