| /* SPDX-License-Identifier: GPL-2.0 */ |
| /* |
| * Original implementation written by Andy Polyakov, @dot-asm. |
| * This is an adaptation of the original code for kernel use. |
| * |
| * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. |
| */ |
| |
| #include <linux/linkage.h> |
| #include <asm/nospec-insn.h> |
| #include <asm/vx-insn.h> |
| |
| /* |
| * SP is the stack pointer register. FRAME (16 * 8 + 4 * 8 = 160 bytes) is |
| * the amount by which chacha20_vx lowers SP for its own stack frame; |
| * chacha20_vx_4x does not move SP at all. |
| */ |
| #define SP %r15 |
| #define FRAME (16 * 8 + 4 * 8) |
| |
| /* |
| * Constant table shared by both routines. It is addressed by byte offset |
| * from .Lsigma, so the order and size of the rows must not change: |
| * 0x00 sigma row (state words 0-3) |
| * 0x10-0x3f counter increments 1, 2 and 3 (word 0 only) |
| * 0x40 VPERM mask that reverses the bytes of every 32-bit word |
| * 0x50 per-lane counter offsets 0,1,2,3 for the 4x routine |
| * 0x60-0x9f each sigma word replicated into all four lanes |
| */ |
| .data |
| .align 32 |
| |
| .Lsigma: |
| .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral |
| .long 1,0,0,0 # 0x10: counter + 1 |
| .long 2,0,0,0 # 0x20: counter + 2 |
| .long 3,0,0,0 # 0x30: counter + 3 |
| .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap |
| |
| .long 0,1,2,3 # 0x50: lane n gets counter + n |
| .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma |
| .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e |
| .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 |
| .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 |
| |
| .previous |
| |
| /* |
| * NOTE(review): GEN_BR_THUNK is not defined in this file; presumably it |
| * comes from <asm/nospec-insn.h> and emits the thunk that the BR_EX %r14 |
| * returns below branch through - confirm against that header. |
| */ |
| GEN_BR_THUNK %r14 |
| |
| .text |
| |
| ############################################################################# |
| # void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len, |
| # const u32 *key, const u32 *counter) |
| |
| /* |
| * Register map for chacha20_vx_4x. |
| * |
| * General registers carry the five arguments. The vector registers are |
| * listed below in ascending register number. |
| */ |
| |
| /* arguments */ |
| #define OUT %r2 |
| #define INP %r3 |
| #define LEN %r4 |
| #define KEY %r5 |
| #define COUNTER %r6 |
| |
| /* %v0-%v3: state words 0-3, one word per register, one block per lane */ |
| #define XA0 %v0 |
| #define XA1 %v1 |
| #define XA2 %v2 |
| #define XA3 %v3 |
| |
| /* %v4-%v7: state words 4-7 */ |
| #define XB0 %v4 |
| #define XB1 %v5 |
| #define XB2 %v6 |
| #define XB3 %v7 |
| |
| /* %v8-%v11: state words 8-11 */ |
| #define XC0 %v8 |
| #define XC1 %v9 |
| #define XC2 %v10 |
| #define XC3 %v11 |
| |
| /* %v12-%v15: state words 12-15 */ |
| #define XD0 %v12 |
| #define XD1 %v13 |
| #define XD2 %v14 |
| #define XD3 %v15 |
| |
| /* %v16-%v19: unmodified input rows (sigma, key low, key high, counter) */ |
| #define K0 %v16 |
| #define K1 %v17 |
| #define K2 %v18 |
| #define K3 %v19 |
| |
| /* %v26: per-lane counter offsets loaded from .Lsigma + 0x50 */ |
| #define CTR %v26 |
| |
| /* %v27-%v30: scratch for the transpose and for input/output data */ |
| #define XT0 %v27 |
| #define XT1 %v28 |
| #define XT2 %v29 |
| #define XT3 %v30 |
| |
| /* %v31: byte-swap permute mask loaded from .Lsigma + 0x40 */ |
| #define BEPERM %v31 |
| |
| /* |
| * chacha20_vx_4x - XOR up to four 64-byte ChaCha20 keystream blocks into |
| * a buffer, computing all four blocks in parallel. |
| * |
| * In: OUT destination buffer |
| * INP source buffer |
| * LEN number of bytes, 1..256 (LEN == 0 is NOT supported: the |
| * byte-wise tail loop always handles at least one byte) |
| * KEY 32 bytes of key material (state words 4-11) |
| * COUNTER 16 bytes giving state words 12-15; word 12 is the block |
| * counter and lane n uses counter + n |
| * |
| * Every vector register holds ONE state word for FOUR blocks (lane n is |
| * block n) while the rounds run; the state is transposed afterwards so |
| * that XA0/XB0/XC0/XD0 hold block 0, XA1/XB1/XC1/XD1 block 1, and so on. |
| * |
| * %r6/%r7 are saved at 6*8(SP) and restored on exit; %r0, %r1 and %r5 are |
| * overwritten. SP is not moved. A trailing partial block is staged in the |
| * 64 bytes at 8*8(SP). |
| * NOTE(review): this assumes the caller-provided save area at 8*8(SP) is |
| * free for the callee to use - confirm against the ABI. |
| */ |
| ENTRY(chacha20_vx_4x) |
| stmg %r6,%r7,6*8(SP) |
| |
| larl %r7,.Lsigma |
| lhi %r0,10 # 10 double rounds |
| lhi %r1,0 # set again before its only use in the tail |
| |
| VL K0,0,,%r7 # load sigma |
| VL K1,0,,KEY # load key |
| VL K2,16,,KEY |
| VL K3,0,,COUNTER # load counter |
| |
| VL BEPERM,0x40,,%r7 |
| VL CTR,0x50,,%r7 |
| |
| VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma |
| |
| VREPF XB0,K1,0 # smash the key |
| VREPF XB1,K1,1 |
| VREPF XB2,K1,2 |
| VREPF XB3,K1,3 |
| |
| VREPF XD0,K3,0 |
| VREPF XD1,K3,1 |
| VREPF XD2,K3,2 |
| VREPF XD3,K3,3 |
| VAF XD0,XD0,CTR # lane n: counter + n |
| |
| VREPF XC0,K2,0 |
| VREPF XC1,K2,1 |
| VREPF XC2,K2,2 |
| VREPF XC3,K2,3 |
| |
| /* |
| * One iteration is a double round: four column quarter-rounds on |
| * (XAi,XBi,XCi,XDi) followed by four diagonal quarter-rounds on |
| * (XA0,XB1,XC2,XD3), (XA1,XB2,XC3,XD0), (XA2,XB3,XC0,XD1) and |
| * (XA3,XB0,XC1,XD2). Each quarter-round uses rotates by 16, 12, 8 and 7. |
| */ |
| .Loop_4x: |
| VAF XA0,XA0,XB0 |
| VX XD0,XD0,XA0 |
| VERLLF XD0,XD0,16 |
| |
| VAF XA1,XA1,XB1 |
| VX XD1,XD1,XA1 |
| VERLLF XD1,XD1,16 |
| |
| VAF XA2,XA2,XB2 |
| VX XD2,XD2,XA2 |
| VERLLF XD2,XD2,16 |
| |
| VAF XA3,XA3,XB3 |
| VX XD3,XD3,XA3 |
| VERLLF XD3,XD3,16 |
| |
| VAF XC0,XC0,XD0 |
| VX XB0,XB0,XC0 |
| VERLLF XB0,XB0,12 |
| |
| VAF XC1,XC1,XD1 |
| VX XB1,XB1,XC1 |
| VERLLF XB1,XB1,12 |
| |
| VAF XC2,XC2,XD2 |
| VX XB2,XB2,XC2 |
| VERLLF XB2,XB2,12 |
| |
| VAF XC3,XC3,XD3 |
| VX XB3,XB3,XC3 |
| VERLLF XB3,XB3,12 |
| |
| VAF XA0,XA0,XB0 |
| VX XD0,XD0,XA0 |
| VERLLF XD0,XD0,8 |
| |
| VAF XA1,XA1,XB1 |
| VX XD1,XD1,XA1 |
| VERLLF XD1,XD1,8 |
| |
| VAF XA2,XA2,XB2 |
| VX XD2,XD2,XA2 |
| VERLLF XD2,XD2,8 |
| |
| VAF XA3,XA3,XB3 |
| VX XD3,XD3,XA3 |
| VERLLF XD3,XD3,8 |
| |
| VAF XC0,XC0,XD0 |
| VX XB0,XB0,XC0 |
| VERLLF XB0,XB0,7 |
| |
| VAF XC1,XC1,XD1 |
| VX XB1,XB1,XC1 |
| VERLLF XB1,XB1,7 |
| |
| VAF XC2,XC2,XD2 |
| VX XB2,XB2,XC2 |
| VERLLF XB2,XB2,7 |
| |
| VAF XC3,XC3,XD3 |
| VX XB3,XB3,XC3 |
| VERLLF XB3,XB3,7 |
| |
| /* diagonal quarter-rounds */ |
| VAF XA0,XA0,XB1 |
| VX XD3,XD3,XA0 |
| VERLLF XD3,XD3,16 |
| |
| VAF XA1,XA1,XB2 |
| VX XD0,XD0,XA1 |
| VERLLF XD0,XD0,16 |
| |
| VAF XA2,XA2,XB3 |
| VX XD1,XD1,XA2 |
| VERLLF XD1,XD1,16 |
| |
| VAF XA3,XA3,XB0 |
| VX XD2,XD2,XA3 |
| VERLLF XD2,XD2,16 |
| |
| VAF XC2,XC2,XD3 |
| VX XB1,XB1,XC2 |
| VERLLF XB1,XB1,12 |
| |
| VAF XC3,XC3,XD0 |
| VX XB2,XB2,XC3 |
| VERLLF XB2,XB2,12 |
| |
| VAF XC0,XC0,XD1 |
| VX XB3,XB3,XC0 |
| VERLLF XB3,XB3,12 |
| |
| VAF XC1,XC1,XD2 |
| VX XB0,XB0,XC1 |
| VERLLF XB0,XB0,12 |
| |
| VAF XA0,XA0,XB1 |
| VX XD3,XD3,XA0 |
| VERLLF XD3,XD3,8 |
| |
| VAF XA1,XA1,XB2 |
| VX XD0,XD0,XA1 |
| VERLLF XD0,XD0,8 |
| |
| VAF XA2,XA2,XB3 |
| VX XD1,XD1,XA2 |
| VERLLF XD1,XD1,8 |
| |
| VAF XA3,XA3,XB0 |
| VX XD2,XD2,XA3 |
| VERLLF XD2,XD2,8 |
| |
| VAF XC2,XC2,XD3 |
| VX XB1,XB1,XC2 |
| VERLLF XB1,XB1,7 |
| |
| VAF XC3,XC3,XD0 |
| VX XB2,XB2,XC3 |
| VERLLF XB2,XB2,7 |
| |
| VAF XC0,XC0,XD1 |
| VX XB3,XB3,XC0 |
| VERLLF XB3,XB3,7 |
| |
| VAF XC1,XC1,XD2 |
| VX XB0,XB0,XC1 |
| VERLLF XB0,XB0,7 |
| brct %r0,.Loop_4x |
| |
| VAF XD0,XD0,CTR # per-lane part of the final state add |
| |
| VMRHF XT0,XA0,XA1 # transpose data |
| VMRHF XT1,XA2,XA3 |
| VMRLF XT2,XA0,XA1 |
| VMRLF XT3,XA2,XA3 |
| VPDI XA0,XT0,XT1,0b0000 |
| VPDI XA1,XT0,XT1,0b0101 |
| VPDI XA2,XT2,XT3,0b0000 |
| VPDI XA3,XT2,XT3,0b0101 |
| |
| VMRHF XT0,XB0,XB1 |
| VMRHF XT1,XB2,XB3 |
| VMRLF XT2,XB0,XB1 |
| VMRLF XT3,XB2,XB3 |
| VPDI XB0,XT0,XT1,0b0000 |
| VPDI XB1,XT0,XT1,0b0101 |
| VPDI XB2,XT2,XT3,0b0000 |
| VPDI XB3,XT2,XT3,0b0101 |
| |
| VMRHF XT0,XC0,XC1 |
| VMRHF XT1,XC2,XC3 |
| VMRLF XT2,XC0,XC1 |
| VMRLF XT3,XC2,XC3 |
| VPDI XC0,XT0,XT1,0b0000 |
| VPDI XC1,XT0,XT1,0b0101 |
| VPDI XC2,XT2,XT3,0b0000 |
| VPDI XC3,XT2,XT3,0b0101 |
| |
| VMRHF XT0,XD0,XD1 |
| VMRHF XT1,XD2,XD3 |
| VMRLF XT2,XD0,XD1 |
| VMRLF XT3,XD2,XD3 |
| VPDI XD0,XT0,XT1,0b0000 |
| VPDI XD1,XT0,XT1,0b0101 |
| VPDI XD2,XT2,XT3,0b0000 |
| VPDI XD3,XT2,XT3,0b0101 |
| |
| /* |
| * Block 0: add the input rows, byte-swap each word, then XOR 64 bytes. |
| * XA0/XB0/XC0/XD0 always hold the keystream block about to be used, |
| * which is what .Ltail_4x expects. |
| */ |
| VAF XA0,XA0,K0 |
| VAF XB0,XB0,K1 |
| VAF XC0,XC0,K2 |
| VAF XD0,XD0,K3 |
| |
| VPERM XA0,XA0,XA0,BEPERM |
| VPERM XB0,XB0,XB0,BEPERM |
| VPERM XC0,XC0,XC0,BEPERM |
| VPERM XD0,XD0,XD0,BEPERM |
| |
| /* fewer than 64 bytes in total: finish byte-wise, like blocks 1-3 */ |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_4x |
| |
| VLM XT0,XT3,0,INP,0 |
| |
| VX XT0,XT0,XA0 |
| VX XT1,XT1,XB0 |
| VX XT2,XT2,XC0 |
| VX XT3,XT3,XD0 |
| |
| VSTM XT0,XT3,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| /* |
| * LEN was exactly 64: stop here. Without this check the tail loop |
| * would be entered with LEN == 0 and brct would wrap around. |
| */ |
| je .Ldone_4x |
| |
| /* block 1 */ |
| VAF XA0,XA1,K0 |
| VAF XB0,XB1,K1 |
| VAF XC0,XC1,K2 |
| VAF XD0,XD1,K3 |
| |
| VPERM XA0,XA0,XA0,BEPERM |
| VPERM XB0,XB0,XB0,BEPERM |
| VPERM XC0,XC0,XC0,BEPERM |
| VPERM XD0,XD0,XD0,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_4x |
| |
| VLM XT0,XT3,0,INP,0 |
| |
| VX XT0,XT0,XA0 |
| VX XT1,XT1,XB0 |
| VX XT2,XT2,XC0 |
| VX XT3,XT3,XD0 |
| |
| VSTM XT0,XT3,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_4x |
| |
| /* block 2 */ |
| VAF XA0,XA2,K0 |
| VAF XB0,XB2,K1 |
| VAF XC0,XC2,K2 |
| VAF XD0,XD2,K3 |
| |
| VPERM XA0,XA0,XA0,BEPERM |
| VPERM XB0,XB0,XB0,BEPERM |
| VPERM XC0,XC0,XC0,BEPERM |
| VPERM XD0,XD0,XD0,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_4x |
| |
| VLM XT0,XT3,0,INP,0 |
| |
| VX XT0,XT0,XA0 |
| VX XT1,XT1,XB0 |
| VX XT2,XT2,XC0 |
| VX XT3,XT3,XD0 |
| |
| VSTM XT0,XT3,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_4x |
| |
| /* block 3: the last one, anything beyond it is not processed */ |
| VAF XA0,XA3,K0 |
| VAF XB0,XB3,K1 |
| VAF XC0,XC3,K2 |
| VAF XD0,XD3,K3 |
| |
| VPERM XA0,XA0,XA0,BEPERM |
| VPERM XB0,XB0,XB0,BEPERM |
| VPERM XC0,XC0,XC0,BEPERM |
| VPERM XD0,XD0,XD0,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_4x |
| |
| VLM XT0,XT3,0,INP,0 |
| |
| VX XT0,XT0,XA0 |
| VX XT1,XT1,XB0 |
| VX XT2,XT2,XC0 |
| VX XT3,XT3,XD0 |
| |
| VSTM XT0,XT3,0,OUT,0 |
| |
| .Ldone_4x: |
| lmg %r6,%r7,6*8(SP) |
| BR_EX %r14 |
| |
| /* |
| * Partial last block (1..63 bytes left): spill the current keystream |
| * block to 8*8(SP) and XOR it into the data one byte at a time. |
| * XC0/XD0 are copied to XT0/XT1 first; the four VSTs then write the |
| * block in A, B, C, D order. |
| */ |
| .Ltail_4x: |
| VLR XT0,XC0 |
| VLR XT1,XD0 |
| |
| VST XA0,8*8+0x00,,SP |
| VST XB0,8*8+0x10,,SP |
| VST XT0,8*8+0x20,,SP |
| VST XT1,8*8+0x30,,SP |
| |
| lghi %r1,0 # byte index |
| |
| .Loop_tail_4x: |
| llgc %r5,0(%r1,INP) # KEY and COUNTER are dead here |
| llgc %r6,8*8(%r1,SP) |
| xr %r6,%r5 |
| stc %r6,0(%r1,OUT) |
| la %r1,1(%r1) |
| brct LEN,.Loop_tail_4x |
| |
| lmg %r6,%r7,6*8(SP) |
| BR_EX %r14 |
| ENDPROC(chacha20_vx_4x) |
| |
| /* |
| * Drop every register alias that belongs to chacha20_vx_4x so that none |
| * of them leaks into the definitions used by the next routine. |
| */ |
| #undef OUT |
| #undef INP |
| #undef LEN |
| #undef KEY |
| #undef COUNTER |
| |
| #undef BEPERM |
| #undef CTR |
| |
| #undef K0 |
| #undef K1 |
| #undef K2 |
| #undef K3 |
| |
| #undef XA0 |
| #undef XA1 |
| #undef XA2 |
| #undef XA3 |
| |
| #undef XB0 |
| #undef XB1 |
| #undef XB2 |
| #undef XB3 |
| |
| #undef XC0 |
| #undef XC1 |
| #undef XC2 |
| #undef XC3 |
| |
| #undef XD0 |
| #undef XD1 |
| #undef XD2 |
| #undef XD3 |
| |
| #undef XT0 |
| #undef XT1 |
| #undef XT2 |
| #undef XT3 |
| |
| |
| ############################################################################# |
| # void chacha20_vx(u8 *out, const u8 *inp, size_t len, |
| # const u32 *key, const u32 *counter) |
| |
| /* |
| * Register map for chacha20_vx. |
| * |
| * Six blocks are processed per outer iteration. Block n keeps its four |
| * 16-byte state rows in An, Bn, Cn and Dn, i.e. in %v(4n) .. %v(4n+3). |
| */ |
| |
| /* arguments */ |
| #define OUT %r2 |
| #define INP %r3 |
| #define LEN %r4 |
| #define KEY %r5 |
| #define COUNTER %r6 |
| |
| /* row A (state words 0-3) of blocks 0-5 */ |
| #define A0 %v0 |
| #define A1 %v4 |
| #define A2 %v8 |
| #define A3 %v12 |
| #define A4 %v16 |
| #define A5 %v20 |
| |
| /* row B (state words 4-7) of blocks 0-5 */ |
| #define B0 %v1 |
| #define B1 %v5 |
| #define B2 %v9 |
| #define B3 %v13 |
| #define B4 %v17 |
| #define B5 %v21 |
| |
| /* row C (state words 8-11) of blocks 0-5 */ |
| #define C0 %v2 |
| #define C1 %v6 |
| #define C2 %v10 |
| #define C3 %v14 |
| #define C4 %v18 |
| #define C5 %v22 |
| |
| /* row D (state words 12-15) of blocks 0-5 */ |
| #define D0 %v3 |
| #define D1 %v7 |
| #define D2 %v11 |
| #define D3 %v15 |
| #define D4 %v19 |
| #define D5 %v23 |
| |
| /* key rows and the running counter row */ |
| #define K1 %v24 |
| #define K2 %v25 |
| #define K3 %v26 |
| |
| /* |
| * %v27-%v31 are filled by one VLM from .Lsigma. K0 and T0 name the |
| * same register (%v27): using T0 as scratch destroys the sigma row. |
| */ |
| #define K0 %v27 |
| #define T0 %v27 |
| #define T1 %v28 |
| #define T2 %v29 |
| #define T3 %v30 |
| #define BEPERM %v31 |
| |
| /* |
| * chacha20_vx - XOR a ChaCha20 keystream into a buffer, computing six |
| * 64-byte blocks per outer iteration. |
| * |
| * In: OUT destination, INP source, LEN byte count, KEY 32 bytes of key |
| * material, COUNTER 16 bytes giving state words 12-15 (word 12 is |
| * the block counter). |
| * |
| * Requests of at most 256 bytes are handed to chacha20_vx_4x by a direct |
| * branch before anything is saved. Otherwise a FRAME-byte stack frame is |
| * set up, %r6/%r7 are saved and restored, and %r0, %r1, %r5 are |
| * overwritten. A trailing partial block is staged at 8*8(SP) inside the |
| * new frame. |
| * |
| * Each block lives in four registers holding its rows A, B, C, D, so a |
| * quarter-round works on all four columns of a block at once and the |
| * diagonal round is obtained by rotating rows B, C and D. |
| */ |
| ENTRY(chacha20_vx) |
| .insn rilu,0xc20e00000000,LEN,256 # clgfi LEN,256 |
| jle chacha20_vx_4x |
| stmg %r6,%r7,6*8(SP) |
| |
| lghi %r1,-FRAME |
| lgr %r0,SP |
| la SP,0(%r1,SP) |
| stg %r0,0(SP) # back-chain |
| |
| larl %r7,.Lsigma |
| lhi %r0,10 |
| |
| VLM K1,K2,0,KEY,0 # load key |
| VL K3,0,,COUNTER # load counter |
| |
| VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... |
| |
| /* |
| * Outer loop: set up six blocks. On entry T1..T3 hold the increments |
| * 1, 2 and 3 and K3 holds the counter row of the first of the six blocks. |
| */ |
| .Loop_outer_vx: |
| VLR A0,K0 |
| VLR B0,K1 |
| VLR A1,K0 |
| VLR B1,K1 |
| VLR A2,K0 |
| VLR B2,K1 |
| VLR A3,K0 |
| VLR B3,K1 |
| VLR A4,K0 |
| VLR B4,K1 |
| VLR A5,K0 |
| VLR B5,K1 |
| |
| VLR D0,K3 |
| VAF D1,K3,T1 # K[3]+1 |
| VAF D2,K3,T2 # K[3]+2 |
| VAF D3,K3,T3 # K[3]+3 |
| VAF D4,D2,T2 # K[3]+4 |
| VAF D5,D2,T3 # K[3]+5 |
| |
| VLR C0,K2 |
| VLR C1,K2 |
| VLR C2,K2 |
| VLR C3,K2 |
| VLR C4,K2 |
| VLR C5,K2 |
| |
| /* |
| * From here on T1..T3 no longer hold the increments: they keep the |
| * initial D rows of blocks 1-3 for the final state addition. |
| */ |
| VLR T1,D1 |
| VLR T2,D2 |
| VLR T3,D3 |
| |
| /* |
| * Inner loop: one double round per iteration, executed 10 times (%r0). |
| * The six blocks are interleaved instruction by instruction. |
| */ |
| .Loop_vx: |
| /* column round: a += b; d ^= a; d <<<= 16 */ |
| VAF A0,A0,B0 |
| VAF A1,A1,B1 |
| VAF A2,A2,B2 |
| VAF A3,A3,B3 |
| VAF A4,A4,B4 |
| VAF A5,A5,B5 |
| VX D0,D0,A0 |
| VX D1,D1,A1 |
| VX D2,D2,A2 |
| VX D3,D3,A3 |
| VX D4,D4,A4 |
| VX D5,D5,A5 |
| VERLLF D0,D0,16 |
| VERLLF D1,D1,16 |
| VERLLF D2,D2,16 |
| VERLLF D3,D3,16 |
| VERLLF D4,D4,16 |
| VERLLF D5,D5,16 |
| |
| /* c += d; b ^= c; b <<<= 12 */ |
| VAF C0,C0,D0 |
| VAF C1,C1,D1 |
| VAF C2,C2,D2 |
| VAF C3,C3,D3 |
| VAF C4,C4,D4 |
| VAF C5,C5,D5 |
| VX B0,B0,C0 |
| VX B1,B1,C1 |
| VX B2,B2,C2 |
| VX B3,B3,C3 |
| VX B4,B4,C4 |
| VX B5,B5,C5 |
| VERLLF B0,B0,12 |
| VERLLF B1,B1,12 |
| VERLLF B2,B2,12 |
| VERLLF B3,B3,12 |
| VERLLF B4,B4,12 |
| VERLLF B5,B5,12 |
| |
| /* a += b; d ^= a; d <<<= 8 */ |
| VAF A0,A0,B0 |
| VAF A1,A1,B1 |
| VAF A2,A2,B2 |
| VAF A3,A3,B3 |
| VAF A4,A4,B4 |
| VAF A5,A5,B5 |
| VX D0,D0,A0 |
| VX D1,D1,A1 |
| VX D2,D2,A2 |
| VX D3,D3,A3 |
| VX D4,D4,A4 |
| VX D5,D5,A5 |
| VERLLF D0,D0,8 |
| VERLLF D1,D1,8 |
| VERLLF D2,D2,8 |
| VERLLF D3,D3,8 |
| VERLLF D4,D4,8 |
| VERLLF D5,D5,8 |
| |
| /* c += d; b ^= c; b <<<= 7 */ |
| VAF C0,C0,D0 |
| VAF C1,C1,D1 |
| VAF C2,C2,D2 |
| VAF C3,C3,D3 |
| VAF C4,C4,D4 |
| VAF C5,C5,D5 |
| VX B0,B0,C0 |
| VX B1,B1,C1 |
| VX B2,B2,C2 |
| VX B3,B3,C3 |
| VX B4,B4,C4 |
| VX B5,B5,C5 |
| VERLLF B0,B0,7 |
| VERLLF B1,B1,7 |
| VERLLF B2,B2,7 |
| VERLLF B3,B3,7 |
| VERLLF B4,B4,7 |
| VERLLF B5,B5,7 |
| |
| /* |
| * Rotate row C by 8 bytes, row B by 4 and row D by 12 so that the |
| * next four steps operate on the diagonals. |
| */ |
| VSLDB C0,C0,C0,8 |
| VSLDB C1,C1,C1,8 |
| VSLDB C2,C2,C2,8 |
| VSLDB C3,C3,C3,8 |
| VSLDB C4,C4,C4,8 |
| VSLDB C5,C5,C5,8 |
| VSLDB B0,B0,B0,4 |
| VSLDB B1,B1,B1,4 |
| VSLDB B2,B2,B2,4 |
| VSLDB B3,B3,B3,4 |
| VSLDB B4,B4,B4,4 |
| VSLDB B5,B5,B5,4 |
| VSLDB D0,D0,D0,12 |
| VSLDB D1,D1,D1,12 |
| VSLDB D2,D2,D2,12 |
| VSLDB D3,D3,D3,12 |
| VSLDB D4,D4,D4,12 |
| VSLDB D5,D5,D5,12 |
| |
| /* diagonal round: a += b; d ^= a; d <<<= 16 */ |
| VAF A0,A0,B0 |
| VAF A1,A1,B1 |
| VAF A2,A2,B2 |
| VAF A3,A3,B3 |
| VAF A4,A4,B4 |
| VAF A5,A5,B5 |
| VX D0,D0,A0 |
| VX D1,D1,A1 |
| VX D2,D2,A2 |
| VX D3,D3,A3 |
| VX D4,D4,A4 |
| VX D5,D5,A5 |
| VERLLF D0,D0,16 |
| VERLLF D1,D1,16 |
| VERLLF D2,D2,16 |
| VERLLF D3,D3,16 |
| VERLLF D4,D4,16 |
| VERLLF D5,D5,16 |
| |
| /* c += d; b ^= c; b <<<= 12 */ |
| VAF C0,C0,D0 |
| VAF C1,C1,D1 |
| VAF C2,C2,D2 |
| VAF C3,C3,D3 |
| VAF C4,C4,D4 |
| VAF C5,C5,D5 |
| VX B0,B0,C0 |
| VX B1,B1,C1 |
| VX B2,B2,C2 |
| VX B3,B3,C3 |
| VX B4,B4,C4 |
| VX B5,B5,C5 |
| VERLLF B0,B0,12 |
| VERLLF B1,B1,12 |
| VERLLF B2,B2,12 |
| VERLLF B3,B3,12 |
| VERLLF B4,B4,12 |
| VERLLF B5,B5,12 |
| |
| /* a += b; d ^= a; d <<<= 8 */ |
| VAF A0,A0,B0 |
| VAF A1,A1,B1 |
| VAF A2,A2,B2 |
| VAF A3,A3,B3 |
| VAF A4,A4,B4 |
| VAF A5,A5,B5 |
| VX D0,D0,A0 |
| VX D1,D1,A1 |
| VX D2,D2,A2 |
| VX D3,D3,A3 |
| VX D4,D4,A4 |
| VX D5,D5,A5 |
| VERLLF D0,D0,8 |
| VERLLF D1,D1,8 |
| VERLLF D2,D2,8 |
| VERLLF D3,D3,8 |
| VERLLF D4,D4,8 |
| VERLLF D5,D5,8 |
| |
| /* c += d; b ^= c; b <<<= 7 */ |
| VAF C0,C0,D0 |
| VAF C1,C1,D1 |
| VAF C2,C2,D2 |
| VAF C3,C3,D3 |
| VAF C4,C4,D4 |
| VAF C5,C5,D5 |
| VX B0,B0,C0 |
| VX B1,B1,C1 |
| VX B2,B2,C2 |
| VX B3,B3,C3 |
| VX B4,B4,C4 |
| VX B5,B5,C5 |
| VERLLF B0,B0,7 |
| VERLLF B1,B1,7 |
| VERLLF B2,B2,7 |
| VERLLF B3,B3,7 |
| VERLLF B4,B4,7 |
| VERLLF B5,B5,7 |
| |
| /* undo the row rotation: C by 8 bytes, B by 12, D by 4 */ |
| VSLDB C0,C0,C0,8 |
| VSLDB C1,C1,C1,8 |
| VSLDB C2,C2,C2,8 |
| VSLDB C3,C3,C3,8 |
| VSLDB C4,C4,C4,8 |
| VSLDB C5,C5,C5,8 |
| VSLDB B0,B0,B0,12 |
| VSLDB B1,B1,B1,12 |
| VSLDB B2,B2,B2,12 |
| VSLDB B3,B3,B3,12 |
| VSLDB B4,B4,B4,12 |
| VSLDB B5,B5,B5,12 |
| VSLDB D0,D0,D0,4 |
| VSLDB D1,D1,D1,4 |
| VSLDB D2,D2,D2,4 |
| VSLDB D3,D3,D3,4 |
| VSLDB D4,D4,D4,4 |
| VSLDB D5,D5,D5,4 |
| brct %r0,.Loop_vx |
| |
| /* |
| * Block 0: add the input rows and byte-swap each word. A1 and D1 are |
| * finished here as well, while K0 and T1 are still intact. |
| * A0..D0 always hold the keystream block about to be used, which is |
| * what .Ltail_vx expects. |
| */ |
| VAF A0,A0,K0 |
| VAF B0,B0,K1 |
| VAF C0,C0,K2 |
| VAF D0,D0,K3 |
| VAF A1,A1,K0 |
| VAF D1,D1,T1 # +K[3]+1 |
| |
| VPERM A0,A0,A0,BEPERM |
| VPERM B0,B0,B0,BEPERM |
| VPERM C0,C0,C0,BEPERM |
| VPERM D0,D0,D0,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| /* finish D2/D3 before T0..T3 (T0 is K0) are overwritten with input */ |
| VAF D2,D2,T2 # +K[3]+2 |
| VAF D3,D3,T3 # +K[3]+3 |
| VLM T0,T3,0,INP,0 |
| |
| VX A0,A0,T0 |
| VX B0,B0,T1 |
| VX C0,C0,T2 |
| VX D0,D0,T3 |
| |
| VLM K0,T3,0,%r7,4 # re-load sigma and increments |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_vx |
| |
| /* block 1: A1 and D1 were completed above */ |
| VAF B1,B1,K1 |
| VAF C1,C1,K2 |
| |
| VPERM A0,A1,A1,BEPERM |
| VPERM B0,B1,B1,BEPERM |
| VPERM C0,C1,C1,BEPERM |
| VPERM D0,D1,D1,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| /* A1..D1 are free now and serve as the input buffer from here on */ |
| VLM A1,D1,0,INP,0 |
| |
| VX A0,A0,A1 |
| VX B0,B0,B1 |
| VX C0,C0,C1 |
| VX D0,D0,D1 |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_vx |
| |
| /* block 2: D2 was completed above */ |
| VAF A2,A2,K0 |
| VAF B2,B2,K1 |
| VAF C2,C2,K2 |
| |
| VPERM A0,A2,A2,BEPERM |
| VPERM B0,B2,B2,BEPERM |
| VPERM C0,C2,C2,BEPERM |
| VPERM D0,D2,D2,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| VLM A1,D1,0,INP,0 |
| |
| VX A0,A0,A1 |
| VX B0,B0,B1 |
| VX C0,C0,C1 |
| VX D0,D0,D1 |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_vx |
| |
| /* block 3: D3 was completed above; D2 is reused as a counter temp */ |
| VAF A3,A3,K0 |
| VAF B3,B3,K1 |
| VAF C3,C3,K2 |
| VAF D2,K3,T3 # K[3]+3 |
| |
| VPERM A0,A3,A3,BEPERM |
| VPERM B0,B3,B3,BEPERM |
| VPERM C0,C3,C3,BEPERM |
| VPERM D0,D3,D3,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| /* D3 has been consumed by the VPERM above and becomes a counter temp */ |
| VAF D3,D2,T1 # K[3]+4 |
| VLM A1,D1,0,INP,0 |
| |
| VX A0,A0,A1 |
| VX B0,B0,B1 |
| VX C0,C0,C1 |
| VX D0,D0,D1 |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_vx |
| |
| /* block 4; K3 is also advanced by 6 for the next outer iteration */ |
| VAF A4,A4,K0 |
| VAF B4,B4,K1 |
| VAF C4,C4,K2 |
| VAF D4,D4,D3 # +K[3]+4 |
| VAF D3,D3,T1 # K[3]+5 |
| VAF K3,D2,T3 # K[3]+=6 |
| |
| VPERM A0,A4,A4,BEPERM |
| VPERM B0,B4,B4,BEPERM |
| VPERM C0,C4,C4,BEPERM |
| VPERM D0,D4,D4,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| VLM A1,D1,0,INP,0 |
| |
| VX A0,A0,A1 |
| VX B0,B0,B1 |
| VX C0,C0,C1 |
| VX D0,D0,D1 |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| aghi LEN,-0x40 |
| je .Ldone_vx |
| |
| /* block 5 */ |
| VAF A5,A5,K0 |
| VAF B5,B5,K1 |
| VAF C5,C5,K2 |
| VAF D5,D5,D3 # +K[3]+5 |
| |
| VPERM A0,A5,A5,BEPERM |
| VPERM B0,B5,B5,BEPERM |
| VPERM C0,C5,C5,BEPERM |
| VPERM D0,D5,D5,BEPERM |
| |
| .insn rilu,0xc20e00000000,LEN,0x40 # clgfi LEN,0x40 |
| jl .Ltail_vx |
| |
| VLM A1,D1,0,INP,0 |
| |
| VX A0,A0,A1 |
| VX B0,B0,B1 |
| VX C0,C0,C1 |
| VX D0,D0,D1 |
| |
| VSTM A0,D0,0,OUT,0 |
| |
| la INP,0x40(INP) |
| la OUT,0x40(OUT) |
| lhi %r0,10 # round counter for the next six blocks |
| aghi LEN,-0x40 |
| jne .Loop_outer_vx |
| |
| .Ldone_vx: |
| lmg %r6,%r7,FRAME+6*8(SP) |
| la SP,FRAME(SP) |
| BR_EX %r14 |
| |
| /* |
| * Partial last block (1..63 bytes left): spill the keystream block in |
| * A0..D0 to 8*8(SP) and XOR it into the data one byte at a time. |
| */ |
| .Ltail_vx: |
| VSTM A0,D0,8*8,SP,3 |
| lghi %r1,0 # byte index |
| |
| .Loop_tail_vx: |
| llgc %r5,0(%r1,INP) # KEY and COUNTER are dead here |
| llgc %r6,8*8(%r1,SP) |
| xr %r6,%r5 |
| stc %r6,0(%r1,OUT) |
| la %r1,1(%r1) |
| brct LEN,.Loop_tail_vx |
| |
| lmg %r6,%r7,FRAME+6*8(SP) |
| la SP,FRAME(SP) |
| BR_EX %r14 |
| ENDPROC(chacha20_vx) |
| |
| .previous |