Blame - lib/crypto/curve25519-hacl64.c - SHIFTPHONES/mainline/linux

blob: 771d82dc5f14e4f0d54a288c9dc6c522ee8e2c60 [file] [log] [blame]

Jason A. Donenfeld	0ed42a6f	2019-11-08 13:22:32 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */
				12
				13	#include <asm/unaligned.h>
				14	#include <crypto/curve25519.h>
				15	#include <linux/string.h>
				16
				17	typedef __uint128_t u128;
				18
				19	static __always_inline u64 u64_eq_mask(u64 a, u64 b)
				20	{
				21	u64 x = a ^ b;
				22	u64 minus_x = ~x + (u64)1U;
				23	u64 x_or_minus_x = x \| minus_x;
				24	u64 xnx = x_or_minus_x >> (u32)63U;
				25	u64 c = xnx - (u64)1U;
				26	return c;
				27	}
				28
				29	static __always_inline u64 u64_gte_mask(u64 a, u64 b)
				30	{
				31	u64 x = a;
				32	u64 y = b;
				33	u64 x_xor_y = x ^ y;
				34	u64 x_sub_y = x - y;
				35	u64 x_sub_y_xor_y = x_sub_y ^ y;
				36	u64 q = x_xor_y \| x_sub_y_xor_y;
				37	u64 x_xor_q = x ^ q;
				38	u64 x_xor_q_ = x_xor_q >> (u32)63U;
				39	u64 c = x_xor_q_ - (u64)1U;
				40	return c;
				41	}
				42
				43	static __always_inline void modulo_carry_top(u64 *b)
				44	{
				45	u64 b4 = b[4];
				46	u64 b0 = b[0];
				47	u64 b4_ = b4 & 0x7ffffffffffffLLU;
				48	u64 b0_ = b0 + 19 * (b4 >> 51);
				49	b[4] = b4_;
				50	b[0] = b0_;
				51	}
				52
				53	static __always_inline void fproduct_copy_from_wide_(u64 output, u128 input)
				54	{
				55	{
				56	u128 xi = input[0];
				57	output[0] = ((u64)(xi));
				58	}
				59	{
				60	u128 xi = input[1];
				61	output[1] = ((u64)(xi));
				62	}
				63	{
				64	u128 xi = input[2];
				65	output[2] = ((u64)(xi));
				66	}
				67	{
				68	u128 xi = input[3];
				69	output[3] = ((u64)(xi));
				70	}
				71	{
				72	u128 xi = input[4];
				73	output[4] = ((u64)(xi));
				74	}
				75	}
				76
				77	static __always_inline void
				78	fproduct_sum_scalar_multiplication_(u128 output, u64 input, u64 s)
				79	{
				80	output[0] += (u128)input[0] * s;
				81	output[1] += (u128)input[1] * s;
				82	output[2] += (u128)input[2] * s;
				83	output[3] += (u128)input[3] * s;
				84	output[4] += (u128)input[4] * s;
				85	}
				86
				87	static __always_inline void fproduct_carry_wide_(u128 *tmp)
				88	{
				89	{
				90	u32 ctr = 0;
				91	u128 tctr = tmp[ctr];
				92	u128 tctrp1 = tmp[ctr + 1];
				93	u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				94	u128 c = ((tctr) >> (51));
				95	tmp[ctr] = ((u128)(r0));
				96	tmp[ctr + 1] = ((tctrp1) + (c));
				97	}
				98	{
				99	u32 ctr = 1;
				100	u128 tctr = tmp[ctr];
				101	u128 tctrp1 = tmp[ctr + 1];
				102	u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				103	u128 c = ((tctr) >> (51));
				104	tmp[ctr] = ((u128)(r0));
				105	tmp[ctr + 1] = ((tctrp1) + (c));
				106	}
				107
				108	{
				109	u32 ctr = 2;
				110	u128 tctr = tmp[ctr];
				111	u128 tctrp1 = tmp[ctr + 1];
				112	u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				113	u128 c = ((tctr) >> (51));
				114	tmp[ctr] = ((u128)(r0));
				115	tmp[ctr + 1] = ((tctrp1) + (c));
				116	}
				117	{
				118	u32 ctr = 3;
				119	u128 tctr = tmp[ctr];
				120	u128 tctrp1 = tmp[ctr + 1];
				121	u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				122	u128 c = ((tctr) >> (51));
				123	tmp[ctr] = ((u128)(r0));
				124	tmp[ctr + 1] = ((tctrp1) + (c));
				125	}
				126	}
				127
				128	static __always_inline void fmul_shift_reduce(u64 *output)
				129	{
				130	u64 tmp = output[4];
				131	u64 b0;
				132	{
				133	u32 ctr = 5 - 0 - 1;
				134	u64 z = output[ctr - 1];
				135	output[ctr] = z;
				136	}
				137	{
				138	u32 ctr = 5 - 1 - 1;
				139	u64 z = output[ctr - 1];
				140	output[ctr] = z;
				141	}
				142	{
				143	u32 ctr = 5 - 2 - 1;
				144	u64 z = output[ctr - 1];
				145	output[ctr] = z;
				146	}
				147	{
				148	u32 ctr = 5 - 3 - 1;
				149	u64 z = output[ctr - 1];
				150	output[ctr] = z;
				151	}
				152	output[0] = tmp;
				153	b0 = output[0];
				154	output[0] = 19 * b0;
				155	}
				156
				157	static __always_inline void fmul_mul_shift_reduce_(u128 output, u64 input,
				158	u64 *input21)
				159	{
				160	u32 i;
				161	u64 input2i;
				162	{
				163	u64 input2i = input21[0];
				164	fproduct_sum_scalar_multiplication_(output, input, input2i);
				165	fmul_shift_reduce(input);
				166	}
				167	{
				168	u64 input2i = input21[1];
				169	fproduct_sum_scalar_multiplication_(output, input, input2i);
				170	fmul_shift_reduce(input);
				171	}
				172	{
				173	u64 input2i = input21[2];
				174	fproduct_sum_scalar_multiplication_(output, input, input2i);
				175	fmul_shift_reduce(input);
				176	}
				177	{
				178	u64 input2i = input21[3];
				179	fproduct_sum_scalar_multiplication_(output, input, input2i);
				180	fmul_shift_reduce(input);
				181	}
				182	i = 4;
				183	input2i = input21[i];
				184	fproduct_sum_scalar_multiplication_(output, input, input2i);
				185	}
				186
				187	static __always_inline void fmul_fmul(u64 output, u64 input, u64 *input21)
				188	{
				189	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
				190	{
				191	u128 b4;
				192	u128 b0;
				193	u128 b4_;
				194	u128 b0_;
				195	u64 i0;
				196	u64 i1;
				197	u64 i0_;
				198	u64 i1_;
				199	u128 t[5] = { 0 };
				200	fmul_mul_shift_reduce_(t, tmp, input21);
				201	fproduct_carry_wide_(t);
				202	b4 = t[4];
				203	b0 = t[0];
				204	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				205	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
				206	t[4] = b4_;
				207	t[0] = b0_;
				208	fproduct_copy_from_wide_(output, t);
				209	i0 = output[0];
				210	i1 = output[1];
				211	i0_ = i0 & 0x7ffffffffffffLLU;
				212	i1_ = i1 + (i0 >> 51);
				213	output[0] = i0_;
				214	output[1] = i1_;
				215	}
				216	}
				217
				218	static __always_inline void fsquare_fsquare__(u128 tmp, u64 output)
				219	{
				220	u64 r0 = output[0];
				221	u64 r1 = output[1];
				222	u64 r2 = output[2];
				223	u64 r3 = output[3];
				224	u64 r4 = output[4];
				225	u64 d0 = r0 * 2;
				226	u64 d1 = r1 * 2;
				227	u64 d2 = r2 * 2 * 19;
				228	u64 d419 = r4 * 19;
				229	u64 d4 = d419 * 2;
				230	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
				231	(((u128)(d2) * (r3))));
				232	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
				233	(((u128)(r3 * 19) * (r3))));
				234	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
				235	(((u128)(d4) * (r3))));
				236	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
				237	(((u128)(r4) * (d419))));
				238	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
				239	(((u128)(r2) * (r2))));
				240	tmp[0] = s0;
				241	tmp[1] = s1;
				242	tmp[2] = s2;
				243	tmp[3] = s3;
				244	tmp[4] = s4;
				245	}
				246
				247	static __always_inline void fsquare_fsquare_(u128 tmp, u64 output)
				248	{
				249	u128 b4;
				250	u128 b0;
				251	u128 b4_;
				252	u128 b0_;
				253	u64 i0;
				254	u64 i1;
				255	u64 i0_;
				256	u64 i1_;
				257	fsquare_fsquare__(tmp, output);
				258	fproduct_carry_wide_(tmp);
				259	b4 = tmp[4];
				260	b0 = tmp[0];
				261	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				262	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
				263	tmp[4] = b4_;
				264	tmp[0] = b0_;
				265	fproduct_copy_from_wide_(output, tmp);
				266	i0 = output[0];
				267	i1 = output[1];
				268	i0_ = i0 & 0x7ffffffffffffLLU;
				269	i1_ = i1 + (i0 >> 51);
				270	output[0] = i0_;
				271	output[1] = i1_;
				272	}
				273
				274	static __always_inline void fsquare_fsquare_times_(u64 output, u128 tmp,
				275	u32 count1)
				276	{
				277	u32 i;
				278	fsquare_fsquare_(tmp, output);
				279	for (i = 1; i < count1; ++i)
				280	fsquare_fsquare_(tmp, output);
				281	}
				282
				283	static __always_inline void fsquare_fsquare_times(u64 output, u64 input,
				284	u32 count1)
				285	{
				286	u128 t[5];
				287	memcpy(output, input, 5 * sizeof(*input));
				288	fsquare_fsquare_times_(output, t, count1);
				289	}
				290
				291	static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
				292	u32 count1)
				293	{
				294	u128 t[5];
				295	fsquare_fsquare_times_(output, t, count1);
				296	}
				297
				298	static __always_inline void crecip_crecip(u64 out, u64 z)
				299	{
				300	u64 buf[20] = { 0 };
				301	u64 *a0 = buf;
				302	u64 *t00 = buf + 5;
				303	u64 *b0 = buf + 10;
				304	u64 *t01;
				305	u64 *b1;
				306	u64 *c0;
				307	u64 *a;
				308	u64 *t0;
				309	u64 *b;
				310	u64 *c;
				311	fsquare_fsquare_times(a0, z, 1);
				312	fsquare_fsquare_times(t00, a0, 2);
				313	fmul_fmul(b0, t00, z);
				314	fmul_fmul(a0, b0, a0);
				315	fsquare_fsquare_times(t00, a0, 1);
				316	fmul_fmul(b0, t00, b0);
				317	fsquare_fsquare_times(t00, b0, 5);
				318	t01 = buf + 5;
				319	b1 = buf + 10;
				320	c0 = buf + 15;
				321	fmul_fmul(b1, t01, b1);
				322	fsquare_fsquare_times(t01, b1, 10);
				323	fmul_fmul(c0, t01, b1);
				324	fsquare_fsquare_times(t01, c0, 20);
				325	fmul_fmul(t01, t01, c0);
				326	fsquare_fsquare_times_inplace(t01, 10);
				327	fmul_fmul(b1, t01, b1);
				328	fsquare_fsquare_times(t01, b1, 50);
				329	a = buf;
				330	t0 = buf + 5;
				331	b = buf + 10;
				332	c = buf + 15;
				333	fmul_fmul(c, t0, b);
				334	fsquare_fsquare_times(t0, c, 100);
				335	fmul_fmul(t0, t0, c);
				336	fsquare_fsquare_times_inplace(t0, 50);
				337	fmul_fmul(t0, t0, b);
				338	fsquare_fsquare_times_inplace(t0, 5);
				339	fmul_fmul(out, t0, a);
				340	}
				341
				342	static __always_inline void fsum(u64 a, u64 b)
				343	{
				344	a[0] += b[0];
				345	a[1] += b[1];
				346	a[2] += b[2];
				347	a[3] += b[3];
				348	a[4] += b[4];
				349	}
				350
				351	static __always_inline void fdifference(u64 a, u64 b)
				352	{
				353	u64 tmp[5] = { 0 };
				354	u64 b0;
				355	u64 b1;
				356	u64 b2;
				357	u64 b3;
				358	u64 b4;
				359	memcpy(tmp, b, 5 * sizeof(*b));
				360	b0 = tmp[0];
				361	b1 = tmp[1];
				362	b2 = tmp[2];
				363	b3 = tmp[3];
				364	b4 = tmp[4];
				365	tmp[0] = b0 + 0x3fffffffffff68LLU;
				366	tmp[1] = b1 + 0x3ffffffffffff8LLU;
				367	tmp[2] = b2 + 0x3ffffffffffff8LLU;
				368	tmp[3] = b3 + 0x3ffffffffffff8LLU;
				369	tmp[4] = b4 + 0x3ffffffffffff8LLU;
				370	{
				371	u64 xi = a[0];
				372	u64 yi = tmp[0];
				373	a[0] = yi - xi;
				374	}
				375	{
				376	u64 xi = a[1];
				377	u64 yi = tmp[1];
				378	a[1] = yi - xi;
				379	}
				380	{
				381	u64 xi = a[2];
				382	u64 yi = tmp[2];
				383	a[2] = yi - xi;
				384	}
				385	{
				386	u64 xi = a[3];
				387	u64 yi = tmp[3];
				388	a[3] = yi - xi;
				389	}
				390	{
				391	u64 xi = a[4];
				392	u64 yi = tmp[4];
				393	a[4] = yi - xi;
				394	}
				395	}
				396
				397	static __always_inline void fscalar(u64 output, u64 b, u64 s)
				398	{
				399	u128 tmp[5];
				400	u128 b4;
				401	u128 b0;
				402	u128 b4_;
				403	u128 b0_;
				404	{
				405	u64 xi = b[0];
				406	tmp[0] = ((u128)(xi) * (s));
				407	}
				408	{
				409	u64 xi = b[1];
				410	tmp[1] = ((u128)(xi) * (s));
				411	}
				412	{
				413	u64 xi = b[2];
				414	tmp[2] = ((u128)(xi) * (s));
				415	}
				416	{
				417	u64 xi = b[3];
				418	tmp[3] = ((u128)(xi) * (s));
				419	}
				420	{
				421	u64 xi = b[4];
				422	tmp[4] = ((u128)(xi) * (s));
				423	}
				424	fproduct_carry_wide_(tmp);
				425	b4 = tmp[4];
				426	b0 = tmp[0];
				427	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				428	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
				429	tmp[4] = b4_;
				430	tmp[0] = b0_;
				431	fproduct_copy_from_wide_(output, tmp);
				432	}
				433
				434	static __always_inline void fmul(u64 output, u64 a, u64 *b)
				435	{
				436	fmul_fmul(output, a, b);
				437	}
				438
				439	static __always_inline void crecip(u64 output, u64 input)
				440	{
				441	crecip_crecip(output, input);
				442	}
				443
				444	static __always_inline void point_swap_conditional_step(u64 a, u64 b,
				445	u64 swap1, u32 ctr)
				446	{
				447	u32 i = ctr - 1;
				448	u64 ai = a[i];
				449	u64 bi = b[i];
				450	u64 x = swap1 & (ai ^ bi);
				451	u64 ai1 = ai ^ x;
				452	u64 bi1 = bi ^ x;
				453	a[i] = ai1;
				454	b[i] = bi1;
				455	}
				456
				457	static __always_inline void point_swap_conditional5(u64 a, u64 b, u64 swap1)
				458	{
				459	point_swap_conditional_step(a, b, swap1, 5);
				460	point_swap_conditional_step(a, b, swap1, 4);
				461	point_swap_conditional_step(a, b, swap1, 3);
				462	point_swap_conditional_step(a, b, swap1, 2);
				463	point_swap_conditional_step(a, b, swap1, 1);
				464	}
				465
				466	static __always_inline void point_swap_conditional(u64 a, u64 b, u64 iswap)
				467	{
				468	u64 swap1 = 0 - iswap;
				469	point_swap_conditional5(a, b, swap1);
				470	point_swap_conditional5(a + 5, b + 5, swap1);
				471	}
				472
				473	static __always_inline void point_copy(u64 output, u64 input)
				474	{
				475	memcpy(output, input, 5 * sizeof(*input));
				476	memcpy(output + 5, input + 5, 5 * sizeof(*input));
				477	}
				478
				479	static __always_inline void addanddouble_fmonty(u64 pp, u64 ppq, u64 *p,
				480	u64 pq, u64 qmqp)
				481	{
				482	u64 *qx = qmqp;
				483	u64 *x2 = pp;
				484	u64 *z2 = pp + 5;
				485	u64 *x3 = ppq;
				486	u64 *z3 = ppq + 5;
				487	u64 *x = p;
				488	u64 *z = p + 5;
				489	u64 *xprime = pq;
				490	u64 *zprime = pq + 5;
				491	u64 buf[40] = { 0 };
				492	u64 *origx = buf;
				493	u64 *origxprime0 = buf + 5;
				494	u64 *xxprime0;
				495	u64 *zzprime0;
				496	u64 *origxprime;
				497	xxprime0 = buf + 25;
				498	zzprime0 = buf + 30;
				499	memcpy(origx, x, 5 * sizeof(*x));
				500	fsum(x, z);
				501	fdifference(z, origx);
				502	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
				503	fsum(xprime, zprime);
				504	fdifference(zprime, origxprime0);
				505	fmul(xxprime0, xprime, z);
				506	fmul(zzprime0, x, zprime);
				507	origxprime = buf + 5;
				508	{
				509	u64 *xx0;
				510	u64 *zz0;
				511	u64 *xxprime;
				512	u64 *zzprime;
				513	u64 *zzzprime;
				514	xx0 = buf + 15;
				515	zz0 = buf + 20;
				516	xxprime = buf + 25;
				517	zzprime = buf + 30;
				518	zzzprime = buf + 35;
				519	memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
				520	fsum(xxprime, zzprime);
				521	fdifference(zzprime, origxprime);
				522	fsquare_fsquare_times(x3, xxprime, 1);
				523	fsquare_fsquare_times(zzzprime, zzprime, 1);
				524	fmul(z3, zzzprime, qx);
				525	fsquare_fsquare_times(xx0, x, 1);
				526	fsquare_fsquare_times(zz0, z, 1);
				527	{
				528	u64 *zzz;
				529	u64 *xx;
				530	u64 *zz;
				531	u64 scalar;
				532	zzz = buf + 10;
				533	xx = buf + 15;
				534	zz = buf + 20;
				535	fmul(x2, xx, zz);
				536	fdifference(zz, xx);
				537	scalar = 121665;
				538	fscalar(zzz, zz, scalar);
				539	fsum(zzz, xx);
				540	fmul(z2, zzz, zz);
				541	}
				542	}
				543	}
				544
				545	static __always_inline void
				546	ladder_smallloop_cmult_small_loop_step(u64 nq, u64 nqpq, u64 nq2, u64 nqpq2,
				547	u64 *q, u8 byt)
				548	{
				549	u64 bit0 = (u64)(byt >> 7);
				550	u64 bit;
				551	point_swap_conditional(nq, nqpq, bit0);
				552	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
				553	bit = (u64)(byt >> 7);
				554	point_swap_conditional(nq2, nqpq2, bit);
				555	}
				556
				557	static __always_inline void
				558	ladder_smallloop_cmult_small_loop_double_step(u64 nq, u64 nqpq, u64 *nq2,
				559	u64 nqpq2, u64 q, u8 byt)
				560	{
				561	u8 byt1;
				562	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
				563	byt1 = byt << 1;
				564	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
				565	}
				566
				567	static __always_inline void
				568	ladder_smallloop_cmult_small_loop(u64 nq, u64 nqpq, u64 nq2, u64 nqpq2,
				569	u64 *q, u8 byt, u32 i)
				570	{
				571	while (i--) {
				572	ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
				573	nqpq2, q, byt);
				574	byt <<= 2;
				575	}
				576	}
				577
				578	static __always_inline void ladder_bigloop_cmult_big_loop(u8 n1, u64 nq,
				579	u64 nqpq, u64 nq2,
				580	u64 nqpq2, u64 q,
				581	u32 i)
				582	{
				583	while (i--) {
				584	u8 byte = n1[i];
				585	ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
				586	byte, 4);
				587	}
				588	}
				589
				590	static void ladder_cmult(u64 result, u8 n1, u64 *q)
				591	{
				592	u64 point_buf[40] = { 0 };
				593	u64 *nq = point_buf;
				594	u64 *nqpq = point_buf + 10;
				595	u64 *nq2 = point_buf + 20;
				596	u64 *nqpq2 = point_buf + 30;
				597	point_copy(nqpq, q);
				598	nq[0] = 1;
				599	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
				600	point_copy(result, nq);
				601	}
				602
				603	static __always_inline void format_fexpand(u64 output, const u8 input)
				604	{
				605	const u8 *x00 = input + 6;
				606	const u8 *x01 = input + 12;
				607	const u8 *x02 = input + 19;
				608	const u8 *x0 = input + 24;
				609	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
				610	i0 = get_unaligned_le64(input);
				611	i1 = get_unaligned_le64(x00);
				612	i2 = get_unaligned_le64(x01);
				613	i3 = get_unaligned_le64(x02);
				614	i4 = get_unaligned_le64(x0);
				615	output0 = i0 & 0x7ffffffffffffLLU;
				616	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
				617	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
				618	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
				619	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
				620	output[0] = output0;
				621	output[1] = output1;
				622	output[2] = output2;
				623	output[3] = output3;
				624	output[4] = output4;
				625	}
				626
				627	static __always_inline void format_fcontract_first_carry_pass(u64 *input)
				628	{
				629	u64 t0 = input[0];
				630	u64 t1 = input[1];
				631	u64 t2 = input[2];
				632	u64 t3 = input[3];
				633	u64 t4 = input[4];
				634	u64 t1_ = t1 + (t0 >> 51);
				635	u64 t0_ = t0 & 0x7ffffffffffffLLU;
				636	u64 t2_ = t2 + (t1_ >> 51);
				637	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
				638	u64 t3_ = t3 + (t2_ >> 51);
				639	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
				640	u64 t4_ = t4 + (t3_ >> 51);
				641	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
				642	input[0] = t0_;
				643	input[1] = t1__;
				644	input[2] = t2__;
				645	input[3] = t3__;
				646	input[4] = t4_;
				647	}
				648
				649	static __always_inline void format_fcontract_first_carry_full(u64 *input)
				650	{
				651	format_fcontract_first_carry_pass(input);
				652	modulo_carry_top(input);
				653	}
				654
				655	static __always_inline void format_fcontract_second_carry_pass(u64 *input)
				656	{
				657	u64 t0 = input[0];
				658	u64 t1 = input[1];
				659	u64 t2 = input[2];
				660	u64 t3 = input[3];
				661	u64 t4 = input[4];
				662	u64 t1_ = t1 + (t0 >> 51);
				663	u64 t0_ = t0 & 0x7ffffffffffffLLU;
				664	u64 t2_ = t2 + (t1_ >> 51);
				665	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
				666	u64 t3_ = t3 + (t2_ >> 51);
				667	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
				668	u64 t4_ = t4 + (t3_ >> 51);
				669	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
				670	input[0] = t0_;
				671	input[1] = t1__;
				672	input[2] = t2__;
				673	input[3] = t3__;
				674	input[4] = t4_;
				675	}
				676
				677	static __always_inline void format_fcontract_second_carry_full(u64 *input)
				678	{
				679	u64 i0;
				680	u64 i1;
				681	u64 i0_;
				682	u64 i1_;
				683	format_fcontract_second_carry_pass(input);
				684	modulo_carry_top(input);
				685	i0 = input[0];
				686	i1 = input[1];
				687	i0_ = i0 & 0x7ffffffffffffLLU;
				688	i1_ = i1 + (i0 >> 51);
				689	input[0] = i0_;
				690	input[1] = i1_;
				691	}
				692
				693	static __always_inline void format_fcontract_trim(u64 *input)
				694	{
				695	u64 a0 = input[0];
				696	u64 a1 = input[1];
				697	u64 a2 = input[2];
				698	u64 a3 = input[3];
				699	u64 a4 = input[4];
				700	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
				701	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
				702	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
				703	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
				704	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
				705	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
				706	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
				707	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
				708	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
				709	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
				710	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
				711	input[0] = a0_;
				712	input[1] = a1_;
				713	input[2] = a2_;
				714	input[3] = a3_;
				715	input[4] = a4_;
				716	}
				717
				718	static __always_inline void format_fcontract_store(u8 output, u64 input)
				719	{
				720	u64 t0 = input[0];
				721	u64 t1 = input[1];
				722	u64 t2 = input[2];
				723	u64 t3 = input[3];
				724	u64 t4 = input[4];
				725	u64 o0 = t1 << 51 \| t0;
				726	u64 o1 = t2 << 38 \| t1 >> 13;
				727	u64 o2 = t3 << 25 \| t2 >> 26;
				728	u64 o3 = t4 << 12 \| t3 >> 39;
				729	u8 *b0 = output;
				730	u8 *b1 = output + 8;
				731	u8 *b2 = output + 16;
				732	u8 *b3 = output + 24;
				733	put_unaligned_le64(o0, b0);
				734	put_unaligned_le64(o1, b1);
				735	put_unaligned_le64(o2, b2);
				736	put_unaligned_le64(o3, b3);
				737	}
				738
				739	static __always_inline void format_fcontract(u8 output, u64 input)
				740	{
				741	format_fcontract_first_carry_full(input);
				742	format_fcontract_second_carry_full(input);
				743	format_fcontract_trim(input);
				744	format_fcontract_store(output, input);
				745	}
				746
				747	static __always_inline void format_scalar_of_point(u8 scalar, u64 point)
				748	{
				749	u64 *x = point;
				750	u64 *z = point + 5;
				751	u64 buf[10] __aligned(32) = { 0 };
				752	u64 *zmone = buf;
				753	u64 *sc = buf + 5;
				754	crecip(zmone, z);
				755	fmul(sc, x, zmone);
				756	format_fcontract(scalar, sc);
				757	}
				758
				759	void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
				760	const u8 secret[CURVE25519_KEY_SIZE],
				761	const u8 basepoint[CURVE25519_KEY_SIZE])
				762	{
				763	u64 buf0[10] __aligned(32) = { 0 };
				764	u64 *x0 = buf0;
				765	u64 *z = buf0 + 5;
				766	u64 *q;
				767	format_fexpand(x0, basepoint);
				768	z[0] = 1;
				769	q = buf0;
				770	{
				771	u8 e[32] __aligned(32) = { 0 };
				772	u8 *scalar;
				773	memcpy(e, secret, 32);
				774	curve25519_clamp_secret(e);
				775	scalar = e;
				776	{
				777	u64 buf[15] = { 0 };
				778	u64 *nq = buf;
				779	u64 *x = nq;
				780	x[0] = 1;
				781	ladder_cmult(nq, scalar, q);
				782	format_scalar_of_point(mypublic, nq);
				783	memzero_explicit(buf, sizeof(buf));
				784	}
				785	memzero_explicit(e, sizeof(e));
				786	}
				787	memzero_explicit(buf0, sizeof(buf0));
				788	}