Blame - lib/crypto/curve25519-fiat32.c - SHIFTPHONES/mainline/linux

blob: 2fde0ec33dbd0873147c064ac9afec8eccd7e690 [file] [log] [blame]

Jason A. Donenfeld	0ed42a6f	2019-11-08 13:22:32 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0 OR MIT
				2	/*
				3	* Copyright (C) 2015-2016 The fiat-crypto Authors.
				4	* Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
				5	*
				6	* This is a machine-generated formally verified implementation of Curve25519
				7	* ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
				8	* machine generated, it has been tweaked to be suitable for use in the kernel.
				9	* It is optimized for 32-bit machines and machines that cannot work efficiently
				10	* with 128-bit integer types.
				11	*/
				12
				13	#include <asm/unaligned.h>
				14	#include <crypto/curve25519.h>
				15	#include <linux/string.h>
				16
				17	/* fe means field element. Here the field is \Z/(2^255-19). An element t,
				18	* entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
				19	* t[3]+2^102 t[4]+...+2^230 t[9].
				20	* fe limbs are bounded by 1.1252^26,1.1252^25,1.1252^26,1.1252^25,etc.
				21	* Multiplication and carrying produce fe from fe_loose.
				22	*/
				23	typedef struct fe { u32 v[10]; } fe;
				24
				25	/* fe_loose limbs are bounded by 3.3752^26,3.3752^25,3.3752^26,3.3752^25,etc
				26	* Addition and subtraction produce fe_loose from (fe, fe).
				27	*/
				28	typedef struct fe_loose { u32 v[10]; } fe_loose;
				29
				30	static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s)
				31	{
				32	/* Ignores top bit of s. */
				33	u32 a0 = get_unaligned_le32(s);
				34	u32 a1 = get_unaligned_le32(s+4);
				35	u32 a2 = get_unaligned_le32(s+8);
				36	u32 a3 = get_unaligned_le32(s+12);
				37	u32 a4 = get_unaligned_le32(s+16);
				38	u32 a5 = get_unaligned_le32(s+20);
				39	u32 a6 = get_unaligned_le32(s+24);
				40	u32 a7 = get_unaligned_le32(s+28);
				41	h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */
				42	h[1] = (a0>>26) \| ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */
				43	h[2] = (a1>>19) \| ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
				44	h[3] = (a2>>13) \| ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */
				45	h[4] = (a3>> 6); /* (32- 6) = 26 */
				46	h[5] = a4&((1<<25)-1); /* 25 */
				47	h[6] = (a4>>25) \| ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */
				48	h[7] = (a5>>19) \| ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
				49	h[8] = (a6>>12) \| ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */
				50	h[9] = (a7>> 6)&((1<<25)-1); /* 25 */
				51	}
				52
				53	static __always_inline void fe_frombytes(fe h, const u8 s)
				54	{
				55	fe_frombytes_impl(h->v, s);
				56	}
				57
				58	static __always_inline u8 /bool/
				59	addcarryx_u25(u8 /bool/ c, u32 a, u32 b, u32 *low)
				60	{
				61	/* This function extracts 25 bits of result and 1 bit of carry
				62	* (26 total), so a 32-bit intermediate is sufficient.
				63	*/
				64	u32 x = a + b + c;
				65	*low = x & ((1 << 25) - 1);
				66	return (x >> 25) & 1;
				67	}
				68
				69	static __always_inline u8 /bool/
				70	addcarryx_u26(u8 /bool/ c, u32 a, u32 b, u32 *low)
				71	{
				72	/* This function extracts 26 bits of result and 1 bit of carry
				73	* (27 total), so a 32-bit intermediate is sufficient.
				74	*/
				75	u32 x = a + b + c;
				76	*low = x & ((1 << 26) - 1);
				77	return (x >> 26) & 1;
				78	}
				79
				80	static __always_inline u8 /bool/
				81	subborrow_u25(u8 /bool/ c, u32 a, u32 b, u32 *low)
				82	{
				83	/* This function extracts 25 bits of result and 1 bit of borrow
				84	* (26 total), so a 32-bit intermediate is sufficient.
				85	*/
				86	u32 x = a - b - c;
				87	*low = x & ((1 << 25) - 1);
				88	return x >> 31;
				89	}
				90
				91	static __always_inline u8 /bool/
				92	subborrow_u26(u8 /bool/ c, u32 a, u32 b, u32 *low)
				93	{
				94	/* This function extracts 26 bits of result and 1 bit of borrow
				95	*(27 total), so a 32-bit intermediate is sufficient.
				96	*/
				97	u32 x = a - b - c;
				98	*low = x & ((1 << 26) - 1);
				99	return x >> 31;
				100	}
				101
				102	static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz)
				103	{
				104	t = -!!t; /* all set if nonzero, 0 if 0 */
				105	return (t&nz) \| ((~t)&z);
				106	}
				107
				108	static __always_inline void fe_freeze(u32 out[10], const u32 in1[10])
				109	{
				110	{ const u32 x17 = in1[9];
				111	{ const u32 x18 = in1[8];
				112	{ const u32 x16 = in1[7];
				113	{ const u32 x14 = in1[6];
				114	{ const u32 x12 = in1[5];
				115	{ const u32 x10 = in1[4];
				116	{ const u32 x8 = in1[3];
				117	{ const u32 x6 = in1[2];
				118	{ const u32 x4 = in1[1];
				119	{ const u32 x2 = in1[0];
				120	{ u32 x20; u8/bool/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
				121	{ u32 x23; u8/bool/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
				122	{ u32 x26; u8/bool/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
				123	{ u32 x29; u8/bool/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
				124	{ u32 x32; u8/bool/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
				125	{ u32 x35; u8/bool/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
				126	{ u32 x38; u8/bool/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
				127	{ u32 x41; u8/bool/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
				128	{ u32 x44; u8/bool/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
				129	{ u32 x47; u8/bool/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
				130	{ u32 x49 = cmovznz32(x48, 0x0, 0xffffffff);
				131	{ u32 x50 = (x49 & 0x3ffffed);
				132	{ u32 x52; u8/bool/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
				133	{ u32 x54 = (x49 & 0x1ffffff);
				134	{ u32 x56; u8/bool/ x57 = addcarryx_u25(x53, x23, x54, &x56);
				135	{ u32 x58 = (x49 & 0x3ffffff);
				136	{ u32 x60; u8/bool/ x61 = addcarryx_u26(x57, x26, x58, &x60);
				137	{ u32 x62 = (x49 & 0x1ffffff);
				138	{ u32 x64; u8/bool/ x65 = addcarryx_u25(x61, x29, x62, &x64);
				139	{ u32 x66 = (x49 & 0x3ffffff);
				140	{ u32 x68; u8/bool/ x69 = addcarryx_u26(x65, x32, x66, &x68);
				141	{ u32 x70 = (x49 & 0x1ffffff);
				142	{ u32 x72; u8/bool/ x73 = addcarryx_u25(x69, x35, x70, &x72);
				143	{ u32 x74 = (x49 & 0x3ffffff);
				144	{ u32 x76; u8/bool/ x77 = addcarryx_u26(x73, x38, x74, &x76);
				145	{ u32 x78 = (x49 & 0x1ffffff);
				146	{ u32 x80; u8/bool/ x81 = addcarryx_u25(x77, x41, x78, &x80);
				147	{ u32 x82 = (x49 & 0x3ffffff);
				148	{ u32 x84; u8/bool/ x85 = addcarryx_u26(x81, x44, x82, &x84);
				149	{ u32 x86 = (x49 & 0x1ffffff);
				150	{ u32 x88; addcarryx_u25(x85, x47, x86, &x88);
				151	out[0] = x52;
				152	out[1] = x56;
				153	out[2] = x60;
				154	out[3] = x64;
				155	out[4] = x68;
				156	out[5] = x72;
				157	out[6] = x76;
				158	out[7] = x80;
				159	out[8] = x84;
				160	out[9] = x88;
				161	}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				162	}
				163
				164	static __always_inline void fe_tobytes(u8 s[32], const fe *f)
				165	{
				166	u32 h[10];
				167	fe_freeze(h, f->v);
				168	s[0] = h[0] >> 0;
				169	s[1] = h[0] >> 8;
				170	s[2] = h[0] >> 16;
				171	s[3] = (h[0] >> 24) \| (h[1] << 2);
				172	s[4] = h[1] >> 6;
				173	s[5] = h[1] >> 14;
				174	s[6] = (h[1] >> 22) \| (h[2] << 3);
				175	s[7] = h[2] >> 5;
				176	s[8] = h[2] >> 13;
				177	s[9] = (h[2] >> 21) \| (h[3] << 5);
				178	s[10] = h[3] >> 3;
				179	s[11] = h[3] >> 11;
				180	s[12] = (h[3] >> 19) \| (h[4] << 6);
				181	s[13] = h[4] >> 2;
				182	s[14] = h[4] >> 10;
				183	s[15] = h[4] >> 18;
				184	s[16] = h[5] >> 0;
				185	s[17] = h[5] >> 8;
				186	s[18] = h[5] >> 16;
				187	s[19] = (h[5] >> 24) \| (h[6] << 1);
				188	s[20] = h[6] >> 7;
				189	s[21] = h[6] >> 15;
				190	s[22] = (h[6] >> 23) \| (h[7] << 3);
				191	s[23] = h[7] >> 5;
				192	s[24] = h[7] >> 13;
				193	s[25] = (h[7] >> 21) \| (h[8] << 4);
				194	s[26] = h[8] >> 4;
				195	s[27] = h[8] >> 12;
				196	s[28] = (h[8] >> 20) \| (h[9] << 6);
				197	s[29] = h[9] >> 2;
				198	s[30] = h[9] >> 10;
				199	s[31] = h[9] >> 18;
				200	}
				201
				202	/* h = f */
				203	static __always_inline void fe_copy(fe h, const fe f)
				204	{
				205	memmove(h, f, sizeof(u32) * 10);
				206	}
				207
				208	static __always_inline void fe_copy_lt(fe_loose h, const fe f)
				209	{
				210	memmove(h, f, sizeof(u32) * 10);
				211	}
				212
				213	/* h = 0 */
				214	static __always_inline void fe_0(fe *h)
				215	{
				216	memset(h, 0, sizeof(u32) * 10);
				217	}
				218
				219	/* h = 1 */
				220	static __always_inline void fe_1(fe *h)
				221	{
				222	memset(h, 0, sizeof(u32) * 10);
				223	h->v[0] = 1;
				224	}
				225
/* out = in1 + in2, limb by limb, with no carry propagation.
 *
 * Every input limb is loaded before the first store, so out may alias
 * in1 and/or in2.
 */
static noinline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
	const u32 x20 = in1[9];
	const u32 x21 = in1[8];
	const u32 x19 = in1[7];
	const u32 x17 = in1[6];
	const u32 x15 = in1[5];
	const u32 x13 = in1[4];
	const u32 x11 = in1[3];
	const u32 x9 = in1[2];
	const u32 x7 = in1[1];
	const u32 x5 = in1[0];
	const u32 x38 = in2[9];
	const u32 x39 = in2[8];
	const u32 x37 = in2[7];
	const u32 x35 = in2[6];
	const u32 x33 = in2[5];
	const u32 x31 = in2[4];
	const u32 x29 = in2[3];
	const u32 x27 = in2[2];
	const u32 x25 = in2[1];
	const u32 x23 = in2[0];

	out[0] = (x5 + x23);
	out[1] = (x7 + x25);
	out[2] = (x9 + x27);
	out[3] = (x11 + x29);
	out[4] = (x13 + x31);
	out[5] = (x15 + x33);
	out[6] = (x17 + x35);
	out[7] = (x19 + x37);
	out[8] = (x21 + x39);
	out[9] = (x20 + x38);
}
				260
				261	/* h = f + g
				262	* Can overlap h with f or g.
				263	*/
				264	static __always_inline void fe_add(fe_loose h, const fe f, const fe *g)
				265	{
				266	fe_add_impl(h->v, f->v, g->v);
				267	}
				268
/* out = in1 - in2, limb by limb, with no borrow propagation.
 *
 * A constant (0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, ...) is added to
 * each limb before subtracting. These are twice the limbs of 2^255-19
 * (2 * 0x3ffffed, 2 * 0x1ffffff, 2 * 0x3ffffff), so the result is offset by
 * a multiple of the modulus and, for in2 limbs within the documented fe
 * bounds, no limb wraps below zero.
 *
 * Every input limb is loaded before the first store, so out may alias
 * in1 and/or in2.
 */
static noinline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
	const u32 x20 = in1[9];
	const u32 x21 = in1[8];
	const u32 x19 = in1[7];
	const u32 x17 = in1[6];
	const u32 x15 = in1[5];
	const u32 x13 = in1[4];
	const u32 x11 = in1[3];
	const u32 x9 = in1[2];
	const u32 x7 = in1[1];
	const u32 x5 = in1[0];
	const u32 x38 = in2[9];
	const u32 x39 = in2[8];
	const u32 x37 = in2[7];
	const u32 x35 = in2[6];
	const u32 x33 = in2[5];
	const u32 x31 = in2[4];
	const u32 x29 = in2[3];
	const u32 x27 = in2[2];
	const u32 x25 = in2[1];
	const u32 x23 = in2[0];

	out[0] = ((0x7ffffda + x5) - x23);
	out[1] = ((0x3fffffe + x7) - x25);
	out[2] = ((0x7fffffe + x9) - x27);
	out[3] = ((0x3fffffe + x11) - x29);
	out[4] = ((0x7fffffe + x13) - x31);
	out[5] = ((0x3fffffe + x15) - x33);
	out[6] = ((0x7fffffe + x17) - x35);
	out[7] = ((0x3fffffe + x19) - x37);
	out[8] = ((0x7fffffe + x21) - x39);
	out[9] = ((0x3fffffe + x20) - x38);
}
				303
				304	/* h = f - g
				305	* Can overlap h with f or g.
				306	*/
				307	static __always_inline void fe_sub(fe_loose h, const fe f, const fe *g)
				308	{
				309	fe_sub_impl(h->v, f->v, g->v);
				310	}
				311
/* out = in1 * in2 in the field, on ten 26/25-bit limbs.
 *
 * Layout of the straight-line code:
 *   x40..x58   the 19 schoolbook product columns, accumulated in 64 bits;
 *              the 0x2 factors appear where two odd-indexed (25-bit) limbs
 *              meet, because their weights multiply to one bit more than the
 *              column's weight.
 *   x59..x85   fold columns 10..18 onto columns 0..8 scaled by 19, written
 *              as (c << 4) + (c << 1) + c.
 *   x86..x114  carry chain, alternating 26-bit and 25-bit limbs.
 *   x115..x120 the carry out of the top limb re-enters limb 0 times 19, and
 *              is then carried once more through limbs 0 and 1 into limb 2.
 *
 * All inputs are loaded before the first store, so out may alias an input.
 */
static noinline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
	const u32 x20 = in1[9];
	const u32 x21 = in1[8];
	const u32 x19 = in1[7];
	const u32 x17 = in1[6];
	const u32 x15 = in1[5];
	const u32 x13 = in1[4];
	const u32 x11 = in1[3];
	const u32 x9 = in1[2];
	const u32 x7 = in1[1];
	const u32 x5 = in1[0];
	const u32 x38 = in2[9];
	const u32 x39 = in2[8];
	const u32 x37 = in2[7];
	const u32 x35 = in2[6];
	const u32 x33 = in2[5];
	const u32 x31 = in2[4];
	const u32 x29 = in2[3];
	const u32 x27 = in2[2];
	const u32 x25 = in2[1];
	const u32 x23 = in2[0];
	u64 x40 = ((u64)x23 * x5);
	u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
	u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
	u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
	u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
	u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
	u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
	u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
	u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
	u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
	u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
	u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
	u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
	u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
	u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
	u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
	u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
	u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
	u64 x58 = ((u64)(0x2 * x38) * x20);
	u64 x59 = (x48 + (x58 << 0x4));
	u64 x60 = (x59 + (x58 << 0x1));
	u64 x61 = (x60 + x58);
	u64 x62 = (x47 + (x57 << 0x4));
	u64 x63 = (x62 + (x57 << 0x1));
	u64 x64 = (x63 + x57);
	u64 x65 = (x46 + (x56 << 0x4));
	u64 x66 = (x65 + (x56 << 0x1));
	u64 x67 = (x66 + x56);
	u64 x68 = (x45 + (x55 << 0x4));
	u64 x69 = (x68 + (x55 << 0x1));
	u64 x70 = (x69 + x55);
	u64 x71 = (x44 + (x54 << 0x4));
	u64 x72 = (x71 + (x54 << 0x1));
	u64 x73 = (x72 + x54);
	u64 x74 = (x43 + (x53 << 0x4));
	u64 x75 = (x74 + (x53 << 0x1));
	u64 x76 = (x75 + x53);
	u64 x77 = (x42 + (x52 << 0x4));
	u64 x78 = (x77 + (x52 << 0x1));
	u64 x79 = (x78 + x52);
	u64 x80 = (x41 + (x51 << 0x4));
	u64 x81 = (x80 + (x51 << 0x1));
	u64 x82 = (x81 + x51);
	u64 x83 = (x40 + (x50 << 0x4));
	u64 x84 = (x83 + (x50 << 0x1));
	u64 x85 = (x84 + x50);
	u64 x86 = (x85 >> 0x1a);
	u32 x87 = ((u32)x85 & 0x3ffffff);
	u64 x88 = (x86 + x82);
	u64 x89 = (x88 >> 0x19);
	u32 x90 = ((u32)x88 & 0x1ffffff);
	u64 x91 = (x89 + x79);
	u64 x92 = (x91 >> 0x1a);
	u32 x93 = ((u32)x91 & 0x3ffffff);
	u64 x94 = (x92 + x76);
	u64 x95 = (x94 >> 0x19);
	u32 x96 = ((u32)x94 & 0x1ffffff);
	u64 x97 = (x95 + x73);
	u64 x98 = (x97 >> 0x1a);
	u32 x99 = ((u32)x97 & 0x3ffffff);
	u64 x100 = (x98 + x70);
	u64 x101 = (x100 >> 0x19);
	u32 x102 = ((u32)x100 & 0x1ffffff);
	u64 x103 = (x101 + x67);
	u64 x104 = (x103 >> 0x1a);
	u32 x105 = ((u32)x103 & 0x3ffffff);
	u64 x106 = (x104 + x64);
	u64 x107 = (x106 >> 0x19);
	u32 x108 = ((u32)x106 & 0x1ffffff);
	u64 x109 = (x107 + x61);
	u64 x110 = (x109 >> 0x1a);
	u32 x111 = ((u32)x109 & 0x3ffffff);
	u64 x112 = (x110 + x49);
	u64 x113 = (x112 >> 0x19);
	u32 x114 = ((u32)x112 & 0x1ffffff);
	u64 x115 = (x87 + (0x13 * x113));
	u32 x116 = (u32) (x115 >> 0x1a);
	u32 x117 = ((u32)x115 & 0x3ffffff);
	u32 x118 = (x116 + x90);
	u32 x119 = (x118 >> 0x19);
	u32 x120 = (x118 & 0x1ffffff);

	out[0] = x117;
	out[1] = x120;
	out[2] = (x119 + x93);
	out[3] = x96;
	out[4] = x99;
	out[5] = x102;
	out[6] = x105;
	out[7] = x108;
	out[8] = x111;
	out[9] = x114;
}
				427
				428	static __always_inline void fe_mul_ttt(fe h, const fe f, const fe *g)
				429	{
				430	fe_mul_impl(h->v, f->v, g->v);
				431	}
				432
				433	static __always_inline void fe_mul_tlt(fe h, const fe_loose f, const fe *g)
				434	{
				435	fe_mul_impl(h->v, f->v, g->v);
				436	}
				437
				438	static __always_inline void
				439	fe_mul_tll(fe h, const fe_loose f, const fe_loose *g)
				440	{
				441	fe_mul_impl(h->v, f->v, g->v);
				442	}
				443
/* out = in1 * in1 in the field, on ten 26/25-bit limbs.
 *
 * Same structure as the general multiplication, with symmetric cross terms
 * merged (hence the 0x2 and 0x4 factors):
 *   x19..x37  the 19 product columns, accumulated in 64 bits.
 *   x38..x64  fold columns 10..18 onto columns 0..8 scaled by 19, written
 *             as (c << 4) + (c << 1) + c.
 *   x65..x93  carry chain, alternating 26-bit and 25-bit limbs.
 *   x94..x99  the carry out of the top limb re-enters limb 0 times 19, and
 *             is then carried once more through limbs 0 and 1 into limb 2.
 *
 * All inputs are loaded before the first store, so out may alias in1.
 */
static noinline void fe_sqr_impl(u32 out[10], const u32 in1[10])
{
	const u32 x17 = in1[9];
	const u32 x18 = in1[8];
	const u32 x16 = in1[7];
	const u32 x14 = in1[6];
	const u32 x12 = in1[5];
	const u32 x10 = in1[4];
	const u32 x8 = in1[3];
	const u32 x6 = in1[2];
	const u32 x4 = in1[1];
	const u32 x2 = in1[0];
	u64 x19 = ((u64)x2 * x2);
	u64 x20 = ((u64)(0x2 * x2) * x4);
	u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6)));
	u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8)));
	u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10));
	u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12)));
	u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12)));
	u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16)));
	u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12))))));
	u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17)));
	u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17)))));
	u64 x30 = (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17)));
	u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17))))));
	u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17)));
	u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17)));
	u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17)));
	u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17));
	u64 x36 = ((u64)(0x2 * x18) * x17);
	u64 x37 = ((u64)(0x2 * x17) * x17);
	u64 x38 = (x27 + (x37 << 0x4));
	u64 x39 = (x38 + (x37 << 0x1));
	u64 x40 = (x39 + x37);
	u64 x41 = (x26 + (x36 << 0x4));
	u64 x42 = (x41 + (x36 << 0x1));
	u64 x43 = (x42 + x36);
	u64 x44 = (x25 + (x35 << 0x4));
	u64 x45 = (x44 + (x35 << 0x1));
	u64 x46 = (x45 + x35);
	u64 x47 = (x24 + (x34 << 0x4));
	u64 x48 = (x47 + (x34 << 0x1));
	u64 x49 = (x48 + x34);
	u64 x50 = (x23 + (x33 << 0x4));
	u64 x51 = (x50 + (x33 << 0x1));
	u64 x52 = (x51 + x33);
	u64 x53 = (x22 + (x32 << 0x4));
	u64 x54 = (x53 + (x32 << 0x1));
	u64 x55 = (x54 + x32);
	u64 x56 = (x21 + (x31 << 0x4));
	u64 x57 = (x56 + (x31 << 0x1));
	u64 x58 = (x57 + x31);
	u64 x59 = (x20 + (x30 << 0x4));
	u64 x60 = (x59 + (x30 << 0x1));
	u64 x61 = (x60 + x30);
	u64 x62 = (x19 + (x29 << 0x4));
	u64 x63 = (x62 + (x29 << 0x1));
	u64 x64 = (x63 + x29);
	u64 x65 = (x64 >> 0x1a);
	u32 x66 = ((u32)x64 & 0x3ffffff);
	u64 x67 = (x65 + x61);
	u64 x68 = (x67 >> 0x19);
	u32 x69 = ((u32)x67 & 0x1ffffff);
	u64 x70 = (x68 + x58);
	u64 x71 = (x70 >> 0x1a);
	u32 x72 = ((u32)x70 & 0x3ffffff);
	u64 x73 = (x71 + x55);
	u64 x74 = (x73 >> 0x19);
	u32 x75 = ((u32)x73 & 0x1ffffff);
	u64 x76 = (x74 + x52);
	u64 x77 = (x76 >> 0x1a);
	u32 x78 = ((u32)x76 & 0x3ffffff);
	u64 x79 = (x77 + x49);
	u64 x80 = (x79 >> 0x19);
	u32 x81 = ((u32)x79 & 0x1ffffff);
	u64 x82 = (x80 + x46);
	u64 x83 = (x82 >> 0x1a);
	u32 x84 = ((u32)x82 & 0x3ffffff);
	u64 x85 = (x83 + x43);
	u64 x86 = (x85 >> 0x19);
	u32 x87 = ((u32)x85 & 0x1ffffff);
	u64 x88 = (x86 + x40);
	u64 x89 = (x88 >> 0x1a);
	u32 x90 = ((u32)x88 & 0x3ffffff);
	u64 x91 = (x89 + x28);
	u64 x92 = (x91 >> 0x19);
	u32 x93 = ((u32)x91 & 0x1ffffff);
	u64 x94 = (x66 + (0x13 * x92));
	u32 x95 = (u32) (x94 >> 0x1a);
	u32 x96 = ((u32)x94 & 0x3ffffff);
	u32 x97 = (x95 + x69);
	u32 x98 = (x97 >> 0x19);
	u32 x99 = (x97 & 0x1ffffff);

	out[0] = x96;
	out[1] = x99;
	out[2] = (x98 + x72);
	out[3] = x75;
	out[4] = x78;
	out[5] = x81;
	out[6] = x84;
	out[7] = x87;
	out[8] = x90;
	out[9] = x93;
}
				549
				550	static __always_inline void fe_sq_tl(fe h, const fe_loose f)
				551	{
				552	fe_sqr_impl(h->v, f->v);
				553	}
				554
				555	static __always_inline void fe_sq_tt(fe h, const fe f)
				556	{
				557	fe_sqr_impl(h->v, f->v);
				558	}
				559
				560	static __always_inline void fe_loose_invert(fe out, const fe_loose z)
				561	{
				562	fe t0;
				563	fe t1;
				564	fe t2;
				565	fe t3;
				566	int i;
				567
				568	fe_sq_tl(&t0, z);
				569	fe_sq_tt(&t1, &t0);
				570	for (i = 1; i < 2; ++i)
				571	fe_sq_tt(&t1, &t1);
				572	fe_mul_tlt(&t1, z, &t1);
				573	fe_mul_ttt(&t0, &t0, &t1);
				574	fe_sq_tt(&t2, &t0);
				575	fe_mul_ttt(&t1, &t1, &t2);
				576	fe_sq_tt(&t2, &t1);
				577	for (i = 1; i < 5; ++i)
				578	fe_sq_tt(&t2, &t2);
				579	fe_mul_ttt(&t1, &t2, &t1);
				580	fe_sq_tt(&t2, &t1);
				581	for (i = 1; i < 10; ++i)
				582	fe_sq_tt(&t2, &t2);
				583	fe_mul_ttt(&t2, &t2, &t1);
				584	fe_sq_tt(&t3, &t2);
				585	for (i = 1; i < 20; ++i)
				586	fe_sq_tt(&t3, &t3);
				587	fe_mul_ttt(&t2, &t3, &t2);
				588	fe_sq_tt(&t2, &t2);
				589	for (i = 1; i < 10; ++i)
				590	fe_sq_tt(&t2, &t2);
				591	fe_mul_ttt(&t1, &t2, &t1);
				592	fe_sq_tt(&t2, &t1);
				593	for (i = 1; i < 50; ++i)
				594	fe_sq_tt(&t2, &t2);
				595	fe_mul_ttt(&t2, &t2, &t1);
				596	fe_sq_tt(&t3, &t2);
				597	for (i = 1; i < 100; ++i)
				598	fe_sq_tt(&t3, &t3);
				599	fe_mul_ttt(&t2, &t3, &t2);
				600	fe_sq_tt(&t2, &t2);
				601	for (i = 1; i < 50; ++i)
				602	fe_sq_tt(&t2, &t2);
				603	fe_mul_ttt(&t1, &t2, &t1);
				604	fe_sq_tt(&t1, &t1);
				605	for (i = 1; i < 5; ++i)
				606	fe_sq_tt(&t1, &t1);
				607	fe_mul_ttt(out, &t1, &t0);
				608	}
				609
				610	static __always_inline void fe_invert(fe out, const fe z)
				611	{
				612	fe_loose l;
				613	fe_copy_lt(&l, z);
				614	fe_loose_invert(out, &l);
				615	}
				616
				617	/* Replace (f,g) with (g,f) if b == 1;
				618	* replace (f,g) with (f,g) if b == 0.
				619	*
				620	* Preconditions: b in {0,1}
				621	*/
Ard Biesheuvel	660bb8e	2019-11-08 13:22:35 +0100	[diff] [blame]	622	static noinline void fe_cswap(fe f, fe g, unsigned int b)
Jason A. Donenfeld	0ed42a6f	2019-11-08 13:22:32 +0100	[diff] [blame]	623	{
				624	unsigned i;
				625	b = 0 - b;
				626	for (i = 0; i < 10; i++) {
				627	u32 x = f->v[i] ^ g->v[i];
				628	x &= b;
				629	f->v[i] ^= x;
				630	g->v[i] ^= x;
				631	}
				632	}
				633
				634	/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/
				635	static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10])
				636	{
				637	{ const u32 x20 = in1[9];
				638	{ const u32 x21 = in1[8];
				639	{ const u32 x19 = in1[7];
				640	{ const u32 x17 = in1[6];
				641	{ const u32 x15 = in1[5];
				642	{ const u32 x13 = in1[4];
				643	{ const u32 x11 = in1[3];
				644	{ const u32 x9 = in1[2];
				645	{ const u32 x7 = in1[1];
				646	{ const u32 x5 = in1[0];
				647	{ const u32 x38 = 0;
				648	{ const u32 x39 = 0;
				649	{ const u32 x37 = 0;
				650	{ const u32 x35 = 0;
				651	{ const u32 x33 = 0;
				652	{ const u32 x31 = 0;
				653	{ const u32 x29 = 0;
				654	{ const u32 x27 = 0;
				655	{ const u32 x25 = 0;
				656	{ const u32 x23 = 121666;
				657	{ u64 x40 = ((u64)x23 * x5);
				658	{ u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
				659	{ u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
				660	{ u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
				661	{ u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
				662	{ u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
				663	{ u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
				664	{ u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
				665	{ u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
				666	{ u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
				667	{ u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
				668	{ u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
				669	{ u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
				670	{ u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
				671	{ u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
				672	{ u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
				673	{ u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
				674	{ u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
				675	{ u64 x58 = ((u64)(0x2 * x38) * x20);
				676	{ u64 x59 = (x48 + (x58 << 0x4));
				677	{ u64 x60 = (x59 + (x58 << 0x1));
				678	{ u64 x61 = (x60 + x58);
				679	{ u64 x62 = (x47 + (x57 << 0x4));
				680	{ u64 x63 = (x62 + (x57 << 0x1));
				681	{ u64 x64 = (x63 + x57);
				682	{ u64 x65 = (x46 + (x56 << 0x4));
				683	{ u64 x66 = (x65 + (x56 << 0x1));
				684	{ u64 x67 = (x66 + x56);
				685	{ u64 x68 = (x45 + (x55 << 0x4));
				686	{ u64 x69 = (x68 + (x55 << 0x1));
				687	{ u64 x70 = (x69 + x55);
				688	{ u64 x71 = (x44 + (x54 << 0x4));
				689	{ u64 x72 = (x71 + (x54 << 0x1));
				690	{ u64 x73 = (x72 + x54);
				691	{ u64 x74 = (x43 + (x53 << 0x4));
				692	{ u64 x75 = (x74 + (x53 << 0x1));
				693	{ u64 x76 = (x75 + x53);
				694	{ u64 x77 = (x42 + (x52 << 0x4));
				695	{ u64 x78 = (x77 + (x52 << 0x1));
				696	{ u64 x79 = (x78 + x52);
				697	{ u64 x80 = (x41 + (x51 << 0x4));
				698	{ u64 x81 = (x80 + (x51 << 0x1));
				699	{ u64 x82 = (x81 + x51);
				700	{ u64 x83 = (x40 + (x50 << 0x4));
				701	{ u64 x84 = (x83 + (x50 << 0x1));
				702	{ u64 x85 = (x84 + x50);
				703	{ u64 x86 = (x85 >> 0x1a);
				704	{ u32 x87 = ((u32)x85 & 0x3ffffff);
				705	{ u64 x88 = (x86 + x82);
				706	{ u64 x89 = (x88 >> 0x19);
				707	{ u32 x90 = ((u32)x88 & 0x1ffffff);
				708	{ u64 x91 = (x89 + x79);
				709	{ u64 x92 = (x91 >> 0x1a);
				710	{ u32 x93 = ((u32)x91 & 0x3ffffff);
				711	{ u64 x94 = (x92 + x76);
				712	{ u64 x95 = (x94 >> 0x19);
				713	{ u32 x96 = ((u32)x94 & 0x1ffffff);
				714	{ u64 x97 = (x95 + x73);
				715	{ u64 x98 = (x97 >> 0x1a);
				716	{ u32 x99 = ((u32)x97 & 0x3ffffff);
				717	{ u64 x100 = (x98 + x70);
				718	{ u64 x101 = (x100 >> 0x19);
				719	{ u32 x102 = ((u32)x100 & 0x1ffffff);
				720	{ u64 x103 = (x101 + x67);
				721	{ u64 x104 = (x103 >> 0x1a);
				722	{ u32 x105 = ((u32)x103 & 0x3ffffff);
				723	{ u64 x106 = (x104 + x64);
				724	{ u64 x107 = (x106 >> 0x19);
				725	{ u32 x108 = ((u32)x106 & 0x1ffffff);
				726	{ u64 x109 = (x107 + x61);
				727	{ u64 x110 = (x109 >> 0x1a);
				728	{ u32 x111 = ((u32)x109 & 0x3ffffff);
				729	{ u64 x112 = (x110 + x49);
				730	{ u64 x113 = (x112 >> 0x19);
				731	{ u32 x114 = ((u32)x112 & 0x1ffffff);
				732	{ u64 x115 = (x87 + (0x13 * x113));
				733	{ u32 x116 = (u32) (x115 >> 0x1a);
				734	{ u32 x117 = ((u32)x115 & 0x3ffffff);
				735	{ u32 x118 = (x116 + x90);
				736	{ u32 x119 = (x118 >> 0x19);
				737	{ u32 x120 = (x118 & 0x1ffffff);
				738	out[0] = x117;
				739	out[1] = x120;
				740	out[2] = (x119 + x93);
				741	out[3] = x96;
				742	out[4] = x99;
				743	out[5] = x102;
				744	out[6] = x105;
				745	out[7] = x108;
				746	out[8] = x111;
				747	out[9] = x114;
				748	}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				749	}
				750
				751	static __always_inline void fe_mul121666(fe h, const fe_loose f)
				752	{
				753	fe_mul_121666_impl(h->v, f->v);
				754	}
				755
				756	void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
				757	const u8 scalar[CURVE25519_KEY_SIZE],
				758	const u8 point[CURVE25519_KEY_SIZE])
				759	{
				760	fe x1, x2, z2, x3, z3;
				761	fe_loose x2l, z2l, x3l;
				762	unsigned swap = 0;
				763	int pos;
				764	u8 e[32];
				765
				766	memcpy(e, scalar, 32);
				767	curve25519_clamp_secret(e);
				768
				769	/* The following implementation was transcribed to Coq and proven to
				770	* correspond to unary scalar multiplication in affine coordinates given
				771	* that x1 != 0 is the x coordinate of some point on the curve. It was
				772	* also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
				773	* z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
				774	* quantified over the underlying field, so it applies to Curve25519
				775	* itself and the quadratic twist of Curve25519. It was not proven in
				776	* Coq that prime-field arithmetic correctly simulates extension-field
				777	* arithmetic on prime-field values. The decoding of the byte array
				778	* representation of e was not considered.
				779	*
				780	* Specification of Montgomery curves in affine coordinates:
				781	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
				782	*
				783	* Proof that these form a group that is isomorphic to a Weierstrass
				784	* curve:
				785	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
				786	*
				787	* Coq transcription and correctness proof of the loop
				788	* (where scalarbits=255):
				789	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
				790	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
				791	* preconditions: 0 <= e < 2^255 (not necessarily e < order),
				792	* fe_invert(0) = 0
				793	*/
				794	fe_frombytes(&x1, point);
				795	fe_1(&x2);
				796	fe_0(&z2);
				797	fe_copy(&x3, &x1);
				798	fe_1(&z3);
				799
				800	for (pos = 254; pos >= 0; --pos) {
				801	fe tmp0, tmp1;
				802	fe_loose tmp0l, tmp1l;
				803	/* loop invariant as of right before the test, for the case
				804	* where x1 != 0:
				805	* pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
				806	* is nonzero
				807	* let r := e >> (pos+1) in the following equalities of
				808	* projective points:
				809	* to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
				810	* to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
				811	* x1 is the nonzero x coordinate of the nonzero
				812	* point (rP-(r+1)P)
				813	*/
				814	unsigned b = 1 & (e[pos / 8] >> (pos & 7));
				815	swap ^= b;
				816	fe_cswap(&x2, &x3, swap);
				817	fe_cswap(&z2, &z3, swap);
				818	swap = b;
				819	/* Coq transcription of ladderstep formula (called from
				820	* transcribed loop):
				821	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
				822	* <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
				823	* x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
				824	* x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
				825	*/
				826	fe_sub(&tmp0l, &x3, &z3);
				827	fe_sub(&tmp1l, &x2, &z2);
				828	fe_add(&x2l, &x2, &z2);
				829	fe_add(&z2l, &x3, &z3);
				830	fe_mul_tll(&z3, &tmp0l, &x2l);
				831	fe_mul_tll(&z2, &z2l, &tmp1l);
				832	fe_sq_tl(&tmp0, &tmp1l);
				833	fe_sq_tl(&tmp1, &x2l);
				834	fe_add(&x3l, &z3, &z2);
				835	fe_sub(&z2l, &z3, &z2);
				836	fe_mul_ttt(&x2, &tmp1, &tmp0);
				837	fe_sub(&tmp1l, &tmp1, &tmp0);
				838	fe_sq_tl(&z2, &z2l);
				839	fe_mul121666(&z3, &tmp1l);
				840	fe_sq_tl(&x3, &x3l);
				841	fe_add(&tmp0l, &tmp0, &z3);
				842	fe_mul_ttt(&z3, &x1, &z2);
				843	fe_mul_tll(&z2, &tmp1l, &tmp0l);
				844	}
				845	/* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
				846	* else (x2, z2)
				847	*/
				848	fe_cswap(&x2, &x3, swap);
				849	fe_cswap(&z2, &z3, swap);
				850
				851	fe_invert(&z2, &z2);
				852	fe_mul_ttt(&x2, &x2, &z2);
				853	fe_tobytes(out, &x2);
				854
				855	memzero_explicit(&x1, sizeof(x1));
				856	memzero_explicit(&x2, sizeof(x2));
				857	memzero_explicit(&z2, sizeof(z2));
				858	memzero_explicit(&x3, sizeof(x3));
				859	memzero_explicit(&z3, sizeof(z3));
				860	memzero_explicit(&x2l, sizeof(x2l));
				861	memzero_explicit(&z2l, sizeof(z2l));
				862	memzero_explicit(&x3l, sizeof(x3l));
				863	memzero_explicit(&e, sizeof(e));
				864	}