// Copyright 2016 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14#include <stdint.h>
15
16#include <string.h>
17
18#include "cryptoc/p256.h"
19
/* Short fixed-width integer aliases used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint64_t u64;

/* Our field elements are represented as nine 32-bit limbs.
 *
 * The value of an felem (field element) is:
 *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
 *
 * That is, each limb is alternately 29 or 28-bits wide in little-endian
 * order.
 *
 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
 * when multiplying as terms end up one bit short of a limb which would require
 * much bit-shifting to correct.
 *
 * Finally, the values stored in an felem are in Montgomery form. So the value
 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
 */
typedef u32 limb;
/* NLIMBS is the number of limbs in an felem. */
#define NLIMBS 9
typedef limb felem[NLIMBS];
44
/* Masks for the low 28 and 29 bits of a limb; used to renormalise after
 * carry propagation since limbs are alternately 29 and 28 bits wide. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const felem kZero = {0};
/* kP is the P-256 prime, p, split into 29/28-bit limbs. */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p split into 29/28-bit limbs. Together with kP it lets
 * felem_is_zero_vartime recognise every representation of zero below
 * 2**257. */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
66/* kPrecomputed contains precomputed values to aid the calculation of scalar
67 * multiples of the base point, G. It's actually two, equal length, tables
68 * concatenated.
69 *
70 * The first table contains (x,y) felem pairs for 16 multiples of the base
71 * point, G.
72 *
73 * Index | Index (binary) | Value
74 * 0 | 0000 | 0G (all zeros, omitted)
75 * 1 | 0001 | G
76 * 2 | 0010 | 2**64G
77 * 3 | 0011 | 2**64G + G
78 * 4 | 0100 | 2**128G
79 * 5 | 0101 | 2**128G + G
80 * 6 | 0110 | 2**128G + 2**64G
81 * 7 | 0111 | 2**128G + 2**64G + G
82 * 8 | 1000 | 2**192G
83 * 9 | 1001 | 2**192G + G
84 * 10 | 1010 | 2**192G + 2**64G
85 * 11 | 1011 | 2**192G + 2**64G + G
86 * 12 | 1100 | 2**192G + 2**128G
87 * 13 | 1101 | 2**192G + 2**128G + G
88 * 14 | 1110 | 2**192G + 2**128G + 2**64G
89 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
90 *
91 * The second table follows the same style, but the terms are 2**32G,
92 * 2**96G, 2**160G, 2**224G.
93 *
94 * This is ~2KB of data. */
static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
    /* First table: (x, y) affine pairs for the 15 non-zero combinations of
     * {G, 2**64 G, 2**128 G, 2**192 G}, each point stored as 9+9 limbs. */
    0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
    0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
    0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
    0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
    0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
    0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
    0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
    0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
    0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
    0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
    0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
    0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
    0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
    0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
    0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
    0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
    0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
    0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
    0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
    0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
    0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
    0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
    0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
    0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
    0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
    /* Second table: same layout for the 15 non-zero combinations of
     * {2**32 G, 2**96 G, 2**160 G, 2**224 G}. */
    0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
    0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
    0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
    0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
    0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
    0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
    0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
    0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
    0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
    0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
    0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
    0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
    0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
    0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
    0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
    0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
    0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
    0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
    0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
    0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
    0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
    0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
    0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
    0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
    0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
    0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
    0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
    0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
    0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
    0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
};
157
158
159/* Field element operations: */
160
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * This is a branch-free mask generator: conditional additions throughout
 * this file are expressed as "value & mask" so that execution time does not
 * depend on secret data.
 *
 * x must be a u32 or an equivalent type such as limb. */
#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
167
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * The limb adjustments below are carry*2**257 mod p spread across the
 * 29/28-bit limb positions; carry_mask makes the conditional parts
 * branch-free so the function runs in constant time.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry) {
  const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

  inout[0] += carry << 1;
  inout[3] += 0x10000000 & carry_mask;
  /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   * previous line therefore this doesn't underflow. */
  inout[3] -= carry << 11;
  inout[4] += (0x20000000 - 1) & carry_mask;
  inout[5] += (0x10000000 - 1) & carry_mask;
  inout[6] += (0x20000000 - 1) & carry_mask;
  inout[6] -= carry << 22;
  /* This may underflow if carry is non-zero but, if so, we'll fix it in the
   * next line. */
  inout[7] -= 1 & carry_mask;
  inout[7] += carry << 25;
}
190
/* felem_sum sets out = in+in2.
 *
 * The loop body is unrolled two limbs at a time because even-indexed limbs
 * are 29 bits wide and odd-indexed limbs are 28 bits wide, so the shift and
 * mask used for carry propagation alternate.
 *
 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_sum(felem out, const felem in, const felem in2) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] = in[i] + in2[i];
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] = in[i] + in2[i];
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
217
218#define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
219#define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
220#define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
221#define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
222#define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
223#define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
224
225/* zero31 is 0 mod p. */
226static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
227
/* felem_diff sets out = in-in2.
 *
 * zero31 (a representation of 0 mod p) is added to each limb so the
 * subtraction cannot underflow; the carry chain then renormalises the
 * alternating 29/28-bit limbs, two per loop iteration.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_diff(felem out, const felem in, const felem in2) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] = in[i] - in2[i];
    out[i] += zero31[i];
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] = in[i] - in2[i];
    out[i] += zero31[i];
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
257
/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
 * with the same 29,28,... bit positions as an felem.
 *
 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
 * Since we just multiplied two Montgomery values together, the result is
 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
 * in Montgomery form.
 *
 * On entry: tmp[i] < 2**64
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_reduce_degree(felem out, u64 tmp[17]) {
  /* The following table may be helpful when reading this code:
   *
   * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
  limb tmp2[18], carry, x, xMask;
  unsigned i;

  /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
   * felem. So the top of an element of tmp might overlap with another
   * element two positions down. The following loop eliminates this
   * overlap. */
  tmp2[0] = (limb)(tmp[0] & kBottom29Bits);

  /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
   * and hint to the compiler that it can do a single-word shift by selecting
   * the right register rather than doing a double-word shift and truncating
   * afterwards. */
  tmp2[1] = ((limb) tmp[0]) >> 29;
  tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
  tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
  carry = tmp2[1] >> 28;
  tmp2[1] &= kBottom28Bits;

  /* Two limbs per iteration: the even/odd phases use different shift
   * amounts because of the alternating 29/28-bit limb widths. */
  for (i = 2; i < 17; i++) {
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 29;
    tmp2[i] &= kBottom29Bits;

    i++;
    if (i == 17)
      break;
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 28;
    tmp2[i] &= kBottom28Bits;
  }

  /* The topmost word collects everything above bit 285; no masking needed
   * since nothing further overlaps it. */
  tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
  tmp2[17] += ((limb)(tmp[16])) >> 29;
  tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
  tmp2[17] += carry;

  /* Montgomery elimination of terms.
   *
   * Since R is 2**257, we can divide by R with a bitwise shift if we can
   * ensure that the right-most 257 bits are all zero. We can make that true by
   * adding multiplies of p without affecting the value.
   *
   * So we eliminate limbs from right to left. Since the bottom 29 bits of p
   * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   * We can do that for 8 further limbs and then right shift to eliminate the
   * extra factor of R. */
  for (i = 0;; i += 2) {
    tmp2[i + 1] += tmp2[i] >> 29;
    x = tmp2[i] & kBottom29Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i] = 0;

    /* The bounds calculations for this loop are tricky. Each iteration of
     * the loop eliminates two words by adding values to words to their
     * right.
     *
     * The following table contains the amounts added to each word (as an
     * offset from the value of i at the top of the loop). The amounts are
     * accounted for from the first and second half of the loop separately
     * and are written as, for example, 28 to mean a value <2**28.
     *
     * Word:                   3   4   5   6   7   8   9   10
     * Added in top half:      28  11  29  21  29  28
     *                                 28  29
     *                                     29
     * Added in bottom half:       29  10  28  21  28   28
     *                                     29
     *
     * The value that is currently offset 7 will be offset 5 for the next
     * iteration and then offset 3 for the iteration after that. Therefore
     * the total value added will be the values added at 7, 5 and 3.
     *
     * The following table accumulates these values. The sums at the bottom
     * are written as, for example, 29+28, to mean a value < 2**29+2**28.
     *
     * Word:  3   4   5   6   7   8   9  10  11  12  13
     *       28  11  10  29  21  29  28  28  28  28  28
     *       29  28  11  28  29  28  29  28  29  28
     *           29  28  21  21  29  21  29  21
     *           10  29  28  21  28  21  28
     *           28  29  28  29  28  29  28
     *           11  10  29  10  29  10
     *           29  28  11  28  11
     *               29      29
     *        --------------------------------------------
     *          30+ 31+ 30+ 31+ 30+
     *          28+ 29+ 28+ 29+ 21+
     *          21+ 28+ 21+ 28+ 10
     *          10  21+ 10  21+
     *              11      11
     *
     * So the greatest amount is added to tmp2[10] and tmp2[12]. If
     * tmp2[10/12] has an initial value of <2**29, then the maximum value
     * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
     * as required. */
    tmp2[i + 3] += (x << 10) & kBottom28Bits;
    tmp2[i + 4] += (x >> 18);

    tmp2[i + 6] += (x << 21) & kBottom29Bits;
    tmp2[i + 7] += x >> 8;

    /* At position 200, which is the starting bit position for word 7, we
     * have a factor of 0xf000000 = 2**28 - 2**24. */
    tmp2[i + 7] += 0x10000000 & xMask;
    /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
    tmp2[i + 8] += (x - 1) & xMask;
    tmp2[i + 7] -= (x << 24) & kBottom28Bits;
    tmp2[i + 8] -= x >> 4;

    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 8] -= x;
    tmp2[i + 8] += (x << 28) & kBottom29Bits;
    tmp2[i + 9] += ((x >> 1) - 1) & xMask;

    if (i+1 == NLIMBS)
      break;
    tmp2[i + 2] += tmp2[i + 1] >> 28;
    x = tmp2[i + 1] & kBottom28Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i + 1] = 0;

    tmp2[i + 4] += (x << 11) & kBottom29Bits;
    tmp2[i + 5] += (x >> 18);

    tmp2[i + 7] += (x << 21) & kBottom28Bits;
    tmp2[i + 8] += x >> 7;

    /* At position 199, which is the starting bit of the 8th word when
     * dealing with a context starting on an odd word, we have a factor of
     * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
     * word from i+1 is i+8. */
    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 9] += (x - 1) & xMask;
    tmp2[i + 8] -= (x << 25) & kBottom29Bits;
    tmp2[i + 9] -= x >> 4;

    tmp2[i + 9] += 0x10000000 & xMask;
    tmp2[i + 9] -= x;
    tmp2[i + 10] += (x - 1) & xMask;
  }

  /* We merge the right shift with a carry chain. The words above 2**257 have
   * widths of 28,29,... which we need to correct when copying them down. */
  carry = 0;
  for (i = 0; i < 8; i++) {
    /* The maximum value of tmp2[i + 9] occurs on the first iteration and
     * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
     * therefore safe. */
    out[i] = tmp2[i + 9];
    out[i] += carry;
    out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    out[i] = tmp2[i + 9] >> 1;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  out[8] = tmp2[17];
  out[8] += carry;
  carry = out[8] >> 29;
  out[8] &= kBottom29Bits;

  felem_reduce_carry(out, carry);
}
452
/* felem_square sets out=in*in.
 *
 * The products are the schoolbook expansion of the 9-limb square; symmetric
 * cross terms are folded together with shifts of 1 or 2 (a shift of 2
 * appears where an odd-width limb pairing needs an extra doubling to keep
 * the terms aligned to the 29/28-bit limb grid). The 64-bit partial sums
 * are then reduced back to nine limbs by felem_reduce_degree.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in) {
  u64 tmp[17];

  tmp[0] = ((u64) in[0]) * in[0];
  tmp[1] = ((u64) in[0]) * (in[1] << 1);
  tmp[2] = ((u64) in[0]) * (in[2] << 1) +
           ((u64) in[1]) * (in[1] << 1);
  tmp[3] = ((u64) in[0]) * (in[3] << 1) +
           ((u64) in[1]) * (in[2] << 1);
  tmp[4] = ((u64) in[0]) * (in[4] << 1) +
           ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
  tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
           (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
  tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
           (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
           ((u64) in[3]) * (in[3] << 1);
  tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
           (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
           ((u64) in[3]) * (in[4] << 1);
  /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   * which is < 2**64 as required. */
  tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
           (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
           ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
  tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
           (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
           ((u64) in[4]) * (in[5] << 1);
  tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
            (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
            ((u64) in[5]) * (in[5] << 1);
  tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
            (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
  tmp[12] = ((u64) in[4]) * (in[8] << 1) +
            ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
  tmp[13] = ((u64) in[5]) * (in[8] << 1) +
            ((u64) in[6]) * (in[7] << 1);
  tmp[14] = ((u64) in[6]) * (in[8] << 1) +
            ((u64) in[7]) * (in[7] << 1);
  tmp[15] = ((u64) in[7]) * (in[8] << 1);
  tmp[16] = ((u64) in[8]) * in[8];

  felem_reduce_degree(out, tmp);
}
500
/* felem_mul sets out=in*in2.
 *
 * Schoolbook 9x9-limb multiplication. The "<< 0" / "<< 1" shifts align each
 * cross product to the alternating 29/28-bit limb grid (a product of two
 * 28-bit-position limbs lands one bit low and needs doubling); the 64-bit
 * partial sums are then reduced by felem_reduce_degree.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_mul(felem out, const felem in, const felem in2) {
  u64 tmp[17];

  tmp[0] = ((u64) in[0]) * in2[0];
  tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
           ((u64) in[1]) * (in2[0] << 0);
  tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
           (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
  tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
           (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
           ((u64) in[3]) * (in2[0] << 0);
  tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
           (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
           ((u64) in[3]) * (in2[1] << 1) +
           ((u64) in[4]) * (in2[0] << 0);
  tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
           (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
           ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
           (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
  tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
           (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
           ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
           (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
           ((u64) in[6]) * (in2[0] << 0);
  tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
           (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
           ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
           (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
           ((u64) in[6]) * (in2[1] << 0) +
           ((u64) in[7]) * (in2[0] << 0);
  /* tmp[8] has the greatest value but doesn't overflow. See logic in
   * felem_square. */
  tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
           (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
           ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
           (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
           ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
           (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
  tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
           (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
           ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
           (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
           ((u64) in[7]) * (in2[2] << 0) +
           ((u64) in[8]) * (in2[1] << 0);
  tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
            (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
            ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
            (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
            ((u64) in[8]) * (in2[2] << 0);
  tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
            (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
            ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
            (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
  tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
            (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
            ((u64) in[7]) * (in2[5] << 1) +
            ((u64) in[8]) * (in2[4] << 0);
  tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
            (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
            ((u64) in[8]) * (in2[5] << 0);
  tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
            (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
  tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
            ((u64) in[8]) * (in2[7] << 0);
  tmp[16] = ((u64) in[8]) * (in2[8] << 0);

  felem_reduce_degree(out, tmp);
}
574
575static void felem_assign(felem out, const felem in) {
576 memcpy(out, in, sizeof(felem));
577}
578
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 is evaluated with a fixed addition chain (constant
 * sequence of squarings and multiplications, independent of the input), so
 * the inversion runs in constant time. The running comments track the
 * exponent accumulated so far as sums/differences of powers of two. */
static void felem_inv(felem out, const felem in) {
  felem ftmp, ftmp2;
  /* each e_I will hold |in|^{2^I - 1} */
  felem e2, e4, e8, e16, e32, e64;
  unsigned i;

  felem_square(ftmp, in); /* 2^1 */
  felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
  felem_assign(e2, ftmp);
  felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
  felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
  felem_assign(e4, ftmp);
  felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
  felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
  felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
  felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
  felem_assign(e8, ftmp);
  for (i = 0; i < 8; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^16 - 2^8 */
  felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
  felem_assign(e16, ftmp);
  for (i = 0; i < 16; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^32 - 2^16 */
  felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
  felem_assign(e32, ftmp);
  for (i = 0; i < 32; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^64 - 2^32 */
  felem_assign(e64, ftmp);
  felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
  for (i = 0; i < 192; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^256 - 2^224 + 2^192 */

  felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
  for (i = 0; i < 16; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^80 - 2^16 */
  felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
  for (i = 0; i < 8; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^88 - 2^8 */
  felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
  for (i = 0; i < 4; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^92 - 2^4 */
  felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
  felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
  felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

  felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
646
/* felem_scalar_3 sets out=3*out.
 *
 * Multiplies each limb by 3 in place and propagates carries across the
 * alternating 29/28-bit limbs, two per loop iteration.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_3(felem out) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
673
/* felem_scalar_4 sets out=4*out.
 *
 * A left shift by 2 per limb; next_carry captures the bits shifted out of
 * the top of each limb before the shift so they can be fed into the next
 * limb along with the incoming carry.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_4(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  for (i = 0;; i++) {
    next_carry = out[i] >> 27;
    out[i] <<= 2;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    next_carry = out[i] >> 26;
    out[i] <<= 2;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
704
/* felem_scalar_8 sets out=8*out.
 *
 * Same structure as felem_scalar_4 but shifting by 3, so next_carry takes
 * the top 3 bits of each 29-bit limb (>> 26) or 28-bit limb (>> 25).
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_8(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  for (i = 0;; i++) {
    next_carry = out[i] >> 26;
    out[i] <<= 3;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    next_carry = out[i] >> 25;
    out[i] <<= 3;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
735
/* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
 * time depending on the value of |in|, so it must only be used on values that
 * are not secret. */
static char felem_is_zero_vartime(const felem in) {
  limb carry;
  int i;
  limb tmp[NLIMBS];

  felem_assign(tmp, in);

  /* First, reduce tmp to a minimal form: repeat the carry chain until no
   * carry is produced at the top. */
  do {
    carry = 0;
    for (i = 0;; i++) {
      tmp[i] += carry;
      carry = tmp[i] >> 29;
      tmp[i] &= kBottom29Bits;

      i++;
      if (i == NLIMBS)
        break;

      tmp[i] += carry;
      carry = tmp[i] >> 28;
      tmp[i] &= kBottom28Bits;
    }

    felem_reduce_carry(tmp, carry);
  } while (carry);

  /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
  return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
         memcmp(tmp, kP, sizeof(tmp)) == 0 ||
         memcmp(tmp, k2P, sizeof(tmp)) == 0;
}
770
771/* Group operations:
772 *
773 * Elements of the elliptic curve group are represented in Jacobian
774 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
775 * Jacobian form. */
776
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Implements the dbl-2009-l formulas: delta = Z**2, gamma = Y**2,
 * beta = X*gamma, alpha = 3*(X-delta)*(X+delta), then
 *   X' = alpha**2 - 8*beta
 *   Z' = (Y+Z)**2 - gamma - delta
 *   Y' = alpha*(4*beta - X') - 8*gamma**2
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */
static void point_double(felem x_out, felem y_out, felem z_out, const felem x,
                         const felem y, const felem z) {
  felem delta, gamma, alpha, beta, tmp, tmp2;

  felem_square(delta, z);
  felem_square(gamma, y);
  felem_mul(beta, x, gamma);

  felem_sum(tmp, x, delta);
  felem_diff(tmp2, x, delta);
  felem_mul(alpha, tmp, tmp2);
  felem_scalar_3(alpha);

  felem_sum(tmp, y, z);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, gamma);
  felem_diff(z_out, tmp, delta);

  felem_scalar_4(beta);
  felem_square(x_out, alpha);
  felem_diff(x_out, x_out, beta);
  felem_diff(x_out, x_out, beta);

  felem_diff(tmp, beta, x_out);
  felem_mul(tmp, alpha, tmp);
  felem_square(tmp2, gamma);
  felem_scalar_8(tmp2);
  felem_diff(y_out, tmp, tmp2);
}
809
810/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
811 * (i.e. the second point is affine.)
812 *
813 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
814 *
815 * Note that this function does not handle P+P, infinity+P nor P+infinity
816 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2) {
  felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

  /* z1z1 = z1^2; tmp = 2*z1 (used below for z_out = 2*z1*h).
   * Since z2 == 1 here, u1 = x1 and s1 = y1 (add-2007-bl notation). */
  felem_square(z1z1, z1);
  felem_sum(tmp, z1, z1);

  /* u2 = x2*z1^2, s2 = y2*z1^3. */
  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1, i = (2*h)^2, j = h*i, r = 2*(s2 - s1), v = u1*i. */
  felem_diff(h, u2, x1);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  felem_diff(r, s2, y1);
  felem_sum(r, r, r);
  felem_mul(v, x1, i);

  /* z_out = 2*z1*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, y1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
848
849/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
850 *
851 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
852 *
853 * Note that this function does not handle P+P, infinity+P nor P+infinity
854 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out, const felem x1,
                      const felem y1, const felem z1, const felem x2,
                      const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

  /* z1z1 = z1^2, z2z2 = z2^2, u1 = x1*z2^2 (add-2007-bl notation). */
  felem_square(z1z1, z1);
  felem_square(z2z2, z2);
  felem_mul(u1, x1, z2z2);

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2 (used for z_out below). */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  /* s1 = y1*z2^3. */
  felem_mul(z2z2z2, z2, z2z2);
  felem_mul(s1, y1, z2z2z2);

  /* u2 = x2*z1^2, s2 = y2*z1^3. */
  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1, i = (2*h)^2, j = h*i, r = 2*(s2 - s1), v = u1*i. */
  felem_diff(h, u2, u1);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  felem_diff(r, s2, s1);
  felem_sum(r, r, r);
  felem_mul(v, u1, i);

  /* z_out = 2*z1*z2*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
895
896/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
897 * {x2,y2,z2}.
898 *
899 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
900 *
901 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out, const felem x1, const felem y1,
    const felem z1, const felem x2, const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
  char x_equal, y_equal;

  /* Same add-2007-bl schedule as point_add, but with variable-time checks
   * for the P+P case (safe here: only used on public values). */
  felem_square(z1z1, z1);
  felem_square(z2z2, z2);
  felem_mul(u1, x1, z2z2);

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2. */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2);
  felem_mul(s1, y1, z2z2z2);

  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1 is zero iff the affine x coordinates match. */
  felem_diff(h, u2, u1);
  x_equal = felem_is_zero_vartime(h);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  /* r/2 = s2 - s1 is zero iff the affine y coordinates match. */
  felem_diff(r, s2, s1);
  y_equal = felem_is_zero_vartime(r);
  /* Same point: the addition formula degenerates, so double instead. */
  if (x_equal && y_equal) {
    point_double(x_out, y_out, z_out, x1, y1, z1);
    return;
  }
  felem_sum(r, r, r);
  felem_mul(v, u1, i);

  /* z_out = 2*z1*z2*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
949
950/* copy_conditional sets out=in if mask = 0xffffffff in constant time.
951 *
952 * On entry: mask is either 0 or 0xffffffff. */
953static void copy_conditional(felem out, const felem in, limb mask) {
954 int i;
955
956 for (i = 0; i < NLIMBS; i++) {
957 const limb tmp = mask & (in[i] ^ out[i]);
958 out[i] ^= tmp;
959 }
960}
961
962/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
963 * On entry: index < 16, table[0] must be zero. */
964static void select_affine_point(felem out_x, felem out_y, const limb* table,
965 limb index) {
966 limb i, j;
967
968 memset(out_x, 0, sizeof(felem));
969 memset(out_y, 0, sizeof(felem));
970
971 for (i = 1; i < 16; i++) {
972 limb mask = i ^ index;
973 mask |= mask >> 2;
974 mask |= mask >> 1;
975 mask &= 1;
976 mask--;
977 for (j = 0; j < NLIMBS; j++, table++) {
978 out_x[j] |= *table & mask;
979 }
980 for (j = 0; j < NLIMBS; j++, table++) {
981 out_y[j] |= *table & mask;
982 }
983 }
984}
985
986/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
987 * table. On entry: index < 16, table[0] must be zero. */
988static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
989 const limb* table, limb index) {
990 limb i, j;
991
992 memset(out_x, 0, sizeof(felem));
993 memset(out_y, 0, sizeof(felem));
994 memset(out_z, 0, sizeof(felem));
995
996 /* The implicit value at index 0 is all zero. We don't need to perform that
997 * iteration of the loop because we already set out_* to zero. */
998 table += 3 * NLIMBS;
999
1000 // Hit all entries to obscure cache profiling.
1001 for (i = 1; i < 16; i++) {
1002 limb mask = i ^ index;
1003 mask |= mask >> 2;
1004 mask |= mask >> 1;
1005 mask &= 1;
1006 mask--;
1007 for (j = 0; j < NLIMBS; j++, table++) {
1008 out_x[j] |= *table & mask;
1009 }
1010 for (j = 0; j < NLIMBS; j++, table++) {
1011 out_y[j] |= *table & mask;
1012 }
1013 for (j = 0; j < NLIMBS; j++, table++) {
1014 out_z[j] |= *table & mask;
1015 }
1016 }
1017}
1018
1019/* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1020 * number. Note that the value of scalar must be less than the order of the
1021 * group. */
static void scalar_base_mult(felem nx, felem ny, felem nz,
                             const p256_int* scalar) {
  int i, j;
  /* n_is_infinity_mask is all-ones while the accumulator {nx,ny,nz} is
   * still the point at infinity (for which point_add_mixed is invalid). */
  limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
  u32 table_offset;

  felem px, py;
  felem tx, ty, tz;

  /* Start the accumulator at the point at infinity (all-zero felems). */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));

  /* The loop adds bits at positions 0, 64, 128 and 192, followed by
   * positions 32,96,160 and 224 and does this 32 times. */
  for (i = 0; i < 32; i++) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
    }
    table_offset = 0;
    for (j = 0; j <= 32; j += 32) {
      /* Four scalar bits, 64 positions apart, form a 4-bit index into a
       * precomputed table of combinations of 2^0*G, 2^64*G, 2^128*G and
       * 2^192*G (shifted by j for the second table). */
      char bit0 = p256_get_bit(scalar, 31 - i + j);
      char bit1 = p256_get_bit(scalar, 95 - i + j);
      char bit2 = p256_get_bit(scalar, 159 - i + j);
      char bit3 = p256_get_bit(scalar, 223 - i + j);
      limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);

      /* Constant-time lookup: scans the whole table regardless of index. */
      select_affine_point(px, py, kPrecomputed + table_offset, index);
      table_offset += 30 * NLIMBS;

      /* Since scalar is less than the order of the group, we know that
       * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
       * below. */
      point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
      /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
       * (a.k.a. the point at infinity). We handle that situation by
       * copying the point from the table. */
      copy_conditional(nx, px, n_is_infinity_mask);
      copy_conditional(ny, py, n_is_infinity_mask);
      copy_conditional(nz, kOne, n_is_infinity_mask);

      /* Equally, the result is also wrong if the point from the table is
       * zero, which happens when the index is zero. We handle that by
       * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
      p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
      mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
      copy_conditional(nx, tx, mask);
      copy_conditional(ny, ty, mask);
      copy_conditional(nz, tz, mask);
      /* If p was not zero, then n is now non-zero. */
      n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
  }
}
1076
1077/* point_to_affine converts a Jacobian point to an affine point. If the input
1078 * is the point at infinity then it returns (0, 0) in constant time. */
1079static void point_to_affine(felem x_out, felem y_out, const felem nx,
1080 const felem ny, const felem nz) {
1081 felem z_inv, z_inv_sq;
1082 felem_inv(z_inv, nz);
1083 felem_square(z_inv_sq, z_inv);
1084 felem_mul(x_out, nx, z_inv_sq);
1085 felem_mul(z_inv, z_inv, z_inv_sq);
1086 felem_mul(y_out, ny, z_inv);
1087}
1088
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
static void scalar_mult(felem nx, felem ny, felem nz, const felem x,
                        const felem y, const p256_int* scalar) {
  int i;
  felem px, py, pz, tx, ty, tz;
  felem precomp[16][3];
  limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

  /* We precompute 0,1,2,... times {x,y}. */
  /* precomp[0] is the point at infinity (all zero); precomp[k] = k*{x,y}
   * in Jacobian coordinates. Only the first entry needs zeroing here. */
  memset(precomp, 0, sizeof(felem) * 3);
  memcpy(&precomp[1][0], x, sizeof(felem));
  memcpy(&precomp[1][1], y, sizeof(felem));
  memcpy(&precomp[1][2], kOne, sizeof(felem));

  /* Fill precomp[2..15]: each even entry is a doubling, each odd entry
   * adds the (affine) base point once more. */
  for (i = 2; i < 16; i += 2) {
    point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

    point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                    precomp[i][0], precomp[i][1], precomp[i][2], x, y);
  }

  /* Accumulator starts at the point at infinity. */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));
  n_is_infinity_mask = -1;

  /* We add in a window of four bits each iteration and do this 64 times. */
  for (i = 0; i < 256; i += 4) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
    }

    /* Take the next four scalar bits, most significant first. */
    index = (p256_get_bit(scalar, 255 - i - 0) << 3) |
            (p256_get_bit(scalar, 255 - i - 1) << 2) |
            (p256_get_bit(scalar, 255 - i - 2) << 1) |
            p256_get_bit(scalar, 255 - i - 3);

    /* See the comments in scalar_base_mult about handling infinities. */
    select_jacobian_point(px, py, pz, precomp[0][0], index);
    point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
    copy_conditional(nx, px, n_is_infinity_mask);
    copy_conditional(ny, py, n_is_infinity_mask);
    copy_conditional(nz, pz, n_is_infinity_mask);

    p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
    mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

    copy_conditional(nx, tx, mask);
    copy_conditional(ny, ty, mask);
    copy_conditional(nz, tz, mask);
    n_is_infinity_mask &= ~p_is_noninfinite_mask;
  }
}
1146
/* kR = 2^257 mod p and kRInv = (2^257)^-1 mod p are the constants used to
 * convert p256_ints to and from the Montgomery domain used by felems. */
#define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p

#define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff} // 1 / 2^257 mod p256.p

static const p256_int kR = { kRDigits };
static const p256_int kRInv = { kRInvDigits };
1153
1154/* to_montgomery sets out = R*in. */
static void to_montgomery(felem out, const p256_int* in) {
  p256_int in_shifted;
  int i;

  p256_init(&in_shifted);
  /* in_shifted = in * R mod p, still in p256_int (32-bit digit) form. */
  p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted);

  /* Peel off limbs of alternating 29/28 bits — the felem radix described
   * at the top of this file. */
  for (i = 0; i < NLIMBS; i++) {
    if ((i & 1) == 0) {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
      p256_shr(&in_shifted, 29, &in_shifted);
    } else {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
      p256_shr(&in_shifted, 28, &in_shifted);
    }
  }

  /* Wipe the intermediate: it is a function of the (possibly secret) input. */
  p256_clear(&in_shifted);
}
1174
1175/* from_montgomery sets out=in/R. */
static void from_montgomery(p256_int* out, const felem in) {
  p256_int result, tmp;
  int i, top;

  p256_init(&result);
  p256_init(&tmp);

  /* Reassemble the nine 29/28-bit limbs into a p256_int, starting from the
   * most significant limb and shifting previous limbs up as we go. */
  p256_add_d(&tmp, in[NLIMBS - 1], &result);
  for (i = NLIMBS - 2; i >= 0; i--) {
    /* Even-indexed limbs are 29 bits wide, odd-indexed ones 28 bits. */
    if ((i & 1) == 0) {
      top = p256_shl(&result, 29, &tmp);
    } else {
      top = p256_shl(&result, 28, &tmp);
    }
    /* top accumulates the bit shifted out past 2^256 (values are < 2^257). */
    top |= p256_add_d(&tmp, in[i], &result);
  }

  /* Multiply by R^-1 mod p to leave the Montgomery domain. */
  p256_modmul(&SECP256r1_p, &kRInv, top, &result, out);

  /* Wipe intermediates derived from the (possibly secret) input. */
  p256_clear(&result);
  p256_clear(&tmp);
}
1198
1199/* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1200 * order of the group. */
1201void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) {
1202 felem x, y, z;
1203
1204 scalar_base_mult(x, y, z, n);
1205
1206 {
1207 felem x_affine, y_affine;
1208
1209 point_to_affine(x_affine, y_affine, x, y, z);
1210 from_montgomery(out_x, x_affine);
1211 from_montgomery(out_y, y_affine);
1212 }
1213}
1214
1215/* p256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1216 * the order of the group. */
1217void p256_point_mul(const p256_int* n, const p256_int* in_x,
1218 const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
1219 felem x, y, z, px, py;
1220
1221 to_montgomery(px, in_x);
1222 to_montgomery(py, in_y);
1223
1224 scalar_mult(x, y, z, px, py, n);
1225
1226 point_to_affine(px, py, x, y, z);
1227 from_montgomery(out_x, px);
1228 from_montgomery(out_y, py);
1229}
1230
1231/* p256_points_mul_vartime sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
1232 * n1 and n2 are < the order of the group.
1233 *
1234 * As indicated by the name, this function operates in variable time. This
1235 * is safe because it's used for signature validation which doesn't deal
1236 * with secrets. */
1237void p256_points_mul_vartime(
1238 const p256_int* n1, const p256_int* n2, const p256_int* in_x,
1239 const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
1240 felem x1, y1, z1, x2, y2, z2, px, py;
1241
1242 /* If both scalars are zero, then the result is the point at infinity. */
1243 if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) {
1244 p256_clear(out_x);
1245 p256_clear(out_y);
1246 return;
1247 }
1248
1249 to_montgomery(px, in_x);
1250 to_montgomery(py, in_y);
1251 scalar_base_mult(x1, y1, z1, n1);
1252 scalar_mult(x2, y2, z2, px, py, n2);
1253
1254 if (p256_is_zero(n2) != 0) {
1255 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1256 * {x1,y1,z1}. */
1257 } else if (p256_is_zero(n1) != 0) {
1258 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1259 * {x2,y2,z2}. */
1260 memcpy(x1, x2, sizeof(x2));
1261 memcpy(y1, y2, sizeof(y2));
1262 memcpy(z1, z2, sizeof(z2));
1263 } else {
1264 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1265 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1266 }
1267
1268 point_to_affine(px, py, x1, y1, z1);
1269 from_montgomery(out_x, px);
1270 from_montgomery(out_y, py);
1271}