// Copyright 2016 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14#include <stdint.h>
15
16#include <string.h>
17
18#include "cryptoc/p256.h"
19
/* Short fixed-width integer aliases used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint64_t u64;

/* Our field elements are represented as nine 32-bit limbs.
 *
 * The value of an felem (field element) is:
 *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
 *
 * That is, each limb is alternately 29 or 28-bits wide in little-endian
 * order.
 *
 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
 * when multiplying as terms end up one bit short of a limb which would require
 * much bit-shifting to correct.
 *
 * Finally, the values stored in an felem are in Montgomery form. So the value
 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
 */
typedef u32 limb;
/* NLIMBS is the number of limbs in an felem. */
#define NLIMBS 9
typedef limb felem[NLIMBS];
44
/* Masks for the low 28 and 29 bits of a limb; used to renormalise after
 * carry propagation since limbs are alternately 29 and 28 bits wide. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const felem kZero = {0};
/* kP is the P-256 prime, p, split into 29/28-bit limbs. */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p split into 29/28-bit limbs. Together with kP it lets
 * felem_is_zero_vartime recognise every representation of zero below
 * 2**257. */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
66/* kPrecomputed contains precomputed values to aid the calculation of scalar
67 * multiples of the base point, G. It's actually two, equal length, tables
68 * concatenated.
69 *
70 * The first table contains (x,y) felem pairs for 16 multiples of the base
71 * point, G.
72 *
73 * Index | Index (binary) | Value
74 * 0 | 0000 | 0G (all zeros, omitted)
75 * 1 | 0001 | G
76 * 2 | 0010 | 2**64G
77 * 3 | 0011 | 2**64G + G
78 * 4 | 0100 | 2**128G
79 * 5 | 0101 | 2**128G + G
80 * 6 | 0110 | 2**128G + 2**64G
81 * 7 | 0111 | 2**128G + 2**64G + G
82 * 8 | 1000 | 2**192G
83 * 9 | 1001 | 2**192G + G
84 * 10 | 1010 | 2**192G + 2**64G
85 * 11 | 1011 | 2**192G + 2**64G + G
86 * 12 | 1100 | 2**192G + 2**128G
87 * 13 | 1101 | 2**192G + 2**128G + G
88 * 14 | 1110 | 2**192G + 2**128G + 2**64G
89 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
90 *
91 * The second table follows the same style, but the terms are 2**32G,
92 * 2**96G, 2**160G, 2**224G.
93 *
94 * This is ~2KB of data. */
static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
    /* First table: (x, y) affine pairs for the 15 non-zero combinations of
     * {G, 2**64 G, 2**128 G, 2**192 G}, each point stored as 9+9 limbs. */
    0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
    0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
    0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
    0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
    0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
    0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
    0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
    0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
    0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
    0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
    0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
    0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
    0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
    0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
    0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
    0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
    0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
    0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
    0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
    0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
    0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
    0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
    0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
    0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
    0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
    /* Second table: same layout for the 15 non-zero combinations of
     * {2**32 G, 2**96 G, 2**160 G, 2**224 G}. */
    0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
    0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
    0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
    0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
    0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
    0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
    0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
    0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
    0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
    0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
    0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
    0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
    0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
    0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
    0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
    0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
    0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
    0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
    0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
    0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
    0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
    0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
    0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
    0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
    0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
    0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
    0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
    0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
    0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
    0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
};
157
158
159/* Field element operations: */
160
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * This is a branch-free mask generator: conditional additions throughout
 * this file are expressed as "value & mask" so that execution time does not
 * depend on secret data.
 *
 * x must be a u32 or an equivalent type such as limb. */
#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
167
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * The limb adjustments below are carry*2**257 mod p spread across the
 * 29/28-bit limb positions; carry_mask makes the conditional parts
 * branch-free so the function runs in constant time.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry) {
  const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

  inout[0] += carry << 1;
  inout[3] += 0x10000000 & carry_mask;
  /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   * previous line therefore this doesn't underflow. */
  inout[3] -= carry << 11;
  inout[4] += (0x20000000 - 1) & carry_mask;
  inout[5] += (0x10000000 - 1) & carry_mask;
  inout[6] += (0x20000000 - 1) & carry_mask;
  inout[6] -= carry << 22;
  /* This may underflow if carry is non-zero but, if so, we'll fix it in the
   * next line. */
  inout[7] -= 1 & carry_mask;
  inout[7] += carry << 25;
}
190
/* felem_sum sets out = in+in2.
 *
 * The loop body is unrolled two limbs at a time because even-indexed limbs
 * are 29 bits wide and odd-indexed limbs are 28 bits wide, so the shift and
 * mask used for carry propagation alternate.
 *
 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_sum(felem out, const felem in, const felem in2) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] = in[i] + in2[i];
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] = in[i] + in2[i];
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
217
218#define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
219#define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
220#define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
221#define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
222#define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
223#define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
224
225/* zero31 is 0 mod p. */
226static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
227
/* felem_diff sets out = in-in2.
 *
 * zero31 (a representation of 0 mod p) is added to each limb so the
 * subtraction cannot underflow; the carry chain then renormalises the
 * alternating 29/28-bit limbs, two per loop iteration.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_diff(felem out, const felem in, const felem in2) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] = in[i] - in2[i];
    out[i] += zero31[i];
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] = in[i] - in2[i];
    out[i] += zero31[i];
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
257
/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
 * with the same 29,28,... bit positions as an felem.
 *
 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
 * Since we just multiplied two Montgomery values together, the result is
 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
 * in Montgomery form.
 *
 * On entry: tmp[i] < 2**64
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_reduce_degree(felem out, u64 tmp[17]) {
  /* The following table may be helpful when reading this code:
   *
   * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
  limb tmp2[18], carry, x, xMask;
  unsigned i;

  /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
   * felem. So the top of an element of tmp might overlap with another
   * element two positions down. The following loop eliminates this
   * overlap. */
  tmp2[0] = (limb)(tmp[0] & kBottom29Bits);

  /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
   * and hint to the compiler that it can do a single-word shift by selecting
   * the right register rather than doing a double-word shift and truncating
   * afterwards. */
  tmp2[1] = ((limb) tmp[0]) >> 29;
  tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
  tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
  carry = tmp2[1] >> 28;
  tmp2[1] &= kBottom28Bits;

  /* Two limbs per iteration: the even/odd phases use different shift
   * amounts because of the alternating 29/28-bit limb widths. */
  for (i = 2; i < 17; i++) {
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 29;
    tmp2[i] &= kBottom29Bits;

    i++;
    if (i == 17)
      break;
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 28;
    tmp2[i] &= kBottom28Bits;
  }

  /* The topmost word collects everything above bit 285; no masking needed
   * since nothing further overlaps it. */
  tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
  tmp2[17] += ((limb)(tmp[16])) >> 29;
  tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
  tmp2[17] += carry;

  /* Montgomery elimination of terms.
   *
   * Since R is 2**257, we can divide by R with a bitwise shift if we can
   * ensure that the right-most 257 bits are all zero. We can make that true by
   * adding multiplies of p without affecting the value.
   *
   * So we eliminate limbs from right to left. Since the bottom 29 bits of p
   * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   * We can do that for 8 further limbs and then right shift to eliminate the
   * extra factor of R. */
  for (i = 0;; i += 2) {
    tmp2[i + 1] += tmp2[i] >> 29;
    x = tmp2[i] & kBottom29Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i] = 0;

    /* The bounds calculations for this loop are tricky. Each iteration of
     * the loop eliminates two words by adding values to words to their
     * right.
     *
     * The following table contains the amounts added to each word (as an
     * offset from the value of i at the top of the loop). The amounts are
     * accounted for from the first and second half of the loop separately
     * and are written as, for example, 28 to mean a value <2**28.
     *
     * Word:                   3   4   5   6   7   8   9   10
     * Added in top half:      28  11  29  21  29  28
     *                                 28  29
     *                                     29
     * Added in bottom half:       29  10  28  21  28   28
     *                                     29
     *
     * The value that is currently offset 7 will be offset 5 for the next
     * iteration and then offset 3 for the iteration after that. Therefore
     * the total value added will be the values added at 7, 5 and 3.
     *
     * The following table accumulates these values. The sums at the bottom
     * are written as, for example, 29+28, to mean a value < 2**29+2**28.
     *
     * Word:  3   4   5   6   7   8   9  10  11  12  13
     *       28  11  10  29  21  29  28  28  28  28  28
     *       29  28  11  28  29  28  29  28  29  28
     *           29  28  21  21  29  21  29  21
     *           10  29  28  21  28  21  28
     *           28  29  28  29  28  29  28
     *           11  10  29  10  29  10
     *           29  28  11  28  11
     *               29      29
     *        --------------------------------------------
     *          30+ 31+ 30+ 31+ 30+
     *          28+ 29+ 28+ 29+ 21+
     *          21+ 28+ 21+ 28+ 10
     *          10  21+ 10  21+
     *              11      11
     *
     * So the greatest amount is added to tmp2[10] and tmp2[12]. If
     * tmp2[10/12] has an initial value of <2**29, then the maximum value
     * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
     * as required. */
    tmp2[i + 3] += (x << 10) & kBottom28Bits;
    tmp2[i + 4] += (x >> 18);

    tmp2[i + 6] += (x << 21) & kBottom29Bits;
    tmp2[i + 7] += x >> 8;

    /* At position 200, which is the starting bit position for word 7, we
     * have a factor of 0xf000000 = 2**28 - 2**24. */
    tmp2[i + 7] += 0x10000000 & xMask;
    /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
    tmp2[i + 8] += (x - 1) & xMask;
    tmp2[i + 7] -= (x << 24) & kBottom28Bits;
    tmp2[i + 8] -= x >> 4;

    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 8] -= x;
    tmp2[i + 8] += (x << 28) & kBottom29Bits;
    tmp2[i + 9] += ((x >> 1) - 1) & xMask;

    if (i+1 == NLIMBS)
      break;
    tmp2[i + 2] += tmp2[i + 1] >> 28;
    x = tmp2[i + 1] & kBottom28Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i + 1] = 0;

    tmp2[i + 4] += (x << 11) & kBottom29Bits;
    tmp2[i + 5] += (x >> 18);

    tmp2[i + 7] += (x << 21) & kBottom28Bits;
    tmp2[i + 8] += x >> 7;

    /* At position 199, which is the starting bit of the 8th word when
     * dealing with a context starting on an odd word, we have a factor of
     * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
     * word from i+1 is i+8. */
    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 9] += (x - 1) & xMask;
    tmp2[i + 8] -= (x << 25) & kBottom29Bits;
    tmp2[i + 9] -= x >> 4;

    tmp2[i + 9] += 0x10000000 & xMask;
    tmp2[i + 9] -= x;
    tmp2[i + 10] += (x - 1) & xMask;
  }

  /* We merge the right shift with a carry chain. The words above 2**257 have
   * widths of 28,29,... which we need to correct when copying them down. */
  carry = 0;
  for (i = 0; i < 8; i++) {
    /* The maximum value of tmp2[i + 9] occurs on the first iteration and
     * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
     * therefore safe. */
    out[i] = tmp2[i + 9];
    out[i] += carry;
    out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    out[i] = tmp2[i + 9] >> 1;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  out[8] = tmp2[17];
  out[8] += carry;
  carry = out[8] >> 29;
  out[8] &= kBottom29Bits;

  felem_reduce_carry(out, carry);
}
452
/* felem_square sets out=in*in.
 *
 * The products are the schoolbook expansion of the 9-limb square; symmetric
 * cross terms are folded together with shifts of 1 or 2 (a shift of 2
 * appears where an odd-width limb pairing needs an extra doubling to keep
 * the terms aligned to the 29/28-bit limb grid). The 64-bit partial sums
 * are then reduced back to nine limbs by felem_reduce_degree.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in) {
  u64 tmp[17];

  tmp[0] = ((u64) in[0]) * in[0];
  tmp[1] = ((u64) in[0]) * (in[1] << 1);
  tmp[2] = ((u64) in[0]) * (in[2] << 1) +
           ((u64) in[1]) * (in[1] << 1);
  tmp[3] = ((u64) in[0]) * (in[3] << 1) +
           ((u64) in[1]) * (in[2] << 1);
  tmp[4] = ((u64) in[0]) * (in[4] << 1) +
           ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
  tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
           (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
  tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
           (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
           ((u64) in[3]) * (in[3] << 1);
  tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
           (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
           ((u64) in[3]) * (in[4] << 1);
  /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   * which is < 2**64 as required. */
  tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
           (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
           ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
  tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
           (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
           ((u64) in[4]) * (in[5] << 1);
  tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
            (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
            ((u64) in[5]) * (in[5] << 1);
  tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
            (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
  tmp[12] = ((u64) in[4]) * (in[8] << 1) +
            ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
  tmp[13] = ((u64) in[5]) * (in[8] << 1) +
            ((u64) in[6]) * (in[7] << 1);
  tmp[14] = ((u64) in[6]) * (in[8] << 1) +
            ((u64) in[7]) * (in[7] << 1);
  tmp[15] = ((u64) in[7]) * (in[8] << 1);
  tmp[16] = ((u64) in[8]) * in[8];

  felem_reduce_degree(out, tmp);
}
500
/* felem_mul sets out=in*in2.
 *
 * Schoolbook 9x9-limb multiplication. The "<< 0" / "<< 1" shifts align each
 * cross product to the alternating 29/28-bit limb grid (a product of two
 * 28-bit-position limbs lands one bit low and needs doubling); the 64-bit
 * partial sums are then reduced by felem_reduce_degree.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_mul(felem out, const felem in, const felem in2) {
  u64 tmp[17];

  tmp[0] = ((u64) in[0]) * in2[0];
  tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
           ((u64) in[1]) * (in2[0] << 0);
  tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
           (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
  tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
           (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
           ((u64) in[3]) * (in2[0] << 0);
  tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
           (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
           ((u64) in[3]) * (in2[1] << 1) +
           ((u64) in[4]) * (in2[0] << 0);
  tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
           (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
           ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
           (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
  tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
           (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
           ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
           (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
           ((u64) in[6]) * (in2[0] << 0);
  tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
           (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
           ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
           (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
           ((u64) in[6]) * (in2[1] << 0) +
           ((u64) in[7]) * (in2[0] << 0);
  /* tmp[8] has the greatest value but doesn't overflow. See logic in
   * felem_square. */
  tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
           (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
           ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
           (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
           ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
           (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
  tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
           (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
           ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
           (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
           ((u64) in[7]) * (in2[2] << 0) +
           ((u64) in[8]) * (in2[1] << 0);
  tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
            (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
            ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
            (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
            ((u64) in[8]) * (in2[2] << 0);
  tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
            (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
            ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
            (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
  tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
            (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
            ((u64) in[7]) * (in2[5] << 1) +
            ((u64) in[8]) * (in2[4] << 0);
  tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
            (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
            ((u64) in[8]) * (in2[5] << 0);
  tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
            (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
  tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
            ((u64) in[8]) * (in2[7] << 0);
  tmp[16] = ((u64) in[8]) * (in2[8] << 0);

  felem_reduce_degree(out, tmp);
}
574
575static void felem_assign(felem out, const felem in) {
576 memcpy(out, in, sizeof(felem));
577}
578
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 is evaluated with a fixed addition chain (constant
 * sequence of squarings and multiplications, independent of the input), so
 * the inversion runs in constant time. The running comments track the
 * exponent accumulated so far as sums/differences of powers of two. */
static void felem_inv(felem out, const felem in) {
  felem ftmp, ftmp2;
  /* each e_I will hold |in|^{2^I - 1} */
  felem e2, e4, e8, e16, e32, e64;
  unsigned i;

  felem_square(ftmp, in); /* 2^1 */
  felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
  felem_assign(e2, ftmp);
  felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
  felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
  felem_assign(e4, ftmp);
  felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
  felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
  felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
  felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
  felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
  felem_assign(e8, ftmp);
  for (i = 0; i < 8; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^16 - 2^8 */
  felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
  felem_assign(e16, ftmp);
  for (i = 0; i < 16; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^32 - 2^16 */
  felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
  felem_assign(e32, ftmp);
  for (i = 0; i < 32; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^64 - 2^32 */
  felem_assign(e64, ftmp);
  felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
  for (i = 0; i < 192; i++) {
    felem_square(ftmp, ftmp);
  } /* 2^256 - 2^224 + 2^192 */

  felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
  for (i = 0; i < 16; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^80 - 2^16 */
  felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
  for (i = 0; i < 8; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^88 - 2^8 */
  felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
  for (i = 0; i < 4; i++) {
    felem_square(ftmp2, ftmp2);
  } /* 2^92 - 2^4 */
  felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
  felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
  felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
  felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
  felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

  felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
646
/* felem_scalar_3 sets out=3*out.
 *
 * Multiplies each limb by 3 in place and propagates carries across the
 * alternating 29/28-bit limbs, two per loop iteration.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_3(felem out) {
  limb carry = 0;
  unsigned i;

  for (i = 0;; i++) {
    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
673
/* felem_scalar_4 sets out=4*out.
 *
 * A left shift by 2 per limb; next_carry captures the bits shifted out of
 * the top of each limb before the shift so they can be fed into the next
 * limb along with the incoming carry.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_4(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  for (i = 0;; i++) {
    next_carry = out[i] >> 27;
    out[i] <<= 2;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    next_carry = out[i] >> 26;
    out[i] <<= 2;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
704
/* felem_scalar_8 sets out=8*out.
 *
 * Same structure as felem_scalar_4 but shifting by 3, so next_carry takes
 * the top 3 bits of each 29-bit limb (>> 26) or 28-bit limb (>> 25).
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_8(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  for (i = 0;; i++) {
    next_carry = out[i] >> 26;
    out[i] <<= 3;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    next_carry = out[i] >> 25;
    out[i] <<= 3;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  felem_reduce_carry(out, carry);
}
735
/* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
 * time depending on the value of |in|, so it must only be used on values that
 * are not secret. */
static char felem_is_zero_vartime(const felem in) {
  limb carry;
  int i;
  limb tmp[NLIMBS];

  felem_assign(tmp, in);

  /* First, reduce tmp to a minimal form: repeat the carry chain until no
   * carry is produced at the top. */
  do {
    carry = 0;
    for (i = 0;; i++) {
      tmp[i] += carry;
      carry = tmp[i] >> 29;
      tmp[i] &= kBottom29Bits;

      i++;
      if (i == NLIMBS)
        break;

      tmp[i] += carry;
      carry = tmp[i] >> 28;
      tmp[i] &= kBottom28Bits;
    }

    felem_reduce_carry(tmp, carry);
  } while (carry);

  /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
  return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
         memcmp(tmp, kP, sizeof(tmp)) == 0 ||
         memcmp(tmp, k2P, sizeof(tmp)) == 0;
}
770
771/* Group operations:
772 *
773 * Elements of the elliptic curve group are represented in Jacobian
774 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
775 * Jacobian form. */
776
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Implements the dbl-2009-l formulas: delta = Z**2, gamma = Y**2,
 * beta = X*gamma, alpha = 3*(X-delta)*(X+delta), then
 *   X' = alpha**2 - 8*beta
 *   Z' = (Y+Z)**2 - gamma - delta
 *   Y' = alpha*(4*beta - X') - 8*gamma**2
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */
static void point_double(felem x_out, felem y_out, felem z_out, const felem x,
                         const felem y, const felem z) {
  felem delta, gamma, alpha, beta, tmp, tmp2;

  felem_square(delta, z);
  felem_square(gamma, y);
  felem_mul(beta, x, gamma);

  felem_sum(tmp, x, delta);
  felem_diff(tmp2, x, delta);
  felem_mul(alpha, tmp, tmp2);
  felem_scalar_3(alpha);

  felem_sum(tmp, y, z);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, gamma);
  felem_diff(z_out, tmp, delta);

  felem_scalar_4(beta);
  felem_square(x_out, alpha);
  felem_diff(x_out, x_out, beta);
  felem_diff(x_out, x_out, beta);

  felem_diff(tmp, beta, x_out);
  felem_mul(tmp, alpha, tmp);
  felem_square(tmp2, gamma);
  felem_scalar_8(tmp2);
  felem_diff(y_out, tmp, tmp2);
}
809
810/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
811 * (i.e. the second point is affine.)
812 *
813 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
814 *
815 * Note that this function does not handle P+P, infinity+P nor P+infinity
816 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2) {
  felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

  /* z1z1 = z1^2; tmp = 2*z1 (used below for z_out = 2*z1*h).
   * Since z2 == 1 here, u1 = x1 and s1 = y1 (add-2007-bl notation). */
  felem_square(z1z1, z1);
  felem_sum(tmp, z1, z1);

  /* u2 = x2*z1^2, s2 = y2*z1^3. */
  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1, i = (2*h)^2, j = h*i, r = 2*(s2 - s1), v = u1*i. */
  felem_diff(h, u2, x1);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  felem_diff(r, s2, y1);
  felem_sum(r, r, r);
  felem_mul(v, x1, i);

  /* z_out = 2*z1*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, y1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
848
849/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
850 *
851 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
852 *
853 * Note that this function does not handle P+P, infinity+P nor P+infinity
854 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out, const felem x1,
                      const felem y1, const felem z1, const felem x2,
                      const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

  /* z1z1 = z1^2, z2z2 = z2^2, u1 = x1*z2^2 (add-2007-bl notation). */
  felem_square(z1z1, z1);
  felem_square(z2z2, z2);
  felem_mul(u1, x1, z2z2);

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2 (used for z_out below). */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  /* s1 = y1*z2^3. */
  felem_mul(z2z2z2, z2, z2z2);
  felem_mul(s1, y1, z2z2z2);

  /* u2 = x2*z1^2, s2 = y2*z1^3. */
  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1, i = (2*h)^2, j = h*i, r = 2*(s2 - s1), v = u1*i. */
  felem_diff(h, u2, u1);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  felem_diff(r, s2, s1);
  felem_sum(r, r, r);
  felem_mul(v, u1, i);

  /* z_out = 2*z1*z2*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
895
896/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
897 * {x2,y2,z2}.
898 *
899 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
900 *
901 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out, const felem x1, const felem y1,
    const felem z1, const felem x2, const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
  char x_equal, y_equal;

  /* Same add-2007-bl schedule as point_add, but with variable-time checks
   * for the P+P case (safe here: only used on public values). */
  felem_square(z1z1, z1);
  felem_square(z2z2, z2);
  felem_mul(u1, x1, z2z2);

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 = 2*z1*z2. */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2);
  felem_mul(s1, y1, z2z2z2);

  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  /* h = u2 - u1 is zero iff the affine x coordinates match. */
  felem_diff(h, u2, u1);
  x_equal = felem_is_zero_vartime(h);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  /* r/2 = s2 - s1 is zero iff the affine y coordinates match. */
  felem_diff(r, s2, s1);
  y_equal = felem_is_zero_vartime(r);
  /* Same point: the addition formula degenerates, so double instead. */
  if (x_equal && y_equal) {
    point_double(x_out, y_out, z_out, x1, y1, z1);
    return;
  }
  felem_sum(r, r, r);
  felem_mul(v, u1, i);

  /* z_out = 2*z1*z2*h; x_out = r^2 - j - 2*v. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r*(v - x_out) - 2*s1*j. */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
949
950/* copy_conditional sets out=in if mask = 0xffffffff in constant time.
951 *
952 * On entry: mask is either 0 or 0xffffffff. */
953static void copy_conditional(felem out, const felem in, limb mask) {
954 int i;
955
956 for (i = 0; i < NLIMBS; i++) {
957 const limb tmp = mask & (in[i] ^ out[i]);
958 out[i] ^= tmp;
959 }
960}
961
962/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
963 * On entry: index < 16, table[0] must be zero. */
964static void select_affine_point(felem out_x, felem out_y, const limb* table,
965 limb index) {
966 limb i, j;
967
968 memset(out_x, 0, sizeof(felem));
969 memset(out_y, 0, sizeof(felem));
970
971 for (i = 1; i < 16; i++) {
972 limb mask = i ^ index;
973 mask |= mask >> 2;
974 mask |= mask >> 1;
975 mask &= 1;
976 mask--;
977 for (j = 0; j < NLIMBS; j++, table++) {
978 out_x[j] |= *table & mask;
979 }
980 for (j = 0; j < NLIMBS; j++, table++) {
981 out_y[j] |= *table & mask;
982 }
983 }
984}
985
986/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
987 * table. On entry: index < 16, table[0] must be zero. */
988static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
989 const limb* table, limb index) {
990 limb i, j;
991
992 memset(out_x, 0, sizeof(felem));
993 memset(out_y, 0, sizeof(felem));
994 memset(out_z, 0, sizeof(felem));
995
996 /* The implicit value at index 0 is all zero. We don't need to perform that
997 * iteration of the loop because we already set out_* to zero. */
998 table += 3 * NLIMBS;
999
1000 // Hit all entries to obscure cache profiling.
1001 for (i = 1; i < 16; i++) {
1002 limb mask = i ^ index;
1003 mask |= mask >> 2;
1004 mask |= mask >> 1;
1005 mask &= 1;
1006 mask--;
1007 for (j = 0; j < NLIMBS; j++, table++) {
1008 out_x[j] |= *table & mask;
1009 }
1010 for (j = 0; j < NLIMBS; j++, table++) {
1011 out_y[j] |= *table & mask;
1012 }
1013 for (j = 0; j < NLIMBS; j++, table++) {
1014 out_z[j] |= *table & mask;
1015 }
1016 }
1017}
1018
1019/* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1020 * number. Note that the value of scalar must be less than the order of the
1021 * group. */
static void scalar_base_mult(felem nx, felem ny, felem nz,
                             const p256_int* scalar) {
  int i, j;
  /* n_is_infinity_mask is all-ones while the accumulator {nx,ny,nz} is
   * still the point at infinity (for which point_add_mixed is invalid). */
  limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
  u32 table_offset;

  felem px, py;
  felem tx, ty, tz;

  /* Start the accumulator at the point at infinity (all-zero felems). */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));

  /* The loop adds bits at positions 0, 64, 128 and 192, followed by
   * positions 32,96,160 and 224 and does this 32 times. */
  for (i = 0; i < 32; i++) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
    }
    table_offset = 0;
    for (j = 0; j <= 32; j += 32) {
      /* Four scalar bits, 64 positions apart, form a 4-bit index into a
       * precomputed table of combinations of 2^0*G, 2^64*G, 2^128*G and
       * 2^192*G (shifted by j for the second table). */
      char bit0 = p256_get_bit(scalar, 31 - i + j);
      char bit1 = p256_get_bit(scalar, 95 - i + j);
      char bit2 = p256_get_bit(scalar, 159 - i + j);
      char bit3 = p256_get_bit(scalar, 223 - i + j);
      limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);

      /* Constant-time lookup: scans the whole table regardless of index. */
      select_affine_point(px, py, kPrecomputed + table_offset, index);
      table_offset += 30 * NLIMBS;

      /* Since scalar is less than the order of the group, we know that
       * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
       * below. */
      point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
      /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
       * (a.k.a. the point at infinity). We handle that situation by
       * copying the point from the table. */
      copy_conditional(nx, px, n_is_infinity_mask);
      copy_conditional(ny, py, n_is_infinity_mask);
      copy_conditional(nz, kOne, n_is_infinity_mask);

      /* Equally, the result is also wrong if the point from the table is
       * zero, which happens when the index is zero. We handle that by
       * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
      p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
      mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
      copy_conditional(nx, tx, mask);
      copy_conditional(ny, ty, mask);
      copy_conditional(nz, tz, mask);
      /* If p was not zero, then n is now non-zero. */
      n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
  }
}
1076
1077/* point_to_affine converts a Jacobian point to an affine point. If the input
1078 * is the point at infinity then it returns (0, 0) in constant time. */
1079static void point_to_affine(felem x_out, felem y_out, const felem nx,
1080 const felem ny, const felem nz) {
1081 felem z_inv, z_inv_sq;
1082 felem_inv(z_inv, nz);
1083 felem_square(z_inv_sq, z_inv);
1084 felem_mul(x_out, nx, z_inv_sq);
1085 felem_mul(z_inv, z_inv, z_inv_sq);
1086 felem_mul(y_out, ny, z_inv);
1087}
1088
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
static void scalar_mult(felem nx, felem ny, felem nz, const felem x,
                        const felem y, const p256_int* scalar) {
  int i;
  felem px, py, pz, tx, ty, tz;
  felem precomp[16][3];
  limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

  /* We precompute 0,1,2,... times {x,y}. */
  /* precomp[0] is the point at infinity (all zero); precomp[k] = k*{x,y}
   * in Jacobian coordinates. Only the first entry needs zeroing here. */
  memset(precomp, 0, sizeof(felem) * 3);
  memcpy(&precomp[1][0], x, sizeof(felem));
  memcpy(&precomp[1][1], y, sizeof(felem));
  memcpy(&precomp[1][2], kOne, sizeof(felem));

  /* Fill precomp[2..15]: each even entry is a doubling, each odd entry
   * adds the (affine) base point once more. */
  for (i = 2; i < 16; i += 2) {
    point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

    point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                    precomp[i][0], precomp[i][1], precomp[i][2], x, y);
  }

  /* Accumulator starts at the point at infinity. */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));
  n_is_infinity_mask = -1;

  /* We add in a window of four bits each iteration and do this 64 times. */
  for (i = 0; i < 256; i += 4) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
    }

    /* Take the next four scalar bits, most significant first. */
    index = (p256_get_bit(scalar, 255 - i - 0) << 3) |
            (p256_get_bit(scalar, 255 - i - 1) << 2) |
            (p256_get_bit(scalar, 255 - i - 2) << 1) |
            p256_get_bit(scalar, 255 - i - 3);

    /* See the comments in scalar_base_mult about handling infinities. */
    select_jacobian_point(px, py, pz, precomp[0][0], index);
    point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
    copy_conditional(nx, px, n_is_infinity_mask);
    copy_conditional(ny, py, n_is_infinity_mask);
    copy_conditional(nz, pz, n_is_infinity_mask);

    p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
    mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

    copy_conditional(nx, tx, mask);
    copy_conditional(ny, ty, mask);
    copy_conditional(nz, tz, mask);
    n_is_infinity_mask &= ~p_is_noninfinite_mask;
  }
}
1146
/* kR = 2^257 mod p and kRInv = (2^257)^-1 mod p are the constants used to
 * convert p256_ints to and from the Montgomery domain used by felems. */
#define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p

#define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff} // 1 / 2^257 mod p256.p

static const p256_int kR = { kRDigits };
static const p256_int kRInv = { kRInvDigits };
1153
1154/* to_montgomery sets out = R*in. */
static void to_montgomery(felem out, const p256_int* in) {
  p256_int in_shifted;
  int i;

  p256_init(&in_shifted);
  /* in_shifted = in * R mod p, still in p256_int (32-bit digit) form. */
  p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted);

  /* Peel off limbs of alternating 29/28 bits — the felem radix described
   * at the top of this file. */
  for (i = 0; i < NLIMBS; i++) {
    if ((i & 1) == 0) {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
      p256_shr(&in_shifted, 29, &in_shifted);
    } else {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
      p256_shr(&in_shifted, 28, &in_shifted);
    }
  }

  /* Wipe the intermediate: it is a function of the (possibly secret) input. */
  p256_clear(&in_shifted);
}
1174
1175/* from_montgomery sets out=in/R. */
static void from_montgomery(p256_int* out, const felem in) {
  p256_int result, tmp;
  int i, top;

  p256_init(&result);
  p256_init(&tmp);

  /* Reassemble the nine 29/28-bit limbs into a p256_int, starting from the
   * most significant limb and shifting previous limbs up as we go. */
  p256_add_d(&tmp, in[NLIMBS - 1], &result);
  for (i = NLIMBS - 2; i >= 0; i--) {
    /* Even-indexed limbs are 29 bits wide, odd-indexed ones 28 bits. */
    if ((i & 1) == 0) {
      top = p256_shl(&result, 29, &tmp);
    } else {
      top = p256_shl(&result, 28, &tmp);
    }
    /* top accumulates the bit shifted out past 2^256 (values are < 2^257). */
    top |= p256_add_d(&tmp, in[i], &result);
  }

  /* Multiply by R^-1 mod p to leave the Montgomery domain. */
  p256_modmul(&SECP256r1_p, &kRInv, top, &result, out);

  /* Wipe intermediates derived from the (possibly secret) input. */
  p256_clear(&result);
  p256_clear(&tmp);
}
1198
1199/* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1200 * order of the group. */
1201void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) {
1202 felem x, y, z;
1203
1204 scalar_base_mult(x, y, z, n);
1205
1206 {
1207 felem x_affine, y_affine;
1208
1209 point_to_affine(x_affine, y_affine, x, y, z);
1210 from_montgomery(out_x, x_affine);
1211 from_montgomery(out_y, y_affine);
1212 }
1213}
1214
1215/* p256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1216 * the order of the group. */
1217void p256_point_mul(const p256_int* n, const p256_int* in_x,
1218 const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
1219 felem x, y, z, px, py;
1220
1221 to_montgomery(px, in_x);
1222 to_montgomery(py, in_y);
1223
1224 scalar_mult(x, y, z, px, py, n);
1225
1226 point_to_affine(px, py, x, y, z);
1227 from_montgomery(out_x, px);
1228 from_montgomery(out_y, py);
1229}
1230
1231/* p256_points_mul_vartime sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
1232 * n1 and n2 are < the order of the group.
1233 *
1234 * As indicated by the name, this function operates in variable time. This
1235 * is safe because it's used for signature validation which doesn't deal
1236 * with secrets. */
1237void p256_points_mul_vartime(
1238 const p256_int* n1, const p256_int* n2, const p256_int* in_x,
1239 const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
1240 felem x1, y1, z1, x2, y2, z2, px, py;
1241
1242 /* If both scalars are zero, then the result is the point at infinity. */
1243 if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) {
1244 p256_clear(out_x);
1245 p256_clear(out_y);
1246 return;
1247 }
1248
1249 to_montgomery(px, in_x);
1250 to_montgomery(py, in_y);
1251 scalar_base_mult(x1, y1, z1, n1);
1252 scalar_mult(x2, y2, z2, px, py, n2);
1253
1254 if (p256_is_zero(n2) != 0) {
1255 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1256 * {x1,y1,z1}. */
1257 } else if (p256_is_zero(n1) != 0) {
1258 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1259 * {x2,y2,z2}. */
1260 memcpy(x1, x2, sizeof(x2));
1261 memcpy(y1, y2, sizeof(y2));
1262 memcpy(z1, z2, sizeof(z2));
1263 } else {
1264 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1265 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1266 }
1267
1268 point_to_affine(px, py, x1, y1, z1);
1269 from_montgomery(out_x, px);
1270 from_montgomery(out_y, py);
1271}