xref: /freebsd-11-stable/crypto/openssl/crypto/ec/ecp_nistp521.c (revision a7ce0a90be11992abc2c716cb6bd16c0ea43f6d2)
1 /* crypto/ec/ecp_nistp521.c */
2 /*
3  * Written by Adam Langley (Google) for the OpenSSL project
4  */
5 /* Copyright 2011 Google Inc.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  *
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  *  Unless required by applicable law or agreed to in writing, software
15  *  distributed under the License is distributed on an "AS IS" BASIS,
16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  *  See the License for the specific language governing permissions and
18  *  limitations under the License.
19  */
20 
21 /*
22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23  *
24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26  * work which got its smarts from Daniel J. Bernstein's work on the same.
27  */
28 
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31 
32 # ifndef OPENSSL_SYS_VMS
33 #  include <stdint.h>
34 # else
35 #  include <inttypes.h>
36 # endif
37 
38 # include <string.h>
39 # include <openssl/err.h>
40 # include "ec_lcl.h"
41 # include "bn_int.h" /* bn_bn2lebinpad, bn_lebin2bn */
42 
43 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
44   /* even with gcc, the typedef won't work for 32-bit platforms */
45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                  * platforms */
47 # else
48 #  error "Need GCC 3.1 or later to define type uint128_t"
49 # endif
50 
51 typedef uint8_t u8;
52 typedef uint64_t u64;
53 
54 /*
55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56  * element of this field into 66 bytes where the most significant byte
57  * contains only a single bit. We call this an felem_bytearray.
58  */
59 
60 typedef u8 felem_bytearray[66];
61 
62 /*
63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64  * These values are big-endian.
65  */
66 static const felem_bytearray nistp521_curve_params[5] = {
67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75      0xff, 0xff},
76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84      0xff, 0xfc},
85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93      0x3f, 0x00},
94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102      0xbd, 0x66},
103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111      0x66, 0x50}
112 };
113 
114 /*-
115  * The representation of field elements.
116  * ------------------------------------
117  *
118  * We represent field elements with nine values. These values are either 64 or
119  * 128 bits and the field element represented is:
120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122  * 58 bits apart, but are greater than 58 bits in length, the most significant
123  * bits of each limb overlap with the least significant bits of the next.
124  *
125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126  * 'largefelem' */
127 
128 # define NLIMBS 9
129 
130 typedef uint64_t limb;
131 typedef limb felem[NLIMBS];
132 typedef uint128_t largefelem[NLIMBS];
133 
134 static const limb bottom57bits = 0x1ffffffffffffff;
135 static const limb bottom58bits = 0x3ffffffffffffff;
136 
137 /*
138  * bin66_to_felem takes a little-endian byte array and converts it into felem
139  * form. This assumes that the CPU is little-endian.
140  */
bin66_to_felem(felem out,const u8 in[66])141 static void bin66_to_felem(felem out, const u8 in[66])
142 {
143     out[0] = (*((limb *) & in[0])) & bottom58bits;
144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147     out[4] = (*((limb *) & in[29])) & bottom58bits;
148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151     out[8] = (*((limb *) & in[58])) & bottom57bits;
152 }
153 
154 /*
155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156  * array. This assumes that the CPU is little-endian.
157  */
felem_to_bin66(u8 out[66],const felem in)158 static void felem_to_bin66(u8 out[66], const felem in)
159 {
160     memset(out, 0, 66);
161     (*((limb *) & out[0])) = in[0];
162     (*((limb *) & out[7])) |= in[1] << 2;
163     (*((limb *) & out[14])) |= in[2] << 4;
164     (*((limb *) & out[21])) |= in[3] << 6;
165     (*((limb *) & out[29])) = in[4];
166     (*((limb *) & out[36])) |= in[5] << 2;
167     (*((limb *) & out[43])) |= in[6] << 4;
168     (*((limb *) & out[50])) |= in[7] << 6;
169     (*((limb *) & out[58])) = in[8];
170 }
171 
172 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
BN_to_felem(felem out,const BIGNUM * bn)173 static int BN_to_felem(felem out, const BIGNUM *bn)
174 {
175     felem_bytearray b_out;
176     int num_bytes;
177 
178     if (BN_is_negative(bn)) {
179         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180         return 0;
181     }
182     num_bytes = bn_bn2lebinpad(bn, b_out, sizeof(b_out));
183     if (num_bytes < 0) {
184         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185         return 0;
186     }
187     bin66_to_felem(out, b_out);
188     return 1;
189 }
190 
191 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
felem_to_BN(BIGNUM * out,const felem in)192 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
193 {
194     felem_bytearray b_out;
195     felem_to_bin66(b_out, in);
196     return bn_lebin2bn(b_out, sizeof(b_out), out);
197 }
198 
199 /*-
200  * Field operations
201  * ----------------
202  */
203 
felem_one(felem out)204 static void felem_one(felem out)
205 {
206     out[0] = 1;
207     out[1] = 0;
208     out[2] = 0;
209     out[3] = 0;
210     out[4] = 0;
211     out[5] = 0;
212     out[6] = 0;
213     out[7] = 0;
214     out[8] = 0;
215 }
216 
felem_assign(felem out,const felem in)217 static void felem_assign(felem out, const felem in)
218 {
219     out[0] = in[0];
220     out[1] = in[1];
221     out[2] = in[2];
222     out[3] = in[3];
223     out[4] = in[4];
224     out[5] = in[5];
225     out[6] = in[6];
226     out[7] = in[7];
227     out[8] = in[8];
228 }
229 
230 /* felem_sum64 sets out = out + in. */
felem_sum64(felem out,const felem in)231 static void felem_sum64(felem out, const felem in)
232 {
233     out[0] += in[0];
234     out[1] += in[1];
235     out[2] += in[2];
236     out[3] += in[3];
237     out[4] += in[4];
238     out[5] += in[5];
239     out[6] += in[6];
240     out[7] += in[7];
241     out[8] += in[8];
242 }
243 
244 /* felem_scalar sets out = in * scalar */
felem_scalar(felem out,const felem in,limb scalar)245 static void felem_scalar(felem out, const felem in, limb scalar)
246 {
247     out[0] = in[0] * scalar;
248     out[1] = in[1] * scalar;
249     out[2] = in[2] * scalar;
250     out[3] = in[3] * scalar;
251     out[4] = in[4] * scalar;
252     out[5] = in[5] * scalar;
253     out[6] = in[6] * scalar;
254     out[7] = in[7] * scalar;
255     out[8] = in[8] * scalar;
256 }
257 
258 /* felem_scalar64 sets out = out * scalar */
felem_scalar64(felem out,limb scalar)259 static void felem_scalar64(felem out, limb scalar)
260 {
261     out[0] *= scalar;
262     out[1] *= scalar;
263     out[2] *= scalar;
264     out[3] *= scalar;
265     out[4] *= scalar;
266     out[5] *= scalar;
267     out[6] *= scalar;
268     out[7] *= scalar;
269     out[8] *= scalar;
270 }
271 
272 /* felem_scalar128 sets out = out * scalar */
felem_scalar128(largefelem out,limb scalar)273 static void felem_scalar128(largefelem out, limb scalar)
274 {
275     out[0] *= scalar;
276     out[1] *= scalar;
277     out[2] *= scalar;
278     out[3] *= scalar;
279     out[4] *= scalar;
280     out[5] *= scalar;
281     out[6] *= scalar;
282     out[7] *= scalar;
283     out[8] *= scalar;
284 }
285 
286 /*-
287  * felem_neg sets |out| to |-in|
288  * On entry:
289  *   in[i] < 2^59 + 2^14
290  * On exit:
291  *   out[i] < 2^62
292  */
felem_neg(felem out,const felem in)293 static void felem_neg(felem out, const felem in)
294 {
295     /* In order to prevent underflow, we subtract from 0 mod p. */
296     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298 
299     out[0] = two62m3 - in[0];
300     out[1] = two62m2 - in[1];
301     out[2] = two62m2 - in[2];
302     out[3] = two62m2 - in[3];
303     out[4] = two62m2 - in[4];
304     out[5] = two62m2 - in[5];
305     out[6] = two62m2 - in[6];
306     out[7] = two62m2 - in[7];
307     out[8] = two62m2 - in[8];
308 }
309 
310 /*-
311  * felem_diff64 subtracts |in| from |out|
312  * On entry:
313  *   in[i] < 2^59 + 2^14
314  * On exit:
315  *   out[i] < out[i] + 2^62
316  */
felem_diff64(felem out,const felem in)317 static void felem_diff64(felem out, const felem in)
318 {
319     /*
320      * In order to prevent underflow, we add 0 mod p before subtracting.
321      */
322     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324 
325     out[0] += two62m3 - in[0];
326     out[1] += two62m2 - in[1];
327     out[2] += two62m2 - in[2];
328     out[3] += two62m2 - in[3];
329     out[4] += two62m2 - in[4];
330     out[5] += two62m2 - in[5];
331     out[6] += two62m2 - in[6];
332     out[7] += two62m2 - in[7];
333     out[8] += two62m2 - in[8];
334 }
335 
336 /*-
337  * felem_diff_128_64 subtracts |in| from |out|
338  * On entry:
339  *   in[i] < 2^62 + 2^17
340  * On exit:
341  *   out[i] < out[i] + 2^63
342  */
felem_diff_128_64(largefelem out,const felem in)343 static void felem_diff_128_64(largefelem out, const felem in)
344 {
345     /*
346      * In order to prevent underflow, we add 64p mod p (which is equivalent
347      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348      * digit number with all bits set to 1. See "The representation of field
349      * elements" comment above for a description of how limbs are used to
350      * represent a number. 64p is represented with 8 limbs containing a number
351      * with 58 bits set and one limb with a number with 57 bits set.
352      */
353     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355 
356     out[0] += two63m6 - in[0];
357     out[1] += two63m5 - in[1];
358     out[2] += two63m5 - in[2];
359     out[3] += two63m5 - in[3];
360     out[4] += two63m5 - in[4];
361     out[5] += two63m5 - in[5];
362     out[6] += two63m5 - in[6];
363     out[7] += two63m5 - in[7];
364     out[8] += two63m5 - in[8];
365 }
366 
367 /*-
368  * felem_diff_128_64 subtracts |in| from |out|
369  * On entry:
370  *   in[i] < 2^126
371  * On exit:
372  *   out[i] < out[i] + 2^127 - 2^69
373  */
felem_diff128(largefelem out,const largefelem in)374 static void felem_diff128(largefelem out, const largefelem in)
375 {
376     /*
377      * In order to prevent underflow, we add 0 mod p before subtracting.
378      */
379     static const uint128_t two127m70 =
380         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381     static const uint128_t two127m69 =
382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383 
384     out[0] += (two127m70 - in[0]);
385     out[1] += (two127m69 - in[1]);
386     out[2] += (two127m69 - in[2]);
387     out[3] += (two127m69 - in[3]);
388     out[4] += (two127m69 - in[4]);
389     out[5] += (two127m69 - in[5]);
390     out[6] += (two127m69 - in[6]);
391     out[7] += (two127m69 - in[7]);
392     out[8] += (two127m69 - in[8]);
393 }
394 
395 /*-
396  * felem_square sets |out| = |in|^2
397  * On entry:
398  *   in[i] < 2^62
399  * On exit:
400  *   out[i] < 17 * max(in[i]) * max(in[i])
401  */
felem_square(largefelem out,const felem in)402 static void felem_square(largefelem out, const felem in)
403 {
404     felem inx2, inx4;
405     felem_scalar(inx2, in, 2);
406     felem_scalar(inx4, in, 4);
407 
408     /*-
409      * We have many cases were we want to do
410      *   in[x] * in[y] +
411      *   in[y] * in[x]
412      * This is obviously just
413      *   2 * in[x] * in[y]
414      * However, rather than do the doubling on the 128 bit result, we
415      * double one of the inputs to the multiplication by reading from
416      * |inx2|
417      */
418 
419     out[0] = ((uint128_t) in[0]) * in[0];
420     out[1] = ((uint128_t) in[0]) * inx2[1];
421     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
422     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
423     out[4] = ((uint128_t) in[0]) * inx2[4] +
424         ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
425     out[5] = ((uint128_t) in[0]) * inx2[5] +
426         ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
427     out[6] = ((uint128_t) in[0]) * inx2[6] +
428         ((uint128_t) in[1]) * inx2[5] +
429         ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
430     out[7] = ((uint128_t) in[0]) * inx2[7] +
431         ((uint128_t) in[1]) * inx2[6] +
432         ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
433     out[8] = ((uint128_t) in[0]) * inx2[8] +
434         ((uint128_t) in[1]) * inx2[7] +
435         ((uint128_t) in[2]) * inx2[6] +
436         ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
437 
438     /*
439      * The remaining limbs fall above 2^521, with the first falling at 2^522.
440      * They correspond to locations one bit up from the limbs produced above
441      * so we would have to multiply by two to align them. Again, rather than
442      * operate on the 128-bit result, we double one of the inputs to the
443      * multiplication. If we want to double for both this reason, and the
444      * reason above, then we end up multiplying by four.
445      */
446 
447     /* 9 */
448     out[0] += ((uint128_t) in[1]) * inx4[8] +
449         ((uint128_t) in[2]) * inx4[7] +
450         ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
451 
452     /* 10 */
453     out[1] += ((uint128_t) in[2]) * inx4[8] +
454         ((uint128_t) in[3]) * inx4[7] +
455         ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
456 
457     /* 11 */
458     out[2] += ((uint128_t) in[3]) * inx4[8] +
459         ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
460 
461     /* 12 */
462     out[3] += ((uint128_t) in[4]) * inx4[8] +
463         ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
464 
465     /* 13 */
466     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
467 
468     /* 14 */
469     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
470 
471     /* 15 */
472     out[6] += ((uint128_t) in[7]) * inx4[8];
473 
474     /* 16 */
475     out[7] += ((uint128_t) in[8]) * inx2[8];
476 }
477 
478 /*-
479  * felem_mul sets |out| = |in1| * |in2|
480  * On entry:
481  *   in1[i] < 2^64
482  *   in2[i] < 2^63
483  * On exit:
484  *   out[i] < 17 * max(in1[i]) * max(in2[i])
485  */
felem_mul(largefelem out,const felem in1,const felem in2)486 static void felem_mul(largefelem out, const felem in1, const felem in2)
487 {
488     felem in2x2;
489     felem_scalar(in2x2, in2, 2);
490 
491     out[0] = ((uint128_t) in1[0]) * in2[0];
492 
493     out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
494 
495     out[2] = ((uint128_t) in1[0]) * in2[2] +
496         ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
497 
498     out[3] = ((uint128_t) in1[0]) * in2[3] +
499         ((uint128_t) in1[1]) * in2[2] +
500         ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
501 
502     out[4] = ((uint128_t) in1[0]) * in2[4] +
503         ((uint128_t) in1[1]) * in2[3] +
504         ((uint128_t) in1[2]) * in2[2] +
505         ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
506 
507     out[5] = ((uint128_t) in1[0]) * in2[5] +
508         ((uint128_t) in1[1]) * in2[4] +
509         ((uint128_t) in1[2]) * in2[3] +
510         ((uint128_t) in1[3]) * in2[2] +
511         ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
512 
513     out[6] = ((uint128_t) in1[0]) * in2[6] +
514         ((uint128_t) in1[1]) * in2[5] +
515         ((uint128_t) in1[2]) * in2[4] +
516         ((uint128_t) in1[3]) * in2[3] +
517         ((uint128_t) in1[4]) * in2[2] +
518         ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
519 
520     out[7] = ((uint128_t) in1[0]) * in2[7] +
521         ((uint128_t) in1[1]) * in2[6] +
522         ((uint128_t) in1[2]) * in2[5] +
523         ((uint128_t) in1[3]) * in2[4] +
524         ((uint128_t) in1[4]) * in2[3] +
525         ((uint128_t) in1[5]) * in2[2] +
526         ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
527 
528     out[8] = ((uint128_t) in1[0]) * in2[8] +
529         ((uint128_t) in1[1]) * in2[7] +
530         ((uint128_t) in1[2]) * in2[6] +
531         ((uint128_t) in1[3]) * in2[5] +
532         ((uint128_t) in1[4]) * in2[4] +
533         ((uint128_t) in1[5]) * in2[3] +
534         ((uint128_t) in1[6]) * in2[2] +
535         ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
536 
537     /* See comment in felem_square about the use of in2x2 here */
538 
539     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
540         ((uint128_t) in1[2]) * in2x2[7] +
541         ((uint128_t) in1[3]) * in2x2[6] +
542         ((uint128_t) in1[4]) * in2x2[5] +
543         ((uint128_t) in1[5]) * in2x2[4] +
544         ((uint128_t) in1[6]) * in2x2[3] +
545         ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
546 
547     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
548         ((uint128_t) in1[3]) * in2x2[7] +
549         ((uint128_t) in1[4]) * in2x2[6] +
550         ((uint128_t) in1[5]) * in2x2[5] +
551         ((uint128_t) in1[6]) * in2x2[4] +
552         ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
553 
554     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
555         ((uint128_t) in1[4]) * in2x2[7] +
556         ((uint128_t) in1[5]) * in2x2[6] +
557         ((uint128_t) in1[6]) * in2x2[5] +
558         ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
559 
560     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
561         ((uint128_t) in1[5]) * in2x2[7] +
562         ((uint128_t) in1[6]) * in2x2[6] +
563         ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
564 
565     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
566         ((uint128_t) in1[6]) * in2x2[7] +
567         ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
568 
569     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
570         ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
571 
572     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
573         ((uint128_t) in1[8]) * in2x2[7];
574 
575     out[7] += ((uint128_t) in1[8]) * in2x2[8];
576 }
577 
578 static const limb bottom52bits = 0xfffffffffffff;
579 
580 /*-
581  * felem_reduce converts a largefelem to an felem.
582  * On entry:
583  *   in[i] < 2^128
584  * On exit:
585  *   out[i] < 2^59 + 2^14
586  */
felem_reduce(felem out,const largefelem in)587 static void felem_reduce(felem out, const largefelem in)
588 {
589     u64 overflow1, overflow2;
590 
591     out[0] = ((limb) in[0]) & bottom58bits;
592     out[1] = ((limb) in[1]) & bottom58bits;
593     out[2] = ((limb) in[2]) & bottom58bits;
594     out[3] = ((limb) in[3]) & bottom58bits;
595     out[4] = ((limb) in[4]) & bottom58bits;
596     out[5] = ((limb) in[5]) & bottom58bits;
597     out[6] = ((limb) in[6]) & bottom58bits;
598     out[7] = ((limb) in[7]) & bottom58bits;
599     out[8] = ((limb) in[8]) & bottom58bits;
600 
601     /* out[i] < 2^58 */
602 
603     out[1] += ((limb) in[0]) >> 58;
604     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
605     /*-
606      * out[1] < 2^58 + 2^6 + 2^58
607      *        = 2^59 + 2^6
608      */
609     out[2] += ((limb) (in[0] >> 64)) >> 52;
610 
611     out[2] += ((limb) in[1]) >> 58;
612     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
613     out[3] += ((limb) (in[1] >> 64)) >> 52;
614 
615     out[3] += ((limb) in[2]) >> 58;
616     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
617     out[4] += ((limb) (in[2] >> 64)) >> 52;
618 
619     out[4] += ((limb) in[3]) >> 58;
620     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
621     out[5] += ((limb) (in[3] >> 64)) >> 52;
622 
623     out[5] += ((limb) in[4]) >> 58;
624     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
625     out[6] += ((limb) (in[4] >> 64)) >> 52;
626 
627     out[6] += ((limb) in[5]) >> 58;
628     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
629     out[7] += ((limb) (in[5] >> 64)) >> 52;
630 
631     out[7] += ((limb) in[6]) >> 58;
632     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
633     out[8] += ((limb) (in[6] >> 64)) >> 52;
634 
635     out[8] += ((limb) in[7]) >> 58;
636     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
637     /*-
638      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
639      *            < 2^59 + 2^13
640      */
641     overflow1 = ((limb) (in[7] >> 64)) >> 52;
642 
643     overflow1 += ((limb) in[8]) >> 58;
644     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
645     overflow2 = ((limb) (in[8] >> 64)) >> 52;
646 
647     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
648     overflow2 <<= 1;            /* overflow2 < 2^13 */
649 
650     out[0] += overflow1;        /* out[0] < 2^60 */
651     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
652 
653     out[1] += out[0] >> 58;
654     out[0] &= bottom58bits;
655     /*-
656      * out[0] < 2^58
657      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
658      *        < 2^59 + 2^14
659      */
660 }
661 
felem_square_reduce(felem out,const felem in)662 static void felem_square_reduce(felem out, const felem in)
663 {
664     largefelem tmp;
665     felem_square(tmp, in);
666     felem_reduce(out, tmp);
667 }
668 
felem_mul_reduce(felem out,const felem in1,const felem in2)669 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
670 {
671     largefelem tmp;
672     felem_mul(tmp, in1, in2);
673     felem_reduce(out, tmp);
674 }
675 
676 /*-
677  * felem_inv calculates |out| = |in|^{-1}
678  *
679  * Based on Fermat's Little Theorem:
680  *   a^p = a (mod p)
681  *   a^{p-1} = 1 (mod p)
682  *   a^{p-2} = a^{-1} (mod p)
683  */
felem_inv(felem out,const felem in)684 static void felem_inv(felem out, const felem in)
685 {
686     felem ftmp, ftmp2, ftmp3, ftmp4;
687     largefelem tmp;
688     unsigned i;
689 
690     felem_square(tmp, in);
691     felem_reduce(ftmp, tmp);    /* 2^1 */
692     felem_mul(tmp, in, ftmp);
693     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
694     felem_assign(ftmp2, ftmp);
695     felem_square(tmp, ftmp);
696     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
697     felem_mul(tmp, in, ftmp);
698     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
699     felem_square(tmp, ftmp);
700     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
701 
702     felem_square(tmp, ftmp2);
703     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
704     felem_square(tmp, ftmp3);
705     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
706     felem_mul(tmp, ftmp3, ftmp2);
707     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
708 
709     felem_assign(ftmp2, ftmp3);
710     felem_square(tmp, ftmp3);
711     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
712     felem_square(tmp, ftmp3);
713     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
714     felem_square(tmp, ftmp3);
715     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
716     felem_square(tmp, ftmp3);
717     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
718     felem_assign(ftmp4, ftmp3);
719     felem_mul(tmp, ftmp3, ftmp);
720     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
721     felem_square(tmp, ftmp4);
722     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
723     felem_mul(tmp, ftmp3, ftmp2);
724     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
725     felem_assign(ftmp2, ftmp3);
726 
727     for (i = 0; i < 8; i++) {
728         felem_square(tmp, ftmp3);
729         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
730     }
731     felem_mul(tmp, ftmp3, ftmp2);
732     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
733     felem_assign(ftmp2, ftmp3);
734 
735     for (i = 0; i < 16; i++) {
736         felem_square(tmp, ftmp3);
737         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
738     }
739     felem_mul(tmp, ftmp3, ftmp2);
740     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
741     felem_assign(ftmp2, ftmp3);
742 
743     for (i = 0; i < 32; i++) {
744         felem_square(tmp, ftmp3);
745         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
746     }
747     felem_mul(tmp, ftmp3, ftmp2);
748     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
749     felem_assign(ftmp2, ftmp3);
750 
751     for (i = 0; i < 64; i++) {
752         felem_square(tmp, ftmp3);
753         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
754     }
755     felem_mul(tmp, ftmp3, ftmp2);
756     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
757     felem_assign(ftmp2, ftmp3);
758 
759     for (i = 0; i < 128; i++) {
760         felem_square(tmp, ftmp3);
761         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
762     }
763     felem_mul(tmp, ftmp3, ftmp2);
764     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
765     felem_assign(ftmp2, ftmp3);
766 
767     for (i = 0; i < 256; i++) {
768         felem_square(tmp, ftmp3);
769         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
770     }
771     felem_mul(tmp, ftmp3, ftmp2);
772     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
773 
774     for (i = 0; i < 9; i++) {
775         felem_square(tmp, ftmp3);
776         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
777     }
778     felem_mul(tmp, ftmp3, ftmp4);
779     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
780     felem_mul(tmp, ftmp3, in);
781     felem_reduce(out, tmp);     /* 2^512 - 3 */
782 }
783 
784 /* This is 2^521-1, expressed as an felem */
785 static const felem kPrime = {
786     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
787     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
788     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
789 };
790 
791 /*-
792  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
793  * otherwise.
794  * On entry:
795  *   in[i] < 2^59 + 2^14
796  */
felem_is_zero(const felem in)797 static limb felem_is_zero(const felem in)
798 {
799     felem ftmp;
800     limb is_zero, is_p;
801     felem_assign(ftmp, in);
802 
803     ftmp[0] += ftmp[8] >> 57;
804     ftmp[8] &= bottom57bits;
805     /* ftmp[8] < 2^57 */
806     ftmp[1] += ftmp[0] >> 58;
807     ftmp[0] &= bottom58bits;
808     ftmp[2] += ftmp[1] >> 58;
809     ftmp[1] &= bottom58bits;
810     ftmp[3] += ftmp[2] >> 58;
811     ftmp[2] &= bottom58bits;
812     ftmp[4] += ftmp[3] >> 58;
813     ftmp[3] &= bottom58bits;
814     ftmp[5] += ftmp[4] >> 58;
815     ftmp[4] &= bottom58bits;
816     ftmp[6] += ftmp[5] >> 58;
817     ftmp[5] &= bottom58bits;
818     ftmp[7] += ftmp[6] >> 58;
819     ftmp[6] &= bottom58bits;
820     ftmp[8] += ftmp[7] >> 58;
821     ftmp[7] &= bottom58bits;
822     /* ftmp[8] < 2^57 + 4 */
823 
824     /*
825      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
826      * than our bound for ftmp[8]. Therefore we only have to check if the
827      * zero is zero or 2^521-1.
828      */
829 
830     is_zero = 0;
831     is_zero |= ftmp[0];
832     is_zero |= ftmp[1];
833     is_zero |= ftmp[2];
834     is_zero |= ftmp[3];
835     is_zero |= ftmp[4];
836     is_zero |= ftmp[5];
837     is_zero |= ftmp[6];
838     is_zero |= ftmp[7];
839     is_zero |= ftmp[8];
840 
841     is_zero--;
842     /*
843      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
844      * can be set is if is_zero was 0 before the decrement.
845      */
846     is_zero = 0 - (is_zero >> 63);
847 
848     is_p = ftmp[0] ^ kPrime[0];
849     is_p |= ftmp[1] ^ kPrime[1];
850     is_p |= ftmp[2] ^ kPrime[2];
851     is_p |= ftmp[3] ^ kPrime[3];
852     is_p |= ftmp[4] ^ kPrime[4];
853     is_p |= ftmp[5] ^ kPrime[5];
854     is_p |= ftmp[6] ^ kPrime[6];
855     is_p |= ftmp[7] ^ kPrime[7];
856     is_p |= ftmp[8] ^ kPrime[8];
857 
858     is_p--;
859     is_p = 0 - (is_p >> 63);
860 
861     is_zero |= is_p;
862     return is_zero;
863 }
864 
felem_is_zero_int(const void * in)865 static int felem_is_zero_int(const void *in)
866 {
867     return (int)(felem_is_zero(in) & ((limb) 1));
868 }
869 
870 /*-
871  * felem_contract converts |in| to its unique, minimal representation.
872  * On entry:
873  *   in[i] < 2^59 + 2^14
874  */
felem_contract(felem out,const felem in)875 static void felem_contract(felem out, const felem in)
876 {
877     limb is_p, is_greater, sign;
878     static const limb two58 = ((limb) 1) << 58;
879 
880     felem_assign(out, in);
881 
882     out[0] += out[8] >> 57;
883     out[8] &= bottom57bits;
884     /* out[8] < 2^57 */
885     out[1] += out[0] >> 58;
886     out[0] &= bottom58bits;
887     out[2] += out[1] >> 58;
888     out[1] &= bottom58bits;
889     out[3] += out[2] >> 58;
890     out[2] &= bottom58bits;
891     out[4] += out[3] >> 58;
892     out[3] &= bottom58bits;
893     out[5] += out[4] >> 58;
894     out[4] &= bottom58bits;
895     out[6] += out[5] >> 58;
896     out[5] &= bottom58bits;
897     out[7] += out[6] >> 58;
898     out[6] &= bottom58bits;
899     out[8] += out[7] >> 58;
900     out[7] &= bottom58bits;
901     /* out[8] < 2^57 + 4 */
902 
903     /*
904      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
905      * out. See the comments in felem_is_zero regarding why we don't test for
906      * other multiples of the prime.
907      */
908 
909     /*
910      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
911      */
912 
913     is_p = out[0] ^ kPrime[0];
914     is_p |= out[1] ^ kPrime[1];
915     is_p |= out[2] ^ kPrime[2];
916     is_p |= out[3] ^ kPrime[3];
917     is_p |= out[4] ^ kPrime[4];
918     is_p |= out[5] ^ kPrime[5];
919     is_p |= out[6] ^ kPrime[6];
920     is_p |= out[7] ^ kPrime[7];
921     is_p |= out[8] ^ kPrime[8];
922 
923     is_p--;
924     is_p &= is_p << 32;
925     is_p &= is_p << 16;
926     is_p &= is_p << 8;
927     is_p &= is_p << 4;
928     is_p &= is_p << 2;
929     is_p &= is_p << 1;
930     is_p = 0 - (is_p >> 63);
931     is_p = ~is_p;
932 
933     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
934 
935     out[0] &= is_p;
936     out[1] &= is_p;
937     out[2] &= is_p;
938     out[3] &= is_p;
939     out[4] &= is_p;
940     out[5] &= is_p;
941     out[6] &= is_p;
942     out[7] &= is_p;
943     out[8] &= is_p;
944 
945     /*
946      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
947      * 57 is greater than zero as (2^521-1) + x >= 2^522
948      */
949     is_greater = out[8] >> 57;
950     is_greater |= is_greater << 32;
951     is_greater |= is_greater << 16;
952     is_greater |= is_greater << 8;
953     is_greater |= is_greater << 4;
954     is_greater |= is_greater << 2;
955     is_greater |= is_greater << 1;
956     is_greater = 0 - (is_greater >> 63);
957 
958     out[0] -= kPrime[0] & is_greater;
959     out[1] -= kPrime[1] & is_greater;
960     out[2] -= kPrime[2] & is_greater;
961     out[3] -= kPrime[3] & is_greater;
962     out[4] -= kPrime[4] & is_greater;
963     out[5] -= kPrime[5] & is_greater;
964     out[6] -= kPrime[6] & is_greater;
965     out[7] -= kPrime[7] & is_greater;
966     out[8] -= kPrime[8] & is_greater;
967 
968     /* Eliminate negative coefficients */
969     sign = -(out[0] >> 63);
970     out[0] += (two58 & sign);
971     out[1] -= (1 & sign);
972     sign = -(out[1] >> 63);
973     out[1] += (two58 & sign);
974     out[2] -= (1 & sign);
975     sign = -(out[2] >> 63);
976     out[2] += (two58 & sign);
977     out[3] -= (1 & sign);
978     sign = -(out[3] >> 63);
979     out[3] += (two58 & sign);
980     out[4] -= (1 & sign);
981     sign = -(out[4] >> 63);
982     out[4] += (two58 & sign);
983     out[5] -= (1 & sign);
984     sign = -(out[0] >> 63);
985     out[5] += (two58 & sign);
986     out[6] -= (1 & sign);
987     sign = -(out[6] >> 63);
988     out[6] += (two58 & sign);
989     out[7] -= (1 & sign);
990     sign = -(out[7] >> 63);
991     out[7] += (two58 & sign);
992     out[8] -= (1 & sign);
993     sign = -(out[5] >> 63);
994     out[5] += (two58 & sign);
995     out[6] -= (1 & sign);
996     sign = -(out[6] >> 63);
997     out[6] += (two58 & sign);
998     out[7] -= (1 & sign);
999     sign = -(out[7] >> 63);
1000     out[7] += (two58 & sign);
1001     out[8] -= (1 & sign);
1002 }
1003 
1004 /*-
1005  * Group operations
1006  * ----------------
1007  *
1008  * Building on top of the field operations we have the operations on the
1009  * elliptic curve group itself. Points on the curve are represented in Jacobian
1010  * coordinates */
1011 
1012 /*-
1013  * point_double calcuates 2*(x_in, y_in, z_in)
1014  *
1015  * The method is taken from:
1016  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1017  *
1018  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1019  * while x_out == y_in is not (maybe this works, but it's not tested). */
1020 static void
point_double(felem x_out,felem y_out,felem z_out,const felem x_in,const felem y_in,const felem z_in)1021 point_double(felem x_out, felem y_out, felem z_out,
1022              const felem x_in, const felem y_in, const felem z_in)
1023 {
1024     largefelem tmp, tmp2;
1025     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1026 
1027     felem_assign(ftmp, x_in);
1028     felem_assign(ftmp2, x_in);
1029 
1030     /* delta = z^2 */
1031     felem_square(tmp, z_in);
1032     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1033 
1034     /* gamma = y^2 */
1035     felem_square(tmp, y_in);
1036     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1037 
1038     /* beta = x*gamma */
1039     felem_mul(tmp, x_in, gamma);
1040     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1041 
1042     /* alpha = 3*(x-delta)*(x+delta) */
1043     felem_diff64(ftmp, delta);
1044     /* ftmp[i] < 2^61 */
1045     felem_sum64(ftmp2, delta);
1046     /* ftmp2[i] < 2^60 + 2^15 */
1047     felem_scalar64(ftmp2, 3);
1048     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1049     felem_mul(tmp, ftmp, ftmp2);
1050     /*-
1051      * tmp[i] < 17(3*2^121 + 3*2^76)
1052      *        = 61*2^121 + 61*2^76
1053      *        < 64*2^121 + 64*2^76
1054      *        = 2^127 + 2^82
1055      *        < 2^128
1056      */
1057     felem_reduce(alpha, tmp);
1058 
1059     /* x' = alpha^2 - 8*beta */
1060     felem_square(tmp, alpha);
1061     /*
1062      * tmp[i] < 17*2^120 < 2^125
1063      */
1064     felem_assign(ftmp, beta);
1065     felem_scalar64(ftmp, 8);
1066     /* ftmp[i] < 2^62 + 2^17 */
1067     felem_diff_128_64(tmp, ftmp);
1068     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1069     felem_reduce(x_out, tmp);
1070 
1071     /* z' = (y + z)^2 - gamma - delta */
1072     felem_sum64(delta, gamma);
1073     /* delta[i] < 2^60 + 2^15 */
1074     felem_assign(ftmp, y_in);
1075     felem_sum64(ftmp, z_in);
1076     /* ftmp[i] < 2^60 + 2^15 */
1077     felem_square(tmp, ftmp);
1078     /*
1079      * tmp[i] < 17(2^122) < 2^127
1080      */
1081     felem_diff_128_64(tmp, delta);
1082     /* tmp[i] < 2^127 + 2^63 */
1083     felem_reduce(z_out, tmp);
1084 
1085     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1086     felem_scalar64(beta, 4);
1087     /* beta[i] < 2^61 + 2^16 */
1088     felem_diff64(beta, x_out);
1089     /* beta[i] < 2^61 + 2^60 + 2^16 */
1090     felem_mul(tmp, alpha, beta);
1091     /*-
1092      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1093      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1094      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1095      *        < 2^128
1096      */
1097     felem_square(tmp2, gamma);
1098     /*-
1099      * tmp2[i] < 17*(2^59 + 2^14)^2
1100      *         = 17*(2^118 + 2^74 + 2^28)
1101      */
1102     felem_scalar128(tmp2, 8);
1103     /*-
1104      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1105      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1106      *         < 2^126
1107      */
1108     felem_diff128(tmp, tmp2);
1109     /*-
1110      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1111      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1112      *          2^74 + 2^69 + 2^34 + 2^30
1113      *        < 2^128
1114      */
1115     felem_reduce(y_out, tmp);
1116 }
1117 
1118 /* copy_conditional copies in to out iff mask is all ones. */
copy_conditional(felem out,const felem in,limb mask)1119 static void copy_conditional(felem out, const felem in, limb mask)
1120 {
1121     unsigned i;
1122     for (i = 0; i < NLIMBS; ++i) {
1123         const limb tmp = mask & (in[i] ^ out[i]);
1124         out[i] ^= tmp;
1125     }
1126 }
1127 
1128 /*-
1129  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1130  *
1131  * The method is taken from
1132  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1133  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1134  *
1135  * This function includes a branch for checking whether the two input points
1136  * are equal (while not equal to the point at infinity). This case never
1137  * happens during single point multiplication, so there is no timing leak for
1138  * ECDH or ECDSA signing. */
point_add(felem x3,felem y3,felem z3,const felem x1,const felem y1,const felem z1,const int mixed,const felem x2,const felem y2,const felem z2)1139 static void point_add(felem x3, felem y3, felem z3,
1140                       const felem x1, const felem y1, const felem z1,
1141                       const int mixed, const felem x2, const felem y2,
1142                       const felem z2)
1143 {
1144     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1145     largefelem tmp, tmp2;
1146     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1147 
1148     z1_is_zero = felem_is_zero(z1);
1149     z2_is_zero = felem_is_zero(z2);
1150 
1151     /* ftmp = z1z1 = z1**2 */
1152     felem_square(tmp, z1);
1153     felem_reduce(ftmp, tmp);
1154 
1155     if (!mixed) {
1156         /* ftmp2 = z2z2 = z2**2 */
1157         felem_square(tmp, z2);
1158         felem_reduce(ftmp2, tmp);
1159 
1160         /* u1 = ftmp3 = x1*z2z2 */
1161         felem_mul(tmp, x1, ftmp2);
1162         felem_reduce(ftmp3, tmp);
1163 
1164         /* ftmp5 = z1 + z2 */
1165         felem_assign(ftmp5, z1);
1166         felem_sum64(ftmp5, z2);
1167         /* ftmp5[i] < 2^61 */
1168 
1169         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1170         felem_square(tmp, ftmp5);
1171         /* tmp[i] < 17*2^122 */
1172         felem_diff_128_64(tmp, ftmp);
1173         /* tmp[i] < 17*2^122 + 2^63 */
1174         felem_diff_128_64(tmp, ftmp2);
1175         /* tmp[i] < 17*2^122 + 2^64 */
1176         felem_reduce(ftmp5, tmp);
1177 
1178         /* ftmp2 = z2 * z2z2 */
1179         felem_mul(tmp, ftmp2, z2);
1180         felem_reduce(ftmp2, tmp);
1181 
1182         /* s1 = ftmp6 = y1 * z2**3 */
1183         felem_mul(tmp, y1, ftmp2);
1184         felem_reduce(ftmp6, tmp);
1185     } else {
1186         /*
1187          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1188          */
1189 
1190         /* u1 = ftmp3 = x1*z2z2 */
1191         felem_assign(ftmp3, x1);
1192 
1193         /* ftmp5 = 2*z1z2 */
1194         felem_scalar(ftmp5, z1, 2);
1195 
1196         /* s1 = ftmp6 = y1 * z2**3 */
1197         felem_assign(ftmp6, y1);
1198     }
1199 
1200     /* u2 = x2*z1z1 */
1201     felem_mul(tmp, x2, ftmp);
1202     /* tmp[i] < 17*2^120 */
1203 
1204     /* h = ftmp4 = u2 - u1 */
1205     felem_diff_128_64(tmp, ftmp3);
1206     /* tmp[i] < 17*2^120 + 2^63 */
1207     felem_reduce(ftmp4, tmp);
1208 
1209     x_equal = felem_is_zero(ftmp4);
1210 
1211     /* z_out = ftmp5 * h */
1212     felem_mul(tmp, ftmp5, ftmp4);
1213     felem_reduce(z_out, tmp);
1214 
1215     /* ftmp = z1 * z1z1 */
1216     felem_mul(tmp, ftmp, z1);
1217     felem_reduce(ftmp, tmp);
1218 
1219     /* s2 = tmp = y2 * z1**3 */
1220     felem_mul(tmp, y2, ftmp);
1221     /* tmp[i] < 17*2^120 */
1222 
1223     /* r = ftmp5 = (s2 - s1)*2 */
1224     felem_diff_128_64(tmp, ftmp6);
1225     /* tmp[i] < 17*2^120 + 2^63 */
1226     felem_reduce(ftmp5, tmp);
1227     y_equal = felem_is_zero(ftmp5);
1228     felem_scalar64(ftmp5, 2);
1229     /* ftmp5[i] < 2^61 */
1230 
1231     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1232         point_double(x3, y3, z3, x1, y1, z1);
1233         return;
1234     }
1235 
1236     /* I = ftmp = (2h)**2 */
1237     felem_assign(ftmp, ftmp4);
1238     felem_scalar64(ftmp, 2);
1239     /* ftmp[i] < 2^61 */
1240     felem_square(tmp, ftmp);
1241     /* tmp[i] < 17*2^122 */
1242     felem_reduce(ftmp, tmp);
1243 
1244     /* J = ftmp2 = h * I */
1245     felem_mul(tmp, ftmp4, ftmp);
1246     felem_reduce(ftmp2, tmp);
1247 
1248     /* V = ftmp4 = U1 * I */
1249     felem_mul(tmp, ftmp3, ftmp);
1250     felem_reduce(ftmp4, tmp);
1251 
1252     /* x_out = r**2 - J - 2V */
1253     felem_square(tmp, ftmp5);
1254     /* tmp[i] < 17*2^122 */
1255     felem_diff_128_64(tmp, ftmp2);
1256     /* tmp[i] < 17*2^122 + 2^63 */
1257     felem_assign(ftmp3, ftmp4);
1258     felem_scalar64(ftmp4, 2);
1259     /* ftmp4[i] < 2^61 */
1260     felem_diff_128_64(tmp, ftmp4);
1261     /* tmp[i] < 17*2^122 + 2^64 */
1262     felem_reduce(x_out, tmp);
1263 
1264     /* y_out = r(V-x_out) - 2 * s1 * J */
1265     felem_diff64(ftmp3, x_out);
1266     /*
1267      * ftmp3[i] < 2^60 + 2^60 = 2^61
1268      */
1269     felem_mul(tmp, ftmp5, ftmp3);
1270     /* tmp[i] < 17*2^122 */
1271     felem_mul(tmp2, ftmp6, ftmp2);
1272     /* tmp2[i] < 17*2^120 */
1273     felem_scalar128(tmp2, 2);
1274     /* tmp2[i] < 17*2^121 */
1275     felem_diff128(tmp, tmp2);
1276         /*-
1277          * tmp[i] < 2^127 - 2^69 + 17*2^122
1278          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1279          *        < 2^127
1280          */
1281     felem_reduce(y_out, tmp);
1282 
1283     copy_conditional(x_out, x2, z1_is_zero);
1284     copy_conditional(x_out, x1, z2_is_zero);
1285     copy_conditional(y_out, y2, z1_is_zero);
1286     copy_conditional(y_out, y1, z2_is_zero);
1287     copy_conditional(z_out, z2, z1_is_zero);
1288     copy_conditional(z_out, z1, z2_is_zero);
1289     felem_assign(x3, x_out);
1290     felem_assign(y3, y_out);
1291     felem_assign(z3, z_out);
1292 }
1293 
1294 /*-
1295  * Base point pre computation
1296  * --------------------------
1297  *
1298  * Two different sorts of precomputed tables are used in the following code.
1299  * Each contain various points on the curve, where each point is three field
1300  * elements (x, y, z).
1301  *
1302  * For the base point table, z is usually 1 (0 for the point at infinity).
1303  * This table has 16 elements:
1304  * index | bits    | point
1305  * ------+---------+------------------------------
1306  *     0 | 0 0 0 0 | 0G
1307  *     1 | 0 0 0 1 | 1G
1308  *     2 | 0 0 1 0 | 2^130G
1309  *     3 | 0 0 1 1 | (2^130 + 1)G
1310  *     4 | 0 1 0 0 | 2^260G
1311  *     5 | 0 1 0 1 | (2^260 + 1)G
1312  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1313  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1314  *     8 | 1 0 0 0 | 2^390G
1315  *     9 | 1 0 0 1 | (2^390 + 1)G
1316  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1317  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1318  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1319  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1320  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1321  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1322  *
1323  * The reason for this is so that we can clock bits into four different
1324  * locations when doing simple scalar multiplies against the base point.
1325  *
1326  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1327 
1328 /* gmul is the table of precomputed base points */
1329 static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1330                                     {0, 0, 0, 0, 0, 0, 0, 0, 0},
1331                                     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1332 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1333   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1334   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1335  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1336   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1337   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1338  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1339 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1340   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1341   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1342  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1343   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1344   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1345  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1346 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1347   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1348   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1349  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1350   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1351   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1352  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1353 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1354   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1355   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1356  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1357   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1358   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1359  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1360 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1361   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1362   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1363  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1364   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1365   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1366  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1367 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1368   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1369   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1370  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1371   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1372   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1373  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1374 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1375   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1376   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1377  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1378   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1379   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1380  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1381 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1382   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1383   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1384  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1385   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1386   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1387  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1388 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1389   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1390   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1391  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1392   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1393   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1394  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1395 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1396   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1397   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1398  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1399   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1400   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1401  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1402 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1403   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1404   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1405  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1406   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1407   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1408  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1409 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1410   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1411   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1412  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1413   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1414   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1415  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1416 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1417   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1418   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1419  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1420   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1421   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1422  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1423 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1424   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1425   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1426  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1427   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1428   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1429  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1430 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1431   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1432   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1433  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1434   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1435   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1436  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1437 };
1438 
1439 /*
1440  * select_point selects the |idx|th point from a precomputation table and
1441  * copies it to out.
1442  */
1443  /* pre_comp below is of the size provided in |size| */
select_point(const limb idx,unsigned int size,const felem pre_comp[][3],felem out[3])1444 static void select_point(const limb idx, unsigned int size,
1445                          const felem pre_comp[][3], felem out[3])
1446 {
1447     unsigned i, j;
1448     limb *outlimbs = &out[0][0];
1449     memset(outlimbs, 0, 3 * sizeof(felem));
1450 
1451     for (i = 0; i < size; i++) {
1452         const limb *inlimbs = &pre_comp[i][0][0];
1453         limb mask = i ^ idx;
1454         mask |= mask >> 4;
1455         mask |= mask >> 2;
1456         mask |= mask >> 1;
1457         mask &= 1;
1458         mask--;
1459         for (j = 0; j < NLIMBS * 3; j++)
1460             outlimbs[j] |= inlimbs[j] & mask;
1461     }
1462 }
1463 
1464 /* get_bit returns the |i|th bit in |in| */
get_bit(const felem_bytearray in,int i)1465 static char get_bit(const felem_bytearray in, int i)
1466 {
1467     if (i < 0)
1468         return 0;
1469     return (in[i >> 3] >> (i & 7)) & 1;
1470 }
1471 
1472 /*
1473  * Interleaved point multiplication using precomputed point multiples: The
1474  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1475  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1476  * generator, using certain (large) precomputed multiples in g_pre_comp.
1477  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1478  */
batch_mul(felem x_out,felem y_out,felem z_out,const felem_bytearray scalars[],const unsigned num_points,const u8 * g_scalar,const int mixed,const felem pre_comp[][17][3],const felem g_pre_comp[16][3])1479 static void batch_mul(felem x_out, felem y_out, felem z_out,
1480                       const felem_bytearray scalars[],
1481                       const unsigned num_points, const u8 *g_scalar,
1482                       const int mixed, const felem pre_comp[][17][3],
1483                       const felem g_pre_comp[16][3])
1484 {
1485     int i, skip;
1486     unsigned num, gen_mul = (g_scalar != NULL);
1487     felem nq[3], tmp[4];
1488     limb bits;
1489     u8 sign, digit;
1490 
1491     /* set nq to the point at infinity */
1492     memset(nq, 0, 3 * sizeof(felem));
1493 
1494     /*
1495      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1496      * of the generator (last quarter of rounds) and additions of other
1497      * points multiples (every 5th round).
1498      */
1499     skip = 1;                   /* save two point operations in the first
1500                                  * round */
1501     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1502         /* double */
1503         if (!skip)
1504             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1505 
1506         /* add multiples of the generator */
1507         if (gen_mul && (i <= 130)) {
1508             bits = get_bit(g_scalar, i + 390) << 3;
1509             if (i < 130) {
1510                 bits |= get_bit(g_scalar, i + 260) << 2;
1511                 bits |= get_bit(g_scalar, i + 130) << 1;
1512                 bits |= get_bit(g_scalar, i);
1513             }
1514             /* select the point to add, in constant time */
1515             select_point(bits, 16, g_pre_comp, tmp);
1516             if (!skip) {
1517                 /* The 1 argument below is for "mixed" */
1518                 point_add(nq[0], nq[1], nq[2],
1519                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1520             } else {
1521                 memcpy(nq, tmp, 3 * sizeof(felem));
1522                 skip = 0;
1523             }
1524         }
1525 
1526         /* do other additions every 5 doublings */
1527         if (num_points && (i % 5 == 0)) {
1528             /* loop over all scalars */
1529             for (num = 0; num < num_points; ++num) {
1530                 bits = get_bit(scalars[num], i + 4) << 5;
1531                 bits |= get_bit(scalars[num], i + 3) << 4;
1532                 bits |= get_bit(scalars[num], i + 2) << 3;
1533                 bits |= get_bit(scalars[num], i + 1) << 2;
1534                 bits |= get_bit(scalars[num], i) << 1;
1535                 bits |= get_bit(scalars[num], i - 1);
1536                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1537 
1538                 /*
1539                  * select the point to add or subtract, in constant time
1540                  */
1541                 select_point(digit, 17, pre_comp[num], tmp);
1542                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1543                                             * point */
1544                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1545 
1546                 if (!skip) {
1547                     point_add(nq[0], nq[1], nq[2],
1548                               nq[0], nq[1], nq[2],
1549                               mixed, tmp[0], tmp[1], tmp[2]);
1550                 } else {
1551                     memcpy(nq, tmp, 3 * sizeof(felem));
1552                     skip = 0;
1553                 }
1554             }
1555         }
1556     }
1557     felem_assign(x_out, nq[0]);
1558     felem_assign(y_out, nq[1]);
1559     felem_assign(z_out, nq[2]);
1560 }
1561 
1562 /* Precomputation for the group generator. */
1563 typedef struct {
1564     felem g_pre_comp[16][3];
1565     int references;
1566 } NISTP521_PRE_COMP;
1567 
EC_GFp_nistp521_method(void)1568 const EC_METHOD *EC_GFp_nistp521_method(void)
1569 {
1570     static const EC_METHOD ret = {
1571         EC_FLAGS_DEFAULT_OCT,
1572         NID_X9_62_prime_field,
1573         ec_GFp_nistp521_group_init,
1574         ec_GFp_simple_group_finish,
1575         ec_GFp_simple_group_clear_finish,
1576         ec_GFp_nist_group_copy,
1577         ec_GFp_nistp521_group_set_curve,
1578         ec_GFp_simple_group_get_curve,
1579         ec_GFp_simple_group_get_degree,
1580         ec_GFp_simple_group_check_discriminant,
1581         ec_GFp_simple_point_init,
1582         ec_GFp_simple_point_finish,
1583         ec_GFp_simple_point_clear_finish,
1584         ec_GFp_simple_point_copy,
1585         ec_GFp_simple_point_set_to_infinity,
1586         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1587         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1588         ec_GFp_simple_point_set_affine_coordinates,
1589         ec_GFp_nistp521_point_get_affine_coordinates,
1590         0 /* point_set_compressed_coordinates */ ,
1591         0 /* point2oct */ ,
1592         0 /* oct2point */ ,
1593         ec_GFp_simple_add,
1594         ec_GFp_simple_dbl,
1595         ec_GFp_simple_invert,
1596         ec_GFp_simple_is_at_infinity,
1597         ec_GFp_simple_is_on_curve,
1598         ec_GFp_simple_cmp,
1599         ec_GFp_simple_make_affine,
1600         ec_GFp_simple_points_make_affine,
1601         ec_GFp_nistp521_points_mul,
1602         ec_GFp_nistp521_precompute_mult,
1603         ec_GFp_nistp521_have_precompute_mult,
1604         ec_GFp_nist_field_mul,
1605         ec_GFp_nist_field_sqr,
1606         0 /* field_div */ ,
1607         0 /* field_encode */ ,
1608         0 /* field_decode */ ,
1609         0                       /* field_set_to_one */
1610     };
1611 
1612     return &ret;
1613 }
1614 
1615 /******************************************************************************/
1616 /*
1617  * FUNCTIONS TO MANAGE PRECOMPUTATION
1618  */
1619 
nistp521_pre_comp_new()1620 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1621 {
1622     NISTP521_PRE_COMP *ret = NULL;
1623     ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1624     if (!ret) {
1625         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1626         return ret;
1627     }
1628     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1629     ret->references = 1;
1630     return ret;
1631 }
1632 
nistp521_pre_comp_dup(void * src_)1633 static void *nistp521_pre_comp_dup(void *src_)
1634 {
1635     NISTP521_PRE_COMP *src = src_;
1636 
1637     /* no need to actually copy, these objects never change! */
1638     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1639 
1640     return src_;
1641 }
1642 
nistp521_pre_comp_free(void * pre_)1643 static void nistp521_pre_comp_free(void *pre_)
1644 {
1645     int i;
1646     NISTP521_PRE_COMP *pre = pre_;
1647 
1648     if (!pre)
1649         return;
1650 
1651     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1652     if (i > 0)
1653         return;
1654 
1655     OPENSSL_free(pre);
1656 }
1657 
nistp521_pre_comp_clear_free(void * pre_)1658 static void nistp521_pre_comp_clear_free(void *pre_)
1659 {
1660     int i;
1661     NISTP521_PRE_COMP *pre = pre_;
1662 
1663     if (!pre)
1664         return;
1665 
1666     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1667     if (i > 0)
1668         return;
1669 
1670     OPENSSL_cleanse(pre, sizeof(*pre));
1671     OPENSSL_free(pre);
1672 }
1673 
1674 /******************************************************************************/
1675 /*
1676  * OPENSSL EC_METHOD FUNCTIONS
1677  */
1678 
ec_GFp_nistp521_group_init(EC_GROUP * group)1679 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1680 {
1681     int ret;
1682     ret = ec_GFp_simple_group_init(group);
1683     group->a_is_minus3 = 1;
1684     return ret;
1685 }
1686 
ec_GFp_nistp521_group_set_curve(EC_GROUP * group,const BIGNUM * p,const BIGNUM * a,const BIGNUM * b,BN_CTX * ctx)1687 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1688                                     const BIGNUM *a, const BIGNUM *b,
1689                                     BN_CTX *ctx)
1690 {
1691     int ret = 0;
1692     BN_CTX *new_ctx = NULL;
1693     BIGNUM *curve_p, *curve_a, *curve_b;
1694 
1695     if (ctx == NULL)
1696         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1697             return 0;
1698     BN_CTX_start(ctx);
1699     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1700         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1701         ((curve_b = BN_CTX_get(ctx)) == NULL))
1702         goto err;
1703     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1704     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1705     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1706     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1707         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1708               EC_R_WRONG_CURVE_PARAMETERS);
1709         goto err;
1710     }
1711     group->field_mod_func = BN_nist_mod_521;
1712     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1713  err:
1714     BN_CTX_end(ctx);
1715     if (new_ctx != NULL)
1716         BN_CTX_free(new_ctx);
1717     return ret;
1718 }
1719 
1720 /*
1721  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1722  * (X/Z^2, Y/Z^3)
1723  */
ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP * group,const EC_POINT * point,BIGNUM * x,BIGNUM * y,BN_CTX * ctx)1724 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1725                                                  const EC_POINT *point,
1726                                                  BIGNUM *x, BIGNUM *y,
1727                                                  BN_CTX *ctx)
1728 {
1729     felem z1, z2, x_in, y_in, x_out, y_out;
1730     largefelem tmp;
1731 
1732     if (EC_POINT_is_at_infinity(group, point)) {
1733         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1734               EC_R_POINT_AT_INFINITY);
1735         return 0;
1736     }
1737     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1738         (!BN_to_felem(z1, &point->Z)))
1739         return 0;
1740     felem_inv(z2, z1);
1741     felem_square(tmp, z2);
1742     felem_reduce(z1, tmp);
1743     felem_mul(tmp, x_in, z1);
1744     felem_reduce(x_in, tmp);
1745     felem_contract(x_out, x_in);
1746     if (x != NULL) {
1747         if (!felem_to_BN(x, x_out)) {
1748             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1749                   ERR_R_BN_LIB);
1750             return 0;
1751         }
1752     }
1753     felem_mul(tmp, z1, z2);
1754     felem_reduce(z1, tmp);
1755     felem_mul(tmp, y_in, z1);
1756     felem_reduce(y_in, tmp);
1757     felem_contract(y_out, y_in);
1758     if (y != NULL) {
1759         if (!felem_to_BN(y, y_out)) {
1760             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1761                   ERR_R_BN_LIB);
1762             return 0;
1763         }
1764     }
1765     return 1;
1766 }
1767 
1768 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
make_points_affine(size_t num,felem points[][3],felem tmp_felems[])1769 static void make_points_affine(size_t num, felem points[][3],
1770                                felem tmp_felems[])
1771 {
1772     /*
1773      * Runs in constant time, unless an input is the point at infinity (which
1774      * normally shouldn't happen).
1775      */
1776     ec_GFp_nistp_points_make_affine_internal(num,
1777                                              points,
1778                                              sizeof(felem),
1779                                              tmp_felems,
1780                                              (void (*)(void *))felem_one,
1781                                              felem_is_zero_int,
1782                                              (void (*)(void *, const void *))
1783                                              felem_assign,
1784                                              (void (*)(void *, const void *))
1785                                              felem_square_reduce, (void (*)
1786                                                                    (void *,
1787                                                                     const void
1788                                                                     *,
1789                                                                     const void
1790                                                                     *))
1791                                              felem_mul_reduce,
1792                                              (void (*)(void *, const void *))
1793                                              felem_inv,
1794                                              (void (*)(void *, const void *))
1795                                              felem_contract);
1796 }
1797 
1798 /*
1799  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1800  * values Result is stored in r (r can equal one of the inputs).
1801  */
ec_GFp_nistp521_points_mul(const EC_GROUP * group,EC_POINT * r,const BIGNUM * scalar,size_t num,const EC_POINT * points[],const BIGNUM * scalars[],BN_CTX * ctx)1802 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1803                                const BIGNUM *scalar, size_t num,
1804                                const EC_POINT *points[],
1805                                const BIGNUM *scalars[], BN_CTX *ctx)
1806 {
1807     int ret = 0;
1808     int j;
1809     int mixed = 0;
1810     BN_CTX *new_ctx = NULL;
1811     BIGNUM *x, *y, *z, *tmp_scalar;
1812     felem_bytearray g_secret;
1813     felem_bytearray *secrets = NULL;
1814     felem(*pre_comp)[17][3] = NULL;
1815     felem *tmp_felems = NULL;
1816     unsigned i;
1817     int num_bytes;
1818     int have_pre_comp = 0;
1819     size_t num_points = num;
1820     felem x_in, y_in, z_in, x_out, y_out, z_out;
1821     NISTP521_PRE_COMP *pre = NULL;
1822     felem(*g_pre_comp)[3] = NULL;
1823     EC_POINT *generator = NULL;
1824     const EC_POINT *p = NULL;
1825     const BIGNUM *p_scalar = NULL;
1826 
1827     if (ctx == NULL)
1828         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1829             return 0;
1830     BN_CTX_start(ctx);
1831     if (((x = BN_CTX_get(ctx)) == NULL) ||
1832         ((y = BN_CTX_get(ctx)) == NULL) ||
1833         ((z = BN_CTX_get(ctx)) == NULL) ||
1834         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1835         goto err;
1836 
1837     if (scalar != NULL) {
1838         pre = EC_EX_DATA_get_data(group->extra_data,
1839                                   nistp521_pre_comp_dup,
1840                                   nistp521_pre_comp_free,
1841                                   nistp521_pre_comp_clear_free);
1842         if (pre)
1843             /* we have precomputation, try to use it */
1844             g_pre_comp = &pre->g_pre_comp[0];
1845         else
1846             /* try to use the standard precomputation */
1847             g_pre_comp = (felem(*)[3]) gmul;
1848         generator = EC_POINT_new(group);
1849         if (generator == NULL)
1850             goto err;
1851         /* get the generator from precomputation */
1852         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1853             !felem_to_BN(y, g_pre_comp[1][1]) ||
1854             !felem_to_BN(z, g_pre_comp[1][2])) {
1855             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1856             goto err;
1857         }
1858         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1859                                                       generator, x, y, z,
1860                                                       ctx))
1861             goto err;
1862         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1863             /* precomputation matches generator */
1864             have_pre_comp = 1;
1865         else
1866             /*
1867              * we don't have valid precomputation: treat the generator as a
1868              * random point
1869              */
1870             num_points++;
1871     }
1872 
1873     if (num_points > 0) {
1874         if (num_points >= 2) {
1875             /*
1876              * unless we precompute multiples for just one point, converting
1877              * those into affine form is time well spent
1878              */
1879             mixed = 1;
1880         }
1881         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1882         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1883         if (mixed)
1884             tmp_felems =
1885                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1886         if ((secrets == NULL) || (pre_comp == NULL)
1887             || (mixed && (tmp_felems == NULL))) {
1888             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1889             goto err;
1890         }
1891 
1892         /*
1893          * we treat NULL scalars as 0, and NULL points as points at infinity,
1894          * i.e., they contribute nothing to the linear combination
1895          */
1896         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1897         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1898         for (i = 0; i < num_points; ++i) {
1899             if (i == num) {
1900                 /*
1901                  * we didn't have a valid precomputation, so we pick the
1902                  * generator
1903                  */
1904                 p = EC_GROUP_get0_generator(group);
1905                 p_scalar = scalar;
1906             } else {
1907                 /* the i^th point */
1908                 p = points[i];
1909                 p_scalar = scalars[i];
1910             }
1911             if ((p_scalar != NULL) && (p != NULL)) {
1912                 /* reduce scalar to 0 <= scalar < 2^521 */
1913                 if ((BN_num_bits(p_scalar) > 521)
1914                     || (BN_is_negative(p_scalar))) {
1915                     /*
1916                      * this is an unusual input, and we don't guarantee
1917                      * constant-timeness
1918                      */
1919                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1920                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1921                         goto err;
1922                     }
1923                     num_bytes = bn_bn2lebinpad(tmp_scalar,
1924                                                secrets[i], sizeof(secrets[i]));
1925                 } else {
1926                     num_bytes = bn_bn2lebinpad(p_scalar,
1927                                                secrets[i], sizeof(secrets[i]));
1928                 }
1929                 if (num_bytes < 0) {
1930                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931                     goto err;
1932                 }
1933                 /* precompute multiples */
1934                 if ((!BN_to_felem(x_out, &p->X)) ||
1935                     (!BN_to_felem(y_out, &p->Y)) ||
1936                     (!BN_to_felem(z_out, &p->Z)))
1937                     goto err;
1938                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1939                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1940                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1941                 for (j = 2; j <= 16; ++j) {
1942                     if (j & 1) {
1943                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1944                                   pre_comp[i][j][2], pre_comp[i][1][0],
1945                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1946                                   pre_comp[i][j - 1][0],
1947                                   pre_comp[i][j - 1][1],
1948                                   pre_comp[i][j - 1][2]);
1949                     } else {
1950                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1951                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1952                                      pre_comp[i][j / 2][1],
1953                                      pre_comp[i][j / 2][2]);
1954                     }
1955                 }
1956             }
1957         }
1958         if (mixed)
1959             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1960     }
1961 
1962     /* the scalar for the generator */
1963     if ((scalar != NULL) && (have_pre_comp)) {
1964         memset(g_secret, 0, sizeof(g_secret));
1965         /* reduce scalar to 0 <= scalar < 2^521 */
1966         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1967             /*
1968              * this is an unusual input, and we don't guarantee
1969              * constant-timeness
1970              */
1971             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1972                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1973                 goto err;
1974             }
1975             num_bytes = bn_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1976         } else {
1977             num_bytes = bn_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1978         }
1979         /* do the multiplication with generator precomputation */
1980         batch_mul(x_out, y_out, z_out,
1981                   (const felem_bytearray(*))secrets, num_points,
1982                   g_secret,
1983                   mixed, (const felem(*)[17][3])pre_comp,
1984                   (const felem(*)[3])g_pre_comp);
1985     } else {
1986         /* do the multiplication without generator precomputation */
1987         batch_mul(x_out, y_out, z_out,
1988                   (const felem_bytearray(*))secrets, num_points,
1989                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1990     }
1991     /* reduce the output to its unique minimal representation */
1992     felem_contract(x_in, x_out);
1993     felem_contract(y_in, y_out);
1994     felem_contract(z_in, z_out);
1995     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1996         (!felem_to_BN(z, z_in))) {
1997         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1998         goto err;
1999     }
2000     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2001 
2002  err:
2003     BN_CTX_end(ctx);
2004     if (generator != NULL)
2005         EC_POINT_free(generator);
2006     if (new_ctx != NULL)
2007         BN_CTX_free(new_ctx);
2008     if (secrets != NULL)
2009         OPENSSL_free(secrets);
2010     if (pre_comp != NULL)
2011         OPENSSL_free(pre_comp);
2012     if (tmp_felems != NULL)
2013         OPENSSL_free(tmp_felems);
2014     return ret;
2015 }
2016 
ec_GFp_nistp521_precompute_mult(EC_GROUP * group,BN_CTX * ctx)2017 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2018 {
2019     int ret = 0;
2020     NISTP521_PRE_COMP *pre = NULL;
2021     int i, j;
2022     BN_CTX *new_ctx = NULL;
2023     BIGNUM *x, *y;
2024     EC_POINT *generator = NULL;
2025     felem tmp_felems[16];
2026 
2027     /* throw away old precomputation */
2028     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2029                          nistp521_pre_comp_free,
2030                          nistp521_pre_comp_clear_free);
2031     if (ctx == NULL)
2032         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2033             return 0;
2034     BN_CTX_start(ctx);
2035     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2036         goto err;
2037     /* get the generator */
2038     if (group->generator == NULL)
2039         goto err;
2040     generator = EC_POINT_new(group);
2041     if (generator == NULL)
2042         goto err;
2043     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2044     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2045     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2046         goto err;
2047     if ((pre = nistp521_pre_comp_new()) == NULL)
2048         goto err;
2049     /*
2050      * if the generator is the standard one, use built-in precomputation
2051      */
2052     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2053         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2054         goto done;
2055     }
2056     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2057         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2058         (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2059         goto err;
2060     /* compute 2^130*G, 2^260*G, 2^390*G */
2061     for (i = 1; i <= 4; i <<= 1) {
2062         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2063                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2064                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2065         for (j = 0; j < 129; ++j) {
2066             point_double(pre->g_pre_comp[2 * i][0],
2067                          pre->g_pre_comp[2 * i][1],
2068                          pre->g_pre_comp[2 * i][2],
2069                          pre->g_pre_comp[2 * i][0],
2070                          pre->g_pre_comp[2 * i][1],
2071                          pre->g_pre_comp[2 * i][2]);
2072         }
2073     }
2074     /* g_pre_comp[0] is the point at infinity */
2075     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2076     /* the remaining multiples */
2077     /* 2^130*G + 2^260*G */
2078     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2079               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2080               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2081               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2082               pre->g_pre_comp[2][2]);
2083     /* 2^130*G + 2^390*G */
2084     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2085               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2086               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2087               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2088               pre->g_pre_comp[2][2]);
2089     /* 2^260*G + 2^390*G */
2090     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2091               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2092               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2093               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2094               pre->g_pre_comp[4][2]);
2095     /* 2^130*G + 2^260*G + 2^390*G */
2096     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2097               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2098               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2099               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2100               pre->g_pre_comp[2][2]);
2101     for (i = 1; i < 8; ++i) {
2102         /* odd multiples: add G */
2103         point_add(pre->g_pre_comp[2 * i + 1][0],
2104                   pre->g_pre_comp[2 * i + 1][1],
2105                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2106                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2107                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2108                   pre->g_pre_comp[1][2]);
2109     }
2110     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2111 
2112  done:
2113     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2114                              nistp521_pre_comp_free,
2115                              nistp521_pre_comp_clear_free))
2116         goto err;
2117     ret = 1;
2118     pre = NULL;
2119  err:
2120     BN_CTX_end(ctx);
2121     if (generator != NULL)
2122         EC_POINT_free(generator);
2123     if (new_ctx != NULL)
2124         BN_CTX_free(new_ctx);
2125     if (pre)
2126         nistp521_pre_comp_free(pre);
2127     return ret;
2128 }
2129 
ec_GFp_nistp521_have_precompute_mult(const EC_GROUP * group)2130 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2131 {
2132     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2133                             nistp521_pre_comp_free,
2134                             nistp521_pre_comp_clear_free)
2135         != NULL)
2136         return 1;
2137     else
2138         return 0;
2139 }
2140 
2141 #else
2142 static void *dummy = &dummy;
2143 #endif
2144