QSC Post Quantum Cryptographic Library 1.0.0.6c (A6)
A post-quantum secure library written in ANSI C
 
falconbase_avx2.h
/* 2025 Quantum Resistant Cryptographic Solutions Corporation
 * All Rights Reserved.
 *
 * NOTICE: This software and all accompanying materials are the exclusive
 * property of Quantum Resistant Cryptographic Solutions Corporation (QRCS).
 * The intellectual and technical concepts contained within this implementation
 * are proprietary to QRCS and its authorized licensors and are protected under
 * applicable U.S. and international copyright, patent, and trade secret laws.
 *
 * CRYPTOGRAPHIC STANDARDS:
 * - This software includes implementations of cryptographic algorithms such as
 *   SHA3, AES, and others. These algorithms are public domain or standardized
 *   by organizations such as NIST and are NOT the property of QRCS.
 * - However, all source code, optimizations, and implementations in this library
 *   are original works of QRCS and are protected under this license.
 *
 * RESTRICTIONS:
 * - Redistribution, modification, or unauthorized distribution of this software,
 *   in whole or in part, is strictly prohibited.
 * - This software is provided for non-commercial, educational, and research
 *   purposes only. Commercial use in any form is expressly forbidden.
 * - Licensing and authorized distribution are solely at the discretion of QRCS.
 * - Any use of this software implies acceptance of these restrictions.
 *
 * DISCLAIMER:
 * This software is provided "as is," without warranty of any kind, express or
 * implied, including but not limited to warranties of merchantability or fitness
 * for a particular purpose. QRCS disclaims all liability for any direct, indirect,
 * incidental, or consequential damages resulting from the use or misuse of this software.
 *
 * FULL LICENSE:
 * This software is subject to the **Quantum Resistant Cryptographic Solutions
 * Proprietary License (QRCS-PL)**. The complete license terms are included
 * in the LICENSE.txt file distributed with this software.
 *
 * Written by: John G. Underhill
 * Contact: john.underhill@protonmail.com
 */

#ifndef QSC_FALCONBASE_AVX2_H
#define QSC_FALCONBASE_AVX2_H

#include "common.h"

/* \cond */

QSC_CPLUSPLUS_ENABLED_START

#if defined(QSC_SYSTEM_HAS_AVX2)

#include "intrinsics.h"
#include "sha3.h"
#include <math.h>

/* api.h */

#if defined(QSC_FALCON_S3SHAKE256F512)
#	define CRYPTO_SECRETKEYBYTES 1281
#	define CRYPTO_PUBLICKEYBYTES 897
#	define CRYPTO_BYTES 690
#	define CRYPTO_ALGNAME "Falcon-512"
#elif defined(QSC_FALCON_S5SHAKE256F1024)
#	define CRYPTO_SECRETKEYBYTES 2305
#	define CRYPTO_PUBLICKEYBYTES 1793
#	define CRYPTO_BYTES 1330
#	define CRYPTO_ALGNAME "Falcon-1024"
#endif

/* falcon_fpr.h */

#define FALCON_FPR_GM_TAB_SIZE 2048
#define FALCON_FPR_INV_SIGMA_SIZE 11
#define FALCON_FPR_GM_P2_SIZE 11
#define FALCON_Q 12289
#define FALCON_Q0I 12287
#define FALCON_R 4091
#define FALCON_R2 10952
#define FALCON_GMB_SIZE 1024
#define FALCON_KEYGEN_TEMP_1 136
#define FALCON_KEYGEN_TEMP_2 272
#define FALCON_KEYGEN_TEMP_3 224
#define FALCON_KEYGEN_TEMP_4 448
#define FALCON_KEYGEN_TEMP_5 896
#define FALCON_KEYGEN_TEMP_6 1792
#define FALCON_KEYGEN_TEMP_7 3584
#define FALCON_KEYGEN_TEMP_8 7168
#define FALCON_KEYGEN_TEMP_9 14336
#define FALCON_KEYGEN_TEMP_10 28672
#define FALCON_SMALL_PRIME_SIZE 522
#define FALCON_GAUS_1024_12289_SIZE 27
#define FALCON_MAX_BL_SMALL_SIZE 11
#define FALCON_MAX_BL_LARGE_SIZE 10
#define FALCON_DEPTH_INT_FG 4
#define FALCON_NONCE_SIZE 40
#define FALCON_L2BOUND_SIZE 11
#define FALCON_MAXBITS_SIZE 11
#define FALCON_REV10_SIZE 1024

#if defined(__GNUC__)
#	if defined(FALCON_FMA)
#		define FALCON_TARGET_AVX2 __attribute__((target("avx2,fma")))
#	else
#		define FALCON_TARGET_AVX2 __attribute__((target("avx2")))
#	endif
#elif defined(_MSC_VER)
#	define FALCON_TARGET_AVX2
#	pragma warning( disable : 4752 )
#endif

inline static __m256d falcon_fmadd(__m256d a, __m256d b, __m256d c)
{
#if defined(FALCON_FMA)
    return _mm256_fmadd_pd(a, b, c);
#else
    __m256d tmp;
    tmp = _mm256_mul_pd(a, b);
    tmp = _mm256_add_pd(tmp, c);
    return tmp;
#endif
}

inline static __m256d falcon_fmsub(__m256d a, __m256d b, __m256d c)
{
    /* Note: artifact; unused function. */
#if defined(FALCON_FMA)
    return _mm256_fmsub_pd(a, b, c);
#else
    __m256d tmp;
    tmp = _mm256_mul_pd(a, b);
    return _mm256_sub_pd(tmp, c);
#endif
}

//inline static uint32_t falcon_set_fpu_cw(uint32_t x)
//{
//#if defined __GNUC__ && defined __i386__
//    uint32_t short t;
//    uint32_t old;
//
//    __asm__ __volatile__("fstcw %0" : "=m" (t) : : );
//    old = (t & 0x0300u) >> 8;
//    t = (uint32_t short)((t & ~0x0300u) | (x << 8));
//    __asm__ __volatile__("fldcw %0" : : "m" (t) : );
//    return old;
//#elif defined _M_IX86
//    uint32_t short t;
//    uint32_t old;
//
//    __asm { fstcw t }
//    old = (t & 0x0300u) >> 8;
//    t = (uint32_t short)((t & ~0x0300u) | (x << 8));
//    __asm { fldcw t }
//    return old;
//#else
//    return x;
//#endif
//}

/*
 * For optimal reproducibility of values, we need to disable contraction
 * of floating-point expressions; otherwise, on some architectures (e.g.
 * PowerPC), the compiler may generate fused-multiply-add opcodes that
 * may round differently than two successive separate opcodes. C99 defines
 * a standard pragma for that, but GCC-6.2.2 appears to ignore it,
 * hence the GCC-specific pragma (that Clang does not support).
 */
#if defined __clang__
#	pragma STDC FP_CONTRACT OFF
#elif defined __GNUC__
#	pragma GCC optimize ("fp-contract=off")
#endif

/* prng.c */

typedef struct
{
    QSC_ALIGN(8) uint8_t buf[512];
    QSC_ALIGN(8) uint8_t state[256];
    size_t ptr;
    int32_t type;
} falcon_prng_state;

inline static void falcon_chacha_round(uint32_t state[16], size_t a, size_t b, size_t c, size_t d)
{
    state[a] += state[b];
    state[d] ^= state[a];
    state[d] = (state[d] << 16) | (state[d] >> 16);
    state[c] += state[d];
    state[b] ^= state[c];
    state[b] = (state[b] << 12) | (state[b] >> 20);
    state[a] += state[b];
    state[d] ^= state[a];
    state[d] = (state[d] << 8) | (state[d] >> 24);
    state[c] += state[d];
    state[b] ^= state[c];
    state[b] = (state[b] << 7) | (state[b] >> 25);
}
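
/*
 * Illustrative sketch (not part of the original header): falcon_chacha_round()
 * is the ChaCha quarter-round. A full ChaCha double round applies it to the
 * four columns and then the four diagonals of the 4x4 state, as shown below.
 */
inline static void falcon_example_chacha_double_round(uint32_t state[16])
{
    /* column rounds */
    falcon_chacha_round(state, 0, 4, 8, 12);
    falcon_chacha_round(state, 1, 5, 9, 13);
    falcon_chacha_round(state, 2, 6, 10, 14);
    falcon_chacha_round(state, 3, 7, 11, 15);
    /* diagonal rounds */
    falcon_chacha_round(state, 0, 5, 10, 15);
    falcon_chacha_round(state, 1, 6, 11, 12);
    falcon_chacha_round(state, 2, 7, 8, 13);
    falcon_chacha_round(state, 3, 4, 9, 14);
}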

/*
 * We wrap the native 'double' type into a structure so that the C compiler
 * complains if we inadvertently use raw arithmetic operators on the 'falcon_fpr'
 * type instead of using the inline functions below. This should have no
 * extra runtime cost, since all the functions below are 'inline'.
 */
typedef struct { double v; } falcon_fpr;

static const falcon_fpr falcon_fpr_q = { 12289.0 };
static const falcon_fpr falcon_fpr_inverse_of_q = { 1.0 / 12289.0 };
static const falcon_fpr falcon_fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
static const falcon_fpr falcon_fpr_log2 = { 0.69314718055994530941723212146 };
static const falcon_fpr falcon_fpr_inv_log2 = { 1.4426950408889634073599246810 };
static const falcon_fpr falcon_fpr_bnorm_max = { 16822.4121 };
static const falcon_fpr falcon_fpr_zero = { 0.0 };
static const falcon_fpr falcon_fpr_one = { 1.0 };
static const falcon_fpr falcon_fpr_two = { 2.0 };
static const falcon_fpr falcon_fpr_onehalf = { 0.5 };
static const falcon_fpr falcon_fpr_invsqrt2 = { 0.707106781186547524400844362105 };
static const falcon_fpr falcon_fpr_invsqrt8 = { 0.353553390593273762200422181052 };
static const falcon_fpr falcon_fpr_ptwo31 = { 2147483648.0 };
static const falcon_fpr falcon_fpr_ptwo31m1 = { 2147483647.0 };
static const falcon_fpr falcon_fpr_mtwo31m1 = { -2147483647.0 };
static const falcon_fpr falcon_fpr_ptwo63m1 = { 9223372036854775807.0 };
static const falcon_fpr falcon_fpr_mtwo63m1 = { -9223372036854775807.0 };
static const falcon_fpr falcon_fpr_ptwo63 = { 9223372036854775808.0 };

extern const falcon_fpr falcon_avx2_fpr_inv_sigma[FALCON_FPR_INV_SIGMA_SIZE];

extern const falcon_fpr falcon_avx2_fpr_sigma_min[FALCON_FPR_INV_SIGMA_SIZE];

extern const falcon_fpr falcon_avx2_fpr_gm_tab[FALCON_FPR_GM_TAB_SIZE];

extern const falcon_fpr falcon_avx2_fpr_p2_tab[FALCON_FPR_GM_P2_SIZE];

inline static falcon_fpr falcon_FPR(double v)
{
    falcon_fpr x = { 0 };

    x.v = v;

    return x;
}

inline static falcon_fpr falcon_fpr_of(int64_t i)
{
    return falcon_FPR((double)i);
}

inline static int64_t falcon_fpr_rint(falcon_fpr x)
{
    /*
     * We do not want to use llrint() since it might not be
     * constant-time.
     *
     * Suppose that x >= 0. If x >= 2^52, then it is already an
     * integer. Otherwise, if x < 2^52, then computing x+2^52 will
     * yield a value that will be rounded to the nearest integer
     * with exactly the right rules (round-to-nearest-even).
     *
     * In order to have constant-time processing, we must do the
     * computation for both x >= 0 and x < 0 cases, and use a
     * cast to an integer to access the sign and select the proper
     * value. Such casts also allow us to find out if |x| < 2^52.
     */
    int64_t sx, tx, rp, rn, m;
    uint32_t ub;

    sx = (int64_t)(x.v - 1.0);
    tx = (int64_t)x.v;
    rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
    rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;

    /*
     * If tx >= 2^52 or tx < -2^52, then result is tx.
     * Otherwise, if sx >= 0, then result is rp.
     * Otherwise, result is rn. We use the fact that when x is
     * close to 0 (|x| <= 0.25) then both rp and rn are correct;
     * and if x is not close to 0, then trunc(x-1.0) yields the
     * appropriate sign.
     */

    /*
     * Clamp rp to zero if tx < 0.
     * Clamp rn to zero if tx >= 0.
     */
    m = sx >> 63;
    rn &= m;
    rp &= ~m;

    /*
     * Get the 12 upper bits of tx; if they are not all zeros or
     * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
     * rp and rn to zero. Otherwise, we clamp tx to zero.
     */
    ub = (uint32_t)((uint64_t)tx >> 52);
    m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
    rp &= m;
    rn &= m;
    tx &= ~m;

    /*
     * Only one of tx, rn or rp (at most) can be non-zero at this
     * point.
     */
    return tx | rn | rp;
}
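
/*
 * Worked example (added for clarity; assumes IEEE-754 round-to-nearest-even):
 * for x = 2.5, adding 2^52 forces rounding to an integer and the tie rounds to
 * the even value, so falcon_fpr_rint() returns 2; for x = 3.7 it returns 4.
 * The masking above only selects among tx, rp and rn without a data-dependent branch.
 */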

inline static int64_t falcon_fpr_floor(falcon_fpr x)
{
    int64_t r;

    /*
     * The cast performs a trunc() (rounding toward 0) and thus is
     * wrong by 1 for most negative values. The correction below is
     * constant-time as long as the compiler turns the
     * floating-point comparison result into a 0/1 integer without a
     * conditional branch or another non-constant-time construction.
     * This should hold on all modern architectures with an FPU (and
     * if it is false on a given arch, then chances are that the FPU
     * itself is not constant-time, making the point moot).
     */
    r = (int64_t)x.v;
    return r - (x.v < (double)r);
}
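
/*
 * Worked example (added for clarity): for x = -2.3 the cast truncates to -2,
 * and since -2.3 < -2.0 the comparison contributes 1, so the function returns
 * -3, the true floor; for non-negative x the comparison is 0 and the truncated
 * value is returned unchanged.
 */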

inline static int64_t falcon_fpr_trunc(falcon_fpr x)
{
    return (int64_t)x.v;
}

inline static falcon_fpr falcon_fpr_add(falcon_fpr x, falcon_fpr y)
{
    return falcon_FPR(x.v + y.v);
}

inline static falcon_fpr falcon_fpr_sub(falcon_fpr x, falcon_fpr y)
{
    return falcon_FPR(x.v - y.v);
}

inline static falcon_fpr falcon_fpr_neg(falcon_fpr x)
{
    return falcon_FPR(-x.v);
}

inline static falcon_fpr falcon_fpr_half(falcon_fpr x)
{
    return falcon_FPR(x.v * 0.5);
}

inline static falcon_fpr falcon_fpr_double(falcon_fpr x)
{
    return falcon_FPR(x.v + x.v);
}

inline static falcon_fpr falcon_fpr_mul(falcon_fpr x, falcon_fpr y)
{
    return falcon_FPR(x.v * y.v);
}

inline static falcon_fpr falcon_fpr_sqr(falcon_fpr x)
{
    return falcon_FPR(x.v * x.v);
}

inline static falcon_fpr falcon_fpr_inv(falcon_fpr x)
{
    return falcon_FPR(1.0 / x.v);
}

inline static falcon_fpr falcon_fpr_div(falcon_fpr x, falcon_fpr y)
{
    return falcon_FPR(x.v / y.v);
}

inline static void falcon_fpr_sqrt_avx2(double *t)
{
    __m128d x;

    x = _mm_load1_pd(t);
    x = _mm_sqrt_pd(x);
    _mm_storel_pd(t, x);
}

inline static falcon_fpr falcon_fpr_sqrt(falcon_fpr x)
{
    /*
     * We prefer not to have a dependency on libm when it can be
     * avoided. On x86, calling the sqrt() libm function inlines
     * the relevant opcode (fsqrt or sqrtsd, depending on whether
     * the 387 FPU or SSE2 is used for floating-point operations)
     * but then makes an optional call to the library function
     * for proper error handling, in case the operand is negative.
     *
     * To avoid this dependency, we use intrinsics or inline assembly
     * on recognized platforms:
     *
     * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
     *
     * - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
     *
     * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
     *   to call the 387 FPU fsqrt opcode.
     *
     * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
     *   the fsqrt opcode (Clang needs a special hack).
     *
     * - On GCC/Clang on ARM with hardware floating-point, we use
     *   inline assembly to call the vsqrt.f64 opcode. Due to a
     *   complex ecosystem of compilers and assembly syntaxes, we
     *   have to call it "fsqrt" or "fsqrtd", depending on case.
     *
     * If the platform is not recognized, a call to the system
     * library function sqrt() is performed. On some compilers, this
     * may actually inline the relevant opcode, and call the library
     * function only when the input is invalid (e.g. negative);
     * Falcon never actually calls sqrt() on a negative value, but
     * the dependency on libm will still be there.
     */

    falcon_fpr_sqrt_avx2(&x.v);

    return x;
}

inline static int32_t falcon_fpr_lt(falcon_fpr x, falcon_fpr y)
{
    return x.v < y.v;
}

inline static uint64_t falcon_fpr_expm_p63(falcon_fpr x, falcon_fpr ccs)
{
    /*
     * Polynomial approximation of exp(-x) is taken from FACCT:
     *   https://eprint.iacr.org/2018/1234
     * Specifically, values are extracted from the implementation
     * referenced from the FACCT article, and available at:
     *   https://github.com/raykzhao/gaussian
     * Tests over more than 24 billion random inputs in the
     * 0..log(2) range have never shown a deviation larger than
     * 2^(-50) from the true mathematical value.
     */

    /*
     * AVX2 implementation uses more operations than Horner's method,
     * but with a lower expression tree depth. This helps because
     * additions and multiplications have a latency of 4 cycles on
     * a Skylake, but the CPU can issue two of them per cycle.
     */

    static const union
    {
        double d[12];
        __m256d v[3];
    } c = {
        {
            0.999999999999994892974086724280,
            0.500000000000019206858326015208,
            0.166666666666984014666397229121,
            0.041666666666110491190622155955,
            0.008333333327800835146903501993,
            0.001388888894063186997887560103,
            0.000198412739277311890541063977,
            0.000024801566833585381209939524,
            0.000002755586350219122514855659,
            0.000000275607356160477811864927,
            0.000000025299506379442070029551,
            0.000000002073772366009083061987
        }
    };

    __m256d d14;
    __m256d d58;
    __m256d d9c;
    double d1;
    double d2;
    double d4;
    double d8;
    double y;

    d1 = -x.v;
    d2 = d1 * d1;
    d4 = d2 * d2;
    d8 = d4 * d4;
    d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
    d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
    d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
    d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
    d58 = falcon_fmadd(d58, _mm256_loadu_pd(&c.d[4]), d14);
    d9c = falcon_fmadd(d9c, _mm256_loadu_pd(&c.d[8]), d58);
    d9c = _mm256_hadd_pd(d9c, d9c);
    y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
    y *= ccs.v;

    /*
     * Final conversion goes through int64_t first, because that's what
     * the underlying opcode (vcvttsd2si) will do, and we know that the
     * result will fit, since x >= 0 and ccs < 1. If we did the
     * conversion directly to uint64_t, then the compiler would add some
     * extra code to cover the case of a source value of 2^63 or more,
     * and though the alternate path would never be exercised, the
     * extra comparison would cost us some cycles.
     */
    return (uint64_t)(int64_t)(y * falcon_fpr_ptwo63.v);
}
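
/*
 * Note (added for clarity): up to the stated approximation error, the value
 * returned above is trunc(2^63 * ccs * exp(-x)), a 63-bit fixed-point encoding
 * of ccs*exp(-x). In Falcon's Gaussian sampler this form allows the subsequent
 * rejection test to be carried out with plain integer comparisons.
 */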

inline static size_t falcon_mkn(uint32_t logn)
{
    return ((size_t)1 << logn);
}

/* fft.c */

inline static void falcon_fpc_add(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
{
    falcon_fpr fpct_re;
    falcon_fpr fpct_im;

    fpct_re = falcon_fpr_add(a_re, b_re);
    fpct_im = falcon_fpr_add(a_im, b_im);
    *d_re = fpct_re;
    *d_im = fpct_im;
}

inline static void falcon_fpc_sub(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
{
    falcon_fpr fpct_re;
    falcon_fpr fpct_im;

    fpct_re = falcon_fpr_sub(a_re, b_re);
    fpct_im = falcon_fpr_sub(a_im, b_im);
    *d_re = fpct_re;
    *d_im = fpct_im;
}

inline static void falcon_fpc_mul(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
{
    falcon_fpr fpct_a_re;
    falcon_fpr fpct_a_im;
    falcon_fpr fpct_b_re;
    falcon_fpr fpct_b_im;
    falcon_fpr fpct_d_re;
    falcon_fpr fpct_d_im;

    fpct_a_re = a_re;
    fpct_a_im = a_im;
    fpct_b_re = b_re;
    fpct_b_im = b_im;
    fpct_d_re = falcon_fpr_sub(falcon_fpr_mul(fpct_a_re, fpct_b_re), falcon_fpr_mul(fpct_a_im, fpct_b_im));
    fpct_d_im = falcon_fpr_add(falcon_fpr_mul(fpct_a_re, fpct_b_im), falcon_fpr_mul(fpct_a_im, fpct_b_re));
    *d_re = fpct_d_re;
    *d_im = fpct_d_im;
}

inline static void falcon_fpc_div(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
{
    falcon_fpr fpct_a_re;
    falcon_fpr fpct_a_im;
    falcon_fpr fpct_b_re;
    falcon_fpr fpct_b_im;
    falcon_fpr fpct_d_re;
    falcon_fpr fpct_d_im;
    falcon_fpr fpct_m;

    fpct_a_re = a_re;
    fpct_a_im = a_im;
    fpct_b_re = b_re;
    fpct_b_im = b_im;
    fpct_m = falcon_fpr_add(falcon_fpr_sqr(fpct_b_re), falcon_fpr_sqr(fpct_b_im));
    fpct_m = falcon_fpr_inv(fpct_m);
    fpct_b_re = falcon_fpr_mul(fpct_b_re, fpct_m);
    fpct_b_im = falcon_fpr_mul(falcon_fpr_neg(fpct_b_im), fpct_m);
    fpct_d_re = falcon_fpr_sub(falcon_fpr_mul(fpct_a_re, fpct_b_re), falcon_fpr_mul(fpct_a_im, fpct_b_im));
    fpct_d_im = falcon_fpr_add(falcon_fpr_mul(fpct_a_re, fpct_b_im), falcon_fpr_mul(fpct_a_im, fpct_b_re));
    *d_re = fpct_d_re;
    *d_im = fpct_d_im;
}

/* codec.c */

extern const uint8_t falcon_avx2_max_fg_bits[FALCON_MAXBITS_SIZE];
extern const uint8_t falcon_falcon_max_FG_bits[FALCON_MAXBITS_SIZE];

/* sign.c */

typedef struct
{
    falcon_prng_state p;
    falcon_fpr sigma_min;
} falcon_sampler_context;

typedef int32_t(*falcon_samplerZ)(void* ctx, falcon_fpr mu, falcon_fpr sigma);

inline static uint32_t falcon_ffLDL_treesize(uint32_t logn)
{
    /*
     * Get the size of the LDL tree for an input with polynomials of size
     * 2^logn. The size is expressed in the number of elements.
     * For logn = 0 (polynomials are constant), the "tree" is a
     * single element. Otherwise, the tree node has size 2^logn, and
     * has two child trees for size logn-1 each. Thus, treesize s()
     * must fulfill these two relations:
     *
     *   s(0) = 1
     *   s(logn) = (2^logn) + 2*s(logn-1)
     */

    return (logn + 1) << logn;
}
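
/*
 * Added note: the closed form above satisfies the recurrence, since
 * (logn + 1) << logn = 2^logn + 2 * (logn * 2^(logn - 1)), which matches
 * s(logn) = 2^logn + 2*s(logn - 1) with s(0) = 1.
 */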

inline static size_t falcon_skoff_b00(uint32_t logn)
{
    (void)logn;
    return 0;
}

inline static size_t falcon_skoff_b01(uint32_t logn)
{
    return falcon_mkn(logn);
}

inline static size_t falcon_skoff_b10(uint32_t logn)
{
    return 2 * falcon_mkn(logn);
}

inline static size_t falcon_skoff_b11(uint32_t logn)
{
    return 3 * falcon_mkn(logn);
}

inline static size_t falcon_skoff_tree(uint32_t logn)
{
    return 4 * falcon_mkn(logn);
}
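
/*
 * Added note (derived from the offsets above): the expanded signing key stores
 * the four basis polynomials b00, b01, b10 and b11 back to back, each with
 * n = 2^logn coefficients, followed by the ffLDL tree starting at offset 4*n.
 */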

/* keygen.c */

extern const uint32_t falcon_avx2_l2bound[FALCON_L2BOUND_SIZE];

extern const uint64_t falcon_avx2_gauss_1024_12289[FALCON_GAUS_1024_12289_SIZE];

extern const uint16_t falcon_avx2_falcon_rev10[FALCON_REV10_SIZE];

extern const size_t falcon_avx2_max_bl_small[FALCON_MAX_BL_SMALL_SIZE];

extern const size_t falcon_avx2_max_bl_large[FALCON_MAX_BL_LARGE_SIZE];

/*
 * Average and standard deviation for the maximum size (in bits) of
 * coefficients of (f,g), depending on depth. These values are used
 * to compute bounds for Babai's reduction.
 */
static const struct {
    int32_t avg;
    int32_t std;
} falcon_bit_length[] = {
    { 4, 0 },
    { 11, 1 },
    { 24, 1 },
    { 50, 1 },
    { 102, 1 },
    { 202, 2 },
    { 401, 4 },
    { 794, 5 },
    { 1577, 8 },
    { 3138, 13 },
    { 6308, 25 }
};

inline static uint32_t falcon_modp_set(int32_t x, uint32_t p)
{
    /*
     * Reduce a small signed integer modulo a small prime. The source
     * value x MUST be such that -p < x < p.
     */

    uint32_t w;

    w = (uint32_t)x;
    w += p & (uint32_t)-(int32_t)(w >> 31);
    return w;
}

inline static int32_t falcon_modp_norm(uint32_t x, uint32_t p)
{
    /*
     * Normalize a modular integer around 0.
     */

    return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
}

inline static uint32_t falcon_modp_ninv31(uint32_t p)
{
    /*
     * Compute -1/p mod 2^31. This works for all odd integers p that fit on 31 bits.
     */
    uint32_t y;

    y = 2 - p;
    y *= 2 - p * y;
    y *= 2 - p * y;
    y *= 2 - p * y;
    y *= 2 - p * y;

    return (uint32_t)0x7FFFFFFFUL & (uint32_t)-(int32_t)y;
}
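
/*
 * Added note: the iteration above is Newton's method for inverses modulo a
 * power of two; y = 2 - p is correct modulo 2^2, and each step doubles the
 * number of valid low bits, so four steps give 1/p mod 2^32, which is then
 * negated and masked to 31 bits. A minimal sanity-check sketch follows
 * (illustrative only, not part of the original API).
 */
inline static bool falcon_example_ninv31_check(uint32_t p)
{
    /* for any odd 31-bit p, p * ninv31(p) must equal -1 modulo 2^31 */
    return (((p * falcon_modp_ninv31(p)) + 1) & 0x7FFFFFFFUL) == 0;
}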

inline static uint32_t falcon_modp_R(uint32_t p)
{
    /*
     * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply 2^31 - p.
     */

    return ((uint32_t)1 << 31) - p;
}

inline static uint32_t falcon_modp_add(uint32_t a, uint32_t b, uint32_t p)
{
    /*
     * Addition modulo p.
     */

    uint32_t d;

    d = a + b - p;
    d += p & (uint32_t)-(int32_t)(d >> 31);

    return d;
}

inline static uint32_t falcon_modp_sub(uint32_t a, uint32_t b, uint32_t p)
{
    /*
     * Subtraction modulo p.
     */

    uint32_t d;

    d = a - b;
    d += p & (uint32_t)-(int32_t)(d >> 31);

    return d;
}

inline static uint32_t falcon_modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
{
    /*
     * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
     * It is required that p is an odd integer.
     */

    uint64_t w;
    uint64_t z;
    uint32_t d;

    z = (uint64_t)a * (uint64_t)b;
    w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
    d = (uint32_t)((z + w) >> 31) - p;
    d += p & (uint32_t)-(int32_t)(d >> 31);

    return d;
}

/* verify.c */

typedef struct
{
    uint32_t p;
    uint32_t g;
    uint32_t s;
} falcon_small_prime;

extern const uint16_t falcon_avx2_GMb[FALCON_GMB_SIZE];

extern const uint16_t falcon_avx2_iGMb[FALCON_GMB_SIZE];

extern const falcon_small_prime falcon_avx2_small_primes[FALCON_SMALL_PRIME_SIZE];

inline static uint32_t falcon_mq_conv_small(int32_t x)
{
    /*
     * Reduce a small signed integer modulo q. The source integer MUST
     * be between -q/2 and +q/2.
     * If x < 0, the cast to uint32_t will set the high bit to 1.
     */
    uint32_t y;

    y = (uint32_t)x;
    y += FALCON_Q & (uint32_t)-(int32_t)(y >> 31);

    return y;
}

inline static uint32_t falcon_mq_add(uint32_t x, uint32_t y)
{
    /*
     * Addition modulo q. Operands must be in the 0..q-1 range.
     * We compute x + y - q. If the result is negative, then the
     * high bit will be set, and 'd >> 31' will be equal to 1;
     * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
     * it will be an all-zero pattern. In other words, this
     * implements a conditional addition of q.
     */
    uint32_t d;

    d = x + y - FALCON_Q;
    d += FALCON_Q & (uint32_t)-(int32_t)(d >> 31);

    return d;
}

inline static uint32_t falcon_mq_sub(uint32_t x, uint32_t y)
{
    /*
     * Subtraction modulo q. Operands must be in the 0..q-1 range.
     * As in falcon_mq_add(), we use a conditional addition to ensure the
     * result is in the 0..q-1 range.
     */

    uint32_t d;

    d = x - y;
    d += FALCON_Q & (uint32_t)-(int32_t)(d >> 31);

    return d;
}

inline static uint32_t falcon_mq_rshift1(uint32_t x)
{
    /*
     * Division by 2 modulo q. Operand must be in the 0..q-1 range.
     */

    x += FALCON_Q & (uint32_t)-(int32_t)(x & 1);
    return (x >> 1);
}
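
/*
 * Worked example (added for clarity): for odd x the conditional addition makes
 * the value even before shifting, e.g. x = 5 gives (5 + 12289) >> 1 = 6147,
 * and indeed 2 * 6147 = 12294 = 5 mod q; for even x the shift alone halves it.
 */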

inline static uint32_t falcon_mq_montymul(uint32_t x, uint32_t y)
{
    /*
     * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
     * this function computes: x * y / R mod q
     * Operands must be in the 0..q-1 range.
     */

    uint32_t w;
    uint32_t z;

    /*
     * We compute x*y + k*q with a value of k chosen so that the 16
     * low bits of the result are 0. We can then shift the value.
     * After the shift, result may still be larger than q, but it
     * will be lower than 2*q, so a conditional subtraction works.
     */

    z = x * y;
    w = ((z * FALCON_Q0I) & 0x0000FFFFUL) * FALCON_Q;

    /*
     * When adding z and w, the result will have its low 16 bits
     * equal to 0. Since x and y are lower than q, z is at most
     * (q - 1)^2 and w is at most (2^16 - 1) * q, so the sum stays
     * below 2^30 and cannot overflow 32 bits.
     */
    z = (z + w) >> 16;

    /*
     * After the shift, analysis shows that the value will be less
     * than 2q. We do an unconditional subtraction of q followed by a
     * conditional addition of q to bring the result into the 0..q-1 range.
     */
    z -= FALCON_Q;
    z += FALCON_Q & (uint32_t)-(int32_t)(z >> 31);
    return z;
}
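
/*
 * Usage sketch (illustrative, not part of the original API): FALCON_R = 2^16 mod q
 * and FALCON_R2 = 2^32 mod q, so multiplying by FALCON_R2 converts an operand into
 * the Montgomery domain, and multiplying by 1 converts it back.
 */
inline static uint32_t falcon_example_to_monty(uint32_t x)
{
    /* returns x * 2^16 mod q, i.e. x in Montgomery representation */
    return falcon_mq_montymul(x, FALCON_R2);
}

inline static uint32_t falcon_example_from_monty(uint32_t x)
{
    /* returns x / 2^16 mod q, undoing the conversion above */
    return falcon_mq_montymul(x, 1);
}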

inline static uint32_t falcon_mq_montysqr(uint32_t x)
{
    /*
     * Montgomery squaring (computes (x^2)/R).
     */

    return falcon_mq_montymul(x, x);
}

int32_t qsc_falcon_avx2_generate_keypair(uint8_t *pk, uint8_t *sk, bool (*rng_generate)(uint8_t*, size_t));

int32_t qsc_falcon_avx2_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk, bool (*rng_generate)(uint8_t*, size_t));

bool qsc_falcon_avx2_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk);

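/*
 * Usage sketch (illustrative only, kept under #if 0 so it does not affect the
 * header): my_rng and falcon_example_sign_verify are hypothetical names; the
 * RNG callback must be supplied by the caller. Buffer sizes follow the
 * CRYPTO_* macros defined above.
 */
#if 0
static bool my_rng(uint8_t* output, size_t length); /* user-provided generator */

static bool falcon_example_sign_verify(void)
{
    uint8_t pk[CRYPTO_PUBLICKEYBYTES];
    uint8_t sk[CRYPTO_SECRETKEYBYTES];
    uint8_t msg[32] = { 0 };
    uint8_t sm[32 + CRYPTO_BYTES];
    uint8_t out[32 + CRYPTO_BYTES];
    size_t smlen = 0;
    size_t mlen = 0;

    qsc_falcon_avx2_generate_keypair(pk, sk, my_rng);
    qsc_falcon_avx2_sign(sm, &smlen, msg, sizeof(msg), sk, my_rng);

    /* returns true and recovers the message when the signature is valid */
    return qsc_falcon_avx2_open(out, &mlen, sm, smlen, pk);
}
#endif
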
#endif

QSC_CPLUSPLUS_ENABLED_END

/* \endcond */

#endif