40#ifndef QSC_FALCONBASE_AVX2_H
41#define QSC_FALCONBASE_AVX2_H
47QSC_CPLUSPLUS_ENABLED_START
49#if defined(QSC_SYSTEM_HAS_AVX2)
51#include "intrinsics.h"
57#if defined(QSC_FALCON_S3SHAKE256F512)
58# define CRYPTO_SECRETKEYBYTES 1281
59# define CRYPTO_PUBLICKEYBYTES 897
60# define CRYPTO_BYTES 690
61# define CRYPTO_ALGNAME "Falcon-512"
62#elif defined(QSC_FALCON_S5SHAKE256F1024)
63# define CRYPTO_SECRETKEYBYTES 2305
64# define CRYPTO_PUBLICKEYBYTES 1793
65# define CRYPTO_BYTES 1330
66# define CRYPTO_ALGNAME "Falcon-1024"
71#define FALCON_FPR_GM_TAB_SIZE 2048
72#define FALCON_FPR_INV_SIGMA_SIZE 11
73#define FALCON_FPR_GM_P2_SIZE 11
75#define FALCON_Q0I 12287
77#define FALCON_R2 10952
78#define FALCON_GMB_SIZE 1024
79#define FALCON_KEYGEN_TEMP_1 136
80#define FALCON_KEYGEN_TEMP_2 272
81#define FALCON_KEYGEN_TEMP_3 224
82#define FALCON_KEYGEN_TEMP_4 448
83#define FALCON_KEYGEN_TEMP_5 896
84#define FALCON_KEYGEN_TEMP_6 1792
85#define FALCON_KEYGEN_TEMP_7 3584
86#define FALCON_KEYGEN_TEMP_8 7168
87#define FALCON_KEYGEN_TEMP_9 14336
88#define FALCON_KEYGEN_TEMP_10 28672
89#define FALCON_SMALL_PRIME_SIZE 522
90#define FALCON_GAUS_1024_12289_SIZE 27
91#define FALCON_MAX_BL_SMALL_SIZE 11
92#define FALCON_MAX_BL_LARGE_SIZE 10
93#define FALCON_DEPTH_INT_FG 4
94#define FALCON_NONCE_SIZE 40
95#define FALCON_L2BOUND_SIZE 11
96#define FALCON_MAXBITS_SIZE 11
97#define FALCON_REV10_SIZE 1024
100# if defined(FALCON_FMA)
101# define FALCON_TARGET_AVX2 __attribute__((target("avx2,fma")))
103# define FALCON_TARGET_AVX2 __attribute__((target("avx2")))
105#elif defined(_MSC_VER)
106# define FALCON_TARGET_AVX2
107# pragma warning( disable : 4752 )
110inline static __m256d falcon_fmadd(__m256d a, __m256d b, __m256d c)
112#if defined(FALCON_FMA)
113 return _mm256_fmadd_pd(a, b, c);
116 tmp = _mm256_mul_pd(a, b);
117 tmp = _mm256_add_pd(tmp, c);
122inline static __m256d falcon_fmsub(__m256d a, __m256d b, __m256d c)
125#if defined(FALCON_FMA)
126 return _mm256_fmsub_pd(a, b, c);
129 tmp = _mm256_mul_pd(a, b);
130 return _mm256_sub_pd(tmp, c);
168# pragma STDC FP_CONTRACT OFF
169#elif defined __GNUC__
170# pragma GCC optimize ("fp-contract=off")
/**
* \brief Apply one ChaCha quarter round, in place, to four words of a state.
*
* The four indices select which words of the 16-word state take the roles
* a, b, c and d of the quarter round. Each step is add / xor / rotate-left,
* with rotation amounts 16, 12, 8 and 7 in that order.
*
* \param state: the 16-word state, updated in place
* \param a: index of the first word (must be below 16)
* \param b: index of the second word (must be below 16)
* \param c: index of the third word (must be below 16)
* \param d: index of the fourth word (must be below 16)
*/
inline static void falcon_chacha_round(uint32_t state[16], size_t a, size_t b, size_t c, size_t d)
{
	/* a += b; d ^= a; d <<<= 16 */
	state[a] += state[b];
	state[d] ^= state[a];
	state[d] = (state[d] << 16) | (state[d] >> 16);

	/* c += d; b ^= c; b <<<= 12 */
	state[c] += state[d];
	state[b] ^= state[c];
	state[b] = (state[b] << 12) | (state[b] >> 20);

	/* a += b; d ^= a; d <<<= 8 */
	state[a] += state[b];
	state[d] ^= state[a];
	state[d] = (state[d] << 8) | (state[d] >> 24);

	/* c += d; b ^= c; b <<<= 7 */
	state[c] += state[d];
	state[b] ^= state[c];
	state[b] = (state[b] << 7) | (state[b] >> 25);
}
/**
* \brief Floating-point value wrapper: a structure holding a single double.
*/
typedef struct
{
	double v;	/*!< the wrapped double-precision value */
} falcon_fpr;

/*
* Named falcon_fpr constants.
* Note: on IEEE-754 binary64, 9223372036854775807.0 is not exactly
* representable, so the two "two63m1" literals round to the nearest double.
*/
static const falcon_fpr falcon_fpr_q = { 12289.0 };
static const falcon_fpr falcon_fpr_inverse_of_q = { 1.0 / 12289.0 };
static const falcon_fpr falcon_fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
static const falcon_fpr falcon_fpr_log2 = { 0.69314718055994530941723212146 };
static const falcon_fpr falcon_fpr_inv_log2 = { 1.4426950408889634073599246810 };
static const falcon_fpr falcon_fpr_bnorm_max = { 16822.4121 };
static const falcon_fpr falcon_fpr_zero = { 0.0 };
static const falcon_fpr falcon_fpr_one = { 1.0 };
static const falcon_fpr falcon_fpr_two = { 2.0 };
static const falcon_fpr falcon_fpr_onehalf = { 0.5 };
static const falcon_fpr falcon_fpr_invsqrt2 = { 0.707106781186547524400844362105 };
static const falcon_fpr falcon_fpr_invsqrt8 = { 0.353553390593273762200422181052 };
static const falcon_fpr falcon_fpr_ptwo31 = { 2147483648.0 };
static const falcon_fpr falcon_fpr_ptwo31m1 = { 2147483647.0 };
static const falcon_fpr falcon_fpr_mtwo31m1 = { -2147483647.0 };
static const falcon_fpr falcon_fpr_ptwo63m1 = { 9223372036854775807.0 };
static const falcon_fpr falcon_fpr_mtwo63m1 = { -9223372036854775807.0 };
static const falcon_fpr falcon_fpr_ptwo63 = { 9223372036854775808.0 };
226extern const falcon_fpr falcon_avx2_fpr_inv_sigma[FALCON_FPR_INV_SIGMA_SIZE];
228extern const falcon_fpr falcon_avx2_fpr_sigma_min[FALCON_FPR_INV_SIGMA_SIZE];
230extern const falcon_fpr falcon_avx2_fpr_gm_tab[FALCON_FPR_GM_TAB_SIZE];
232extern const falcon_fpr falcon_avx2_fpr_p2_tab[FALCON_FPR_GM_P2_SIZE];
234inline static falcon_fpr falcon_FPR(
double v)
236 falcon_fpr x = { 0 };
243inline static falcon_fpr falcon_fpr_of(int64_t i)
245 return falcon_FPR((
double)i);
248inline static int64_t falcon_fpr_rint(falcon_fpr x)
264 int64_t sx, tx, rp, rn, m;
267 sx = (int64_t)(x.v - 1.0);
269 rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
270 rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
294 ub = (uint32_t)((uint64_t)tx >> 52);
295 m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
307inline static int64_t falcon_fpr_floor(falcon_fpr x)
322 return r - (x.v < (double)r);
325inline static int64_t falcon_fpr_trunc(falcon_fpr x)
330inline static falcon_fpr falcon_fpr_add(falcon_fpr x, falcon_fpr y)
332 return falcon_FPR(x.v + y.v);
335inline static falcon_fpr falcon_fpr_sub(falcon_fpr x, falcon_fpr y)
337 return falcon_FPR(x.v - y.v);
340inline static falcon_fpr falcon_fpr_neg(falcon_fpr x)
342 return falcon_FPR(-x.v);
345inline static falcon_fpr falcon_fpr_half(falcon_fpr x)
347 return falcon_FPR(x.v * 0.5);
350inline static falcon_fpr falcon_fpr_double(falcon_fpr x)
352 return falcon_FPR(x.v + x.v);
355inline static falcon_fpr falcon_fpr_mul(falcon_fpr x, falcon_fpr y)
357 return falcon_FPR(x.v * y.v);
360inline static falcon_fpr falcon_fpr_sqr(falcon_fpr x)
362 return falcon_FPR(x.v * x.v);
365inline static falcon_fpr falcon_fpr_inv(falcon_fpr x)
367 return falcon_FPR(1.0 / x.v);
370inline static falcon_fpr falcon_fpr_div(falcon_fpr x, falcon_fpr y)
372 return falcon_FPR(x.v / y.v);
375inline static void falcon_fpr_sqrt_avx2(
double *t)
384inline static falcon_fpr falcon_fpr_sqrt(falcon_fpr x)
420 falcon_fpr_sqrt_avx2(&x.v);
425inline static int32_t falcon_fpr_lt(falcon_fpr x, falcon_fpr y)
430inline static uint64_t falcon_fpr_expm_p63(falcon_fpr x, falcon_fpr ccs)
456 0.999999999999994892974086724280,
457 0.500000000000019206858326015208,
458 0.166666666666984014666397229121,
459 0.041666666666110491190622155955,
460 0.008333333327800835146903501993,
461 0.001388888894063186997887560103,
462 0.000198412739277311890541063977,
463 0.000024801566833585381209939524,
464 0.000002755586350219122514855659,
465 0.000000275607356160477811864927,
466 0.000000025299506379442070029551,
467 0.000000002073772366009083061987
484 d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
485 d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
486 d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
487 d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
488 d58 = falcon_fmadd(d58, _mm256_loadu_pd(&c.d[4]), d14);
489 d9c = falcon_fmadd(d9c, _mm256_loadu_pd(&c.d[8]), d58);
490 d9c = _mm256_hadd_pd(d9c, d9c);
491 y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
503 return (uint64_t)(int64_t)(y * falcon_fpr_ptwo63.v);
/**
* \brief Compute 2^logn as a size_t.
*
* \param logn: the base-2 exponent; must be smaller than the bit width of
* size_t, since a larger shift count is undefined behavior in C
* \return the value 1 shifted left by logn bits
*/
inline static size_t falcon_mkn(uint32_t logn)
{
	return ((size_t)1 << logn);
}
514inline static void falcon_fpc_add(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
519 fpct_re = falcon_fpr_add(a_re, b_re);
520 fpct_im = falcon_fpr_add(a_im, b_im);
525inline static void falcon_fpc_sub(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
530 fpct_re = falcon_fpr_sub(a_re, b_re);
531 fpct_im = falcon_fpr_sub(a_im, b_im);
536inline static void falcon_fpc_mul(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
538 falcon_fpr fpct_a_re;
539 falcon_fpr fpct_a_im;
540 falcon_fpr fpct_b_re;
541 falcon_fpr fpct_b_im;
542 falcon_fpr fpct_d_re;
543 falcon_fpr fpct_d_im;
549 fpct_d_re = falcon_fpr_sub(falcon_fpr_mul(fpct_a_re, fpct_b_re), falcon_fpr_mul(fpct_a_im, fpct_b_im));
550 fpct_d_im = falcon_fpr_add(falcon_fpr_mul(fpct_a_re, fpct_b_im), falcon_fpr_mul(fpct_a_im, fpct_b_re));
555inline static void falcon_fpc_div(falcon_fpr* d_re, falcon_fpr* d_im, falcon_fpr a_re, falcon_fpr a_im, falcon_fpr b_re, falcon_fpr b_im)
557 falcon_fpr fpct_a_re;
558 falcon_fpr fpct_a_im;
559 falcon_fpr fpct_b_re;
560 falcon_fpr fpct_b_im;
561 falcon_fpr fpct_d_re;
562 falcon_fpr fpct_d_im;
569 fpct_m = falcon_fpr_add(falcon_fpr_sqr(fpct_b_re), falcon_fpr_sqr(fpct_b_im));
570 fpct_m = falcon_fpr_inv(fpct_m);
571 fpct_b_re = falcon_fpr_mul(fpct_b_re, fpct_m);
572 fpct_b_im = falcon_fpr_mul(falcon_fpr_neg(fpct_b_im), fpct_m);
573 fpct_d_re = falcon_fpr_sub(falcon_fpr_mul(fpct_a_re, fpct_b_re), falcon_fpr_mul(fpct_a_im, fpct_b_im));
574 fpct_d_im = falcon_fpr_add(falcon_fpr_mul(fpct_a_re, fpct_b_im), falcon_fpr_mul(fpct_a_im, fpct_b_re));
581extern const uint8_t falcon_avx2_max_fg_bits[FALCON_MAXBITS_SIZE];
582extern const uint8_t falcon_falcon_max_FG_bits[FALCON_MAXBITS_SIZE];
589 falcon_fpr sigma_min;
590} falcon_sampler_context;
592typedef int32_t(*falcon_samplerZ)(
void* ctx, falcon_fpr mu, falcon_fpr sigma);
/**
* \brief Compute the tree size for a given degree exponent.
*
* The result is (logn + 1) * 2^logn, evaluated in 32-bit unsigned arithmetic.
*
* \param logn: the base-2 logarithm of the degree; must be below 32
* \return (logn + 1) shifted left by logn bits
*/
inline static uint32_t falcon_ffLDL_treesize(uint32_t logn)
{
	return (logn + 1) << logn;
}
611inline static size_t falcon_skoff_b00(uint32_t logn)
/**
* \brief Offset of the b01 element for a given degree exponent.
*
* \param logn: the base-2 logarithm of the degree
* \return falcon_mkn(logn), i.e. 2^logn
*/
inline static size_t falcon_skoff_b01(uint32_t logn)
{
	return falcon_mkn(logn);
}
/**
* \brief Offset of the b10 element for a given degree exponent.
*
* \param logn: the base-2 logarithm of the degree
* \return 2 * falcon_mkn(logn), i.e. 2 * 2^logn
*/
inline static size_t falcon_skoff_b10(uint32_t logn)
{
	return 2 * falcon_mkn(logn);
}
/**
* \brief Offset of the b11 element for a given degree exponent.
*
* \param logn: the base-2 logarithm of the degree
* \return 3 * falcon_mkn(logn), i.e. 3 * 2^logn
*/
inline static size_t falcon_skoff_b11(uint32_t logn)
{
	return 3 * falcon_mkn(logn);
}
/**
* \brief Offset of the tree element for a given degree exponent.
*
* \param logn: the base-2 logarithm of the degree
* \return 4 * falcon_mkn(logn), i.e. 4 * 2^logn
*/
inline static size_t falcon_skoff_tree(uint32_t logn)
{
	return 4 * falcon_mkn(logn);
}
639extern const uint32_t falcon_avx2_l2bound[FALCON_L2BOUND_SIZE];
641extern const uint64_t falcon_avx2_gauss_1024_12289[FALCON_GAUS_1024_12289_SIZE];
643extern const uint16_t falcon_avx2_falcon_rev10[FALCON_REV10_SIZE];
645extern const size_t falcon_avx2_max_bl_small[FALCON_MAX_BL_SMALL_SIZE];
647extern const size_t falcon_avx2_max_bl_large[FALCON_MAX_BL_LARGE_SIZE];
657} falcon_bit_length[] = {
671inline static uint32_t falcon_modp_set(int32_t x, uint32_t p)
681 w += p & (uint32_t)-(int32_t)(w >> 31);
/**
* \brief Map a residue in [0, p-1] to its signed representative.
*
* Branch-free: when x is at least (p + 1) / 2 the mask evaluates to all
* ones and p is subtracted; otherwise the mask is zero and x is returned
* unchanged.
*
* \param x: the residue; expected to be below p
* \param p: the modulus
* \return x when x < (p + 1) / 2, otherwise x - p (a negative value)
*/
inline static int32_t falcon_modp_norm(uint32_t x, uint32_t p)
{
	return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
}
694inline static uint32_t falcon_modp_ninv31(uint32_t p)
707 return (uint32_t)0x7FFFFFFFUL & (uint32_t)-(int32_t)y;
/**
* \brief Compute 2^31 - p.
*
* For a modulus p with 2^30 < p < 2^31 this difference equals 2^31 mod p.
*
* \param p: the modulus
* \return (1 << 31) - p, in 32-bit unsigned arithmetic
*/
inline static uint32_t falcon_modp_R(uint32_t p)
{
	return ((uint32_t)1 << 31) - p;
}
719inline static uint32_t falcon_modp_add(uint32_t a, uint32_t b, uint32_t p)
728 d += p & (uint32_t)-(int32_t)(d >> 31);
733inline static uint32_t falcon_modp_sub(uint32_t a, uint32_t b, uint32_t p)
742 d += p & (uint32_t)-(int32_t)(d >> 31);
747inline static uint32_t falcon_modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
758 z = (uint64_t)a * (uint64_t)b;
759 w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
760 d = (uint32_t)((z + w) >> 31) - p;
761 d += p & (uint32_t)-(int32_t)(d >> 31);
775extern const uint16_t falcon_avx2_GMb[FALCON_GMB_SIZE];
777extern const uint16_t falcon_avx2_iGMb[FALCON_GMB_SIZE];
779extern const falcon_small_prime falcon_avx2_small_primes[FALCON_SMALL_PRIME_SIZE];
781inline static uint32_t falcon_mq_conv_small(int32_t x)
791 y += FALCON_Q & (uint32_t)-(int32_t)(y >> 31);
796inline static uint32_t falcon_mq_add(uint32_t x, uint32_t y)
808 d = x + y - FALCON_Q;
809 d += FALCON_Q & (uint32_t)-(int32_t)(d >> 31);
814inline static uint32_t falcon_mq_sub(uint32_t x, uint32_t y)
825 d += FALCON_Q & (uint32_t)-(int32_t)(d >> 31);
830inline static uint32_t falcon_mq_rshift1(uint32_t x)
836 x += FALCON_Q & (uint32_t)-(int32_t)(x & 1);
840inline static uint32_t falcon_mq_montymul(uint32_t x, uint32_t y)
859 w = ((z * FALCON_Q0I) & 0x0000FFFFUL) * FALCON_Q;
875 z += FALCON_Q & (uint32_t)-(int32_t)(z >> 31);
/**
* \brief Square a value by passing it as both operands of falcon_mq_montymul.
*
* \param x: the value to square
* \return the result of falcon_mq_montymul(x, x)
*/
inline static uint32_t falcon_mq_montysqr(uint32_t x)
{
	return falcon_mq_montymul(x, x);
}
/**
* \brief Generate a Falcon key pair (AVX2 implementation).
*
* NOTE(review): the parameter roles are inferred from the names; buffer
* sizes and return-code meaning are not visible in this header. Confirm
* against the implementation.
*
* \param pk: output buffer, presumably for the public key
* \param sk: output buffer, presumably for the secret key
* \param rng_generate: callback taking a byte buffer and a length and
* returning a bool; presumably fills the buffer with random bytes
* \return an int32_t status code
*/
int32_t qsc_falcon_avx2_generate_keypair(uint8_t *pk, uint8_t *sk, bool (*rng_generate)(uint8_t*, size_t));
/**
* \brief Sign a message (AVX2 implementation).
*
* NOTE(review): the parameter roles are inferred from the names; buffer
* sizes and return-code meaning are not visible in this header. Confirm
* against the implementation.
*
* \param sm: output buffer, presumably for the signed message
* \param smlen: output, presumably the number of bytes written to sm
* \param m: [const] the input message
* \param mlen: the length of the input message in bytes
* \param sk: [const] presumably the secret key
* \param rng_generate: callback taking a byte buffer and a length and
* returning a bool; presumably fills the buffer with random bytes
* \return an int32_t status code
*/
int32_t qsc_falcon_avx2_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk, bool (*rng_generate)(uint8_t*, size_t));
/**
* \brief Open (verify) a signed message (AVX2 implementation).
*
* NOTE(review): the parameter roles are inferred from the names; buffer
* sizes and the meaning of the boolean result are not visible in this
* header. Confirm against the implementation.
*
* \param m: output buffer, presumably for the recovered message
* \param mlen: output, presumably the number of bytes written to m
* \param sm: [const] presumably the signed message
* \param smlen: the length of sm in bytes
* \param pk: [const] presumably the public key
* \return a bool result, presumably true when verification succeeds
*/
bool qsc_falcon_avx2_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk);
925QSC_CPLUSPLUS_ENABLED_END
Contains common definitions for the Quantum Secure Cryptographic (QSC) library.
#define QSC_ALIGN(x)
Macro for aligning data to 'x' bytes using GCC/Clang.
Definition common.h:593
SHA3 family of hash functions.