quantica/ml_dsa/shuffle.rs
1//! Fisher-Yates shuffle for ML-DSA NTT butterfly index randomization
2//! (**countermeasure: SPA / SEMA on secret-polynomial NTT**).
3//!
4//! Same principle as the ML-KEM `shuffle` module, ported to the
5//! ML-DSA arithmetic (`q = 8 380 417`, `i32` coefficients,
6//! 256-coefficient polynomials, NTT all the way down to length-1).
7//!
8//! ## Principle
9//!
10//! Within a given NTT level, all butterfly groups (and all butterflies
11//! within a group) are independent — permuting their execution order
12//! does not affect correctness, only the pattern of memory accesses
13//! and instantaneous power consumption. By drawing a fresh permutation
14//! per level and per group from an SCA-dedicated CSPRNG, two successive
15//! NTTs on the same input produce different power / EM traces,
16//! defeating trace-alignment-based SPA and template attacks. The
17//! CSPRNG (`ScaRng`) is seeded with `K ‖ rnd ‖ tr ‖ M'` inside
18//! [`crate::ml_dsa::dsa::sign_internal`].
19//!
20//! Applied to `s1`, `s2`, `t0` (the three secret vectors). The public
21//! matrix `A` keeps the classical NTT for performance — public values
22//! need no shuffling.
23//!
24//! The primary entry point is `ntt_shuffled`, a drop-in replacement
25//! for `super::ntt::ntt` that draws from a [`CryptoRng`].
26//!
27//! ## References
28//!
29//! * *Hardware NTT shuffling as a lightweight countermeasure for
30//! ML-KEM* (arXiv, 2024) — original shuffling analysis; the
31//! construction transfers directly to ML-DSA.
32//! * *Slothy-assisted Cortex-M4/M7 implementations of ML-DSA*
33//! (IACR ePrint 2025) — performance measurements for the shuffled
34//! variant.
35//! * *Physical security considerations for ML-DSA* (NIST, 2025) —
36//! recommended posture including shuffling as an SPA mitigation.
37//!
38//! ## Where to look next
39//!
40//! * Countermeasure description and threat analysis:
41//! `doc/sca/countermeasures/ml_dsa.rst`, section *SPA / SEMA —
42//! Fisher-Yates shuffled NTT*.
43//! * Call site: [`crate::ml_dsa::dsa::sign_internal`], Step 1 of
44//! the protected branch (three `for` loops applying
45//! `ntt_shuffled` to each polynomial of `s1_hat`, `s2_hat`,
46//! `t0_hat`).
47
48use super::MlDsaError;
49use super::ntt::{ZETAS, mod_q};
50use super::params::N;
51use super::rng::CryptoRng;
52use alloc::vec;
53
54/// Generate a uniform random permutation of `0..n` in-place using
55/// Fisher-Yates with rejection sampling on 16-bit RNG output.
56///
57/// `perm` must be a slice of length `n`; on entry its contents are
58/// overwritten with the identity permutation, then shuffled in place.
59pub fn generate_permutation(perm: &mut [u16], rng: &mut dyn CryptoRng) -> Result<(), MlDsaError> {
60 let n = perm.len();
61 for i in 0..n {
62 perm[i] = i as u16;
63 }
64 let mut rand_buf = [0u8; 2];
65 for i in (1..n).rev() {
66 let bound = (i + 1) as u16;
67 // Rejection sampling to avoid modular bias.
68 let limit = u16::MAX - (u16::MAX % bound);
69 loop {
70 rng.fill_bytes(&mut rand_buf)?;
71 let r = u16::from_le_bytes(rand_buf);
72 if r < limit {
73 let j = (r % bound) as usize;
74 perm.swap(i, j);
75 break;
76 }
77 }
78 }
79 Ok(())
80}
81
82/// Forward NTT with randomized butterfly ordering (SPA countermeasure).
83///
84/// Functionally equivalent to `super::ntt::ntt` but draws fresh
85/// random permutations from `rng` for both the inter-group and
86/// intra-group butterfly orderings at each NTT level.
87///
88/// Uses the non-Montgomery [`ZETAS`] table together with the public
89/// `mul_mod_q` helper, so the implementation is fully self-contained
90/// — at the cost of being slightly slower than the in-place
91/// Montgomery butterflies in `super::ntt::ntt`. Acceptable for the
92/// SCA-protected build because the shuffled NTT only runs three times
93/// per signature (on `s1`, `s2`, `t0`), once at the start of
94/// `sign_internal` and never inside the rejection loop.
95///
96/// Output coefficients are in `[0, q-1]`, matching the regular NTT.
97pub fn ntt_shuffled(f: &mut [i32; N], rng: &mut dyn CryptoRng) -> Result<(), MlDsaError> {
98 let mut m = 0usize;
99 let mut len = 128;
100 while len >= 1 {
101 let num_groups = N / (2 * len);
102
103 // Shuffle the order of butterfly groups at this level.
104 let mut group_perm = vec![0u16; num_groups];
105 generate_permutation(&mut group_perm, rng)?;
106
107 // Within each group, also shuffle the order of butterflies.
108 // We must compute the per-group `m`/`zeta` index up front so
109 // that re-ordering the groups doesn't desync the zeta lookup.
110 for &gi in group_perm.iter() {
111 let group_index = gi as usize;
112 let start = group_index * 2 * len;
113 // ZETAS table is indexed by the linear butterfly group
114 // counter (m + 1 .. m + num_groups). The classical NTT
115 // increments `m` once per group; here we project the
116 // shuffled group index directly.
117 let zeta = ZETAS[m + 1 + group_index];
118
119 let mut inner_perm = vec![0u16; len];
120 generate_permutation(&mut inner_perm, rng)?;
121
122 for &ji in inner_perm.iter() {
123 let j = start + ji as usize;
124 // t = zeta * f[j+len] (mod q)
125 let t = mod_q(((zeta as i64 * f[j + len] as i64) % super::params::Q as i64) as i32);
126 f[j + len] = mod_q(f[j] - t);
127 f[j] = mod_q(f[j] + t);
128 }
129 }
130
131 m += num_groups;
132 len /= 2;
133 }
134 // Final reduction to [0, q-1] (most coefficients are already in
135 // range thanks to the per-butterfly mod_q above, but we normalize
136 // for safety / parity with `super::ntt::ntt`).
137 for c in f.iter_mut() {
138 *c = mod_q(*c);
139 }
140 Ok(())
141}
142
143#[cfg(test)]
144mod tests {
145 use super::super::ntt;
146 use super::*;
147
148 /// Tiny deterministic xorshift PRNG used only for tests so we
149 /// don't depend on `OsRng` (and thus the `std` feature).
150 struct TestRng(u64);
151 impl CryptoRng for TestRng {
152 fn fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), MlDsaError> {
153 for chunk in dest.chunks_mut(8) {
154 let mut x = self.0;
155 x ^= x << 13;
156 x ^= x >> 7;
157 x ^= x << 17;
158 self.0 = x;
159 let bytes = x.to_le_bytes();
160 for (i, b) in chunk.iter_mut().enumerate() {
161 *b = bytes[i];
162 }
163 }
164 Ok(())
165 }
166 }
167
168 #[test]
169 fn shuffled_ntt_matches_regular_ntt() {
170 let mut a = [0i32; N];
171 for i in 0..N {
172 a[i] = ((i as i32 * 12345 + 7).rem_euclid(super::super::params::Q)) as i32;
173 }
174 let mut b = a;
175
176 ntt::ntt(&mut a);
177
178 let mut rng = TestRng(0xDEADBEEFCAFEBABE);
179 ntt_shuffled(&mut b, &mut rng).unwrap();
180
181 assert_eq!(a, b, "shuffled NTT must match the regular NTT bit-for-bit");
182 }
183}