Skip to main content

quantica/ml_dsa/
masked.rs

1//! First-order arithmetic masking for ML-DSA polynomials.
2//!
3//! Same idea as the ML-KEM `masked` module but adapted to the
4//! ML-DSA arithmetic (q = 8 380 417, polynomial coefficients held as
5//! `i32`, NTT goes all the way down to length-1 components).
6//!
7//! Each secret polynomial is represented as two additive shares modulo
8//! `q`: `s = (s₀ + s₁) mod q`. All operations on secret data manipulate
9//! the shares independently, so a first-order side-channel attacker
10//! observing one share at a time learns nothing about the unmasked
11//! value.
12//!
13//! ## Sensitive operations protected
14//!
15//! In `dsa::sign_internal`, the secret polynomials `s1`, `s2`, `t0`
16//! are NTT-transformed once before the rejection-sampling loop, then
17//! multiplied by the per-iteration challenge polynomial `c` (which is
18//! public — verifier recomputes it):
19//!
20//! ```text
21//!   ŝ1, ŝ2, t̂0 ← NTT(s1), NTT(s2), NTT(t0)
22//!   for each rejection iteration:
23//!       ĉ ← NTT(c)
24//!       cs1[i]  ← ĉ · ŝ1[i]      // secret × public
25//!       cs2[i]  ← ĉ · ŝ2[i]      // secret × public
26//!       ct0[i]  ← ĉ · t̂0[i]      // secret × public
27//! ```
28//!
29//! The masked variants in this module replace `ŝ1`, `ŝ2`, `t̂0` with
30//! `MaskedPoly` containers and provide a `pointwise_mul_public`
31//! that multiplies each share independently. Because `ĉ` is public,
32//! no secret×secret multiplication occurs and first-order masking is
33//! sufficient.
34//!
35//! ## Available operations
36//!
37//! | Function                             | Description                                                |
38//! |--------------------------------------|------------------------------------------------------------|
39//! | `MaskedPoly::mask`                 | Split a plaintext polynomial into two shares               |
40//! | `MaskedPoly::unmask`               | Reconstruct the polynomial from shares                     |
41//! | `MaskedPoly::refresh`              | Re-randomize shares (prevents correlation buildup)         |
42//! | `MaskedPoly::zeroize`              | DSE-resistant wipe of both shares                          |
43//! | `masked_ntt`                       | Forward NTT applied to each share                          |
44//! | `masked_ntt_inv`                   | Inverse NTT applied to each share                          |
45//! | `masked_pointwise_mul_public`      | Masked × public pointwise mul (returns a `MaskedPoly`)   |
46//!
47//! ## Masked `y` pipeline (`sca-masked-y`)
48//!
//! `MaskedPoly::sample_expand_mask` samples `y` directly as two shares
50//! drawn from SHAKE256. The shares propagate through
51//! `masked_ntt` and `masked_mat_vec_mul` / `masked_mat_vec_mul_lazy`
52//! so that the intermediate `w = A·y` stays in masked form until the
53//! rejection loop commits to emitting it. This closes the DPA
54//! recovery of `s1` from `z = y + c·s1` that exists on any unmasked
55//! implementation. See *Side-channel analysis of masked y-sampling
56//! in ML-DSA* (IACR ePrint 2025/276) and the countermeasure chapter
57//! at `doc/sca/countermeasures/ml_dsa.rst`, section *DPA on `y` —
58//! the `sca-masked-y` pipeline*.
59//!
60//! ## References
61//!
62//! * *Hardware masking of ML-DSA* (IACR ePrint 2024,
63//!   `doc/papers/eprint2024_mldsa_hw_masking.pdf`) — reference
64//!   construction, we follow the same share topology.
//! * *Side-channel analysis of masked y-sampling in ML-DSA*
//!   (IACR ePrint 2025/276) — basis for
//!   `MaskedPoly::sample_expand_mask` + propagation through the linear
//!   stage.
69//! * *Physical security considerations for ML-DSA* (NIST, 2025) —
70//!   masking recommendation for high-assurance profiles.
71//!
72//! ## Where to look next
73//!
74//! * Countermeasure description and threat analysis:
75//!   `doc/sca/countermeasures/ml_dsa.rst`, sections *DPA — first-
76//!   order masking of secret polynomials* and *DPA on `y` — the
77//!   `sca-masked-y` pipeline*.
78//! * Call sites: [`crate::ml_dsa::dsa::sign_internal`] (look for
79//!   `#[cfg(feature = "sca-protected")]` and
80//!   `#[cfg(feature = "sca-masked-y")]` blocks).
81//!
82//! ## Scope and residual risk
83//!
84//! Masking here is **first-order**. The shipped Tier-1 item
85//! `T1-A` (A3, refresh shares at the start of every rejection
86//! iteration, head-of-loop refresh block in `dsa.rs`) raises the
87//! effort required by a higher-order DPA that combines leakage
88//! across iterations. Going beyond first-order (full higher-order
89//! masking) is tracked as Tier-4 `T4-C`.
90
91use super::MlDsaError;
92use super::ntt::{self, mod_q};
93use super::params::{N, Q};
94use super::rng::CryptoRng;
95
/// A polynomial split into two additive shares modulo `q`.
///
/// Maintains the invariant `unmask()[i] = (share0[i] + share1[i]) mod q`
/// for all `i in 0..N`. Both shares are stored with coefficients in
/// `[0, q-1]`. Neither share alone reveals any information about the
/// underlying polynomial.
///
/// Both fields are `pub`, so code outside this module can write to the
/// shares directly; such code is responsible for keeping the sum
/// invariant intact.
///
/// NOTE(review): no `Drop` implementation is visible in this file, so
/// the shares are presumably only wiped when `zeroize` is called
/// explicitly — confirm against the call sites.
pub struct MaskedPoly {
    /// First additive share.
    pub share0: [i32; N],
    /// Second additive share.
    pub share1: [i32; N],
}
108
109impl MaskedPoly {
110    /// Build an all-zero `MaskedPoly`. Both shares are zero, so
111    /// `unmask()` returns the zero polynomial. Useful as a stack
112    /// initializer for fixed-size arrays of masked polynomials.
113    pub const fn zero() -> Self {
114        Self {
115            share0: [0i32; N],
116            share1: [0i32; N],
117        }
118    }
119
120    /// Masked sampling of a masking vector polynomial from a SHAKE256
121    /// stream — the DPA-safe replacement for `sample::expand_mask`.
122    ///
123    /// Implements ExpandMask (FIPS 204 Algorithm 34) but produces a
124    /// two-share arithmetic representation `(share0, share1)` directly,
125    /// without ever materializing the unmasked y coefficient in a stack
126    /// or heap slot.
127    ///
128    /// ## Threat model
129    ///
130    /// Boolean-masked y is attackable with ~300 traces per the
131    /// Hermelink-Ning-Petri result (ePrint 2025/276). Arithmetic
132    /// masking is more robust but still requires careful
133    /// implementation: the key invariant is that the unmasked
134    /// coefficient value must only exist transiently in a CPU
135    /// register, never be written to RAM.
136    ///
137    /// ## Implementation
138    ///
139    /// For each coefficient:
140    ///   1. Decode the unmasked `y_i` from SHAKE256 output bytes
141    ///      into a stack-local `let y_i: i32 = ...` (register-scoped).
142    ///   2. Draw a fresh random mask `r_i` from the same SHAKE256
143    ///      stream (a separate squeeze block).
144    ///   3. Compute `share1_i = r_i mod q`, `share0_i = (y_i - r_i) mod q`.
145    ///   4. Write both shares to the output `MaskedPoly`.
146    ///   5. `y_i` and `r_i` go out of scope immediately.
147    ///
148    /// The two SHAKE256 streams (y bits and mask bits) are drawn from
149    /// the same state: we first squeeze the packed-y bytes, then
150    /// squeeze additional bytes for the mask. This keeps the function
151    /// deterministic for a given `rho'' || nonce`, so the signature
152    /// remains reproducible (ACVP-compatible).
153    ///
154    /// # Arguments
155    ///
156    /// * `rho_double_prime` — 64-byte seed (FIPS 204).
157    /// * `nonce` — the per-polynomial nonce (`kappa + r`).
158    /// * `gamma1` — the Γ₁ parameter for the current ML-DSA level.
159    /// * `bitlen_gamma1_minus1` — bit length used by ExpandMask (17 or 19).
160    pub fn sample_expand_mask(
161        rho_double_prime: &[u8; 64],
162        nonce: u16,
163        gamma1: i32,
164        bitlen_gamma1_minus1: usize,
165    ) -> Self {
166        use super::sha3;
167        let c = bitlen_gamma1_minus1 + 1; // bits per coefficient
168        let poly_bytes = 32 * c; // packed y bytes for 256 coeffs
169        let mask_bytes = N * 4; // 1024 bytes of mask randomness
170
171        let mut state = sha3::shake256();
172        state.absorb(rho_double_prime);
173        state.absorb(&nonce.to_le_bytes());
174
175        // Two-phase squeeze: y packed bytes, then mask bytes.
176        let mut y_buf = [0u8; 640]; // 32 * 20 = max poly_bytes
177        state.squeeze(&mut y_buf[..poly_bytes]);
178        let mut mask_buf = [0u8; N * 4];
179        state.squeeze(&mut mask_buf);
180
181        let mut mp = Self::zero();
182        // Tight per-coefficient loop — the unmasked y_i only exists
183        // in registers between the two writes to mp.share0 and
184        // mp.share1 (the compiler optimizer is free to keep it in
185        // registers; no intermediate array is used).
186        let mut bit_pos = 0usize;
187        let b = gamma1 as u32;
188        let range = (gamma1 as u32 - 1) + gamma1 as u32; // a + b (bit_unpack semantics)
189        let bits = 32 - range.leading_zeros() as usize;
190        debug_assert_eq!(bits, c);
191
192        for i in 0..N {
193            // 1. Decode y_i (register-scoped, never on the stack as a poly).
194            let mut val = 0u32;
195            for bit in 0..bits {
196                if (y_buf[bit_pos / 8] >> (bit_pos % 8)) & 1 == 1 {
197                    val |= 1 << bit;
198                }
199                bit_pos += 1;
200            }
201            let y_i = b as i32 - val as i32;
202
203            // 2. Fresh mask r_i from the 4-byte mask chunk.
204            let r = u32::from_le_bytes([
205                mask_buf[4 * i],
206                mask_buf[4 * i + 1],
207                mask_buf[4 * i + 2],
208                mask_buf[4 * i + 3],
209            ]);
210            let r_i = (r % (Q as u32)) as i32;
211
212            // 3. share1 = r, share0 = y - r   (mod q).
213            mp.share1[i] = r_i;
214            mp.share0[i] = mod_q(y_i - r_i);
215            // y_i and r_i fall out of scope here.
216        }
217        zeroize_bytes(&mut y_buf);
218        zeroize_bytes(&mut mask_buf);
219        mp
220    }
221
222    /// Split a plaintext polynomial into two random additive shares.
223    ///
224    /// Generates a uniformly distributed `share1 ∈ [0, q-1]^N` from the
225    /// RNG, then sets `share0 = poly - share1 mod q`. The intermediate
226    /// random bytes are zeroized after use.
227    ///
228    /// # Errors
229    ///
230    /// Returns [`MlDsaError::RngFailure`] if the RNG fails.
231    pub fn mask(poly: &[i32; N], rng: &mut dyn CryptoRng) -> Result<Self, MlDsaError> {
232        let mut share0 = [0i32; N];
233        let mut share1 = [0i32; N];
234        // q < 2^23 so 4 bytes is enough; we keep rejection-bias-low by
235        // taking each 32-bit word and reducing mod q.
236        let mut rand_bytes = [0u8; N * 4];
237        rng.fill_bytes(&mut rand_bytes)?;
238
239        for i in 0..N {
240            let r = u32::from_le_bytes([
241                rand_bytes[4 * i],
242                rand_bytes[4 * i + 1],
243                rand_bytes[4 * i + 2],
244                rand_bytes[4 * i + 3],
245            ]);
246            // Reduce a 32-bit word mod q. Bias is 2^32 mod q ≈ 2^9 over
247            // 2^32, ~10^-7 — negligible for masking purposes (security
248            // properties of masking only require the share to be
249            // statistically close to uniform).
250            share1[i] = (r % (Q as u32)) as i32;
251        }
252        zeroize_bytes(&mut rand_bytes);
253
254        for i in 0..N {
255            share0[i] = mod_q(poly[i] - share1[i]);
256        }
257
258        Ok(MaskedPoly { share0, share1 })
259    }
260
261    /// Reconstruct the plaintext polynomial from the two shares.
262    ///
263    /// Result coefficients are in `[0, q-1]`. The returned polynomial
264    /// is unmasked secret data and should be zeroized after use.
265    pub fn unmask(&self) -> [i32; N] {
266        let mut out = [0i32; N];
267        for i in 0..N {
268            out[i] = mod_q(self.share0[i] + self.share1[i]);
269        }
270        out
271    }
272
273    /// Securely erase both shares via volatile writes.
274    pub fn zeroize(&mut self) {
275        zeroize_poly(&mut self.share0);
276        zeroize_poly(&mut self.share1);
277    }
278
279    /// Re-randomize the shares without changing the unmasked value.
280    ///
281    /// Draws a fresh random polynomial `r` and updates the shares as
282    /// `share0' = share0 - r mod q`, `share1' = share1 + r mod q`.
283    /// The sum is preserved: `share0' + share1' ≡ share0 + share1`.
284    /// Refreshing prevents higher-order correlation buildup when the
285    /// same masked polynomial is reused across multiple operations.
286    pub fn refresh(&mut self, rng: &mut dyn CryptoRng) -> Result<(), MlDsaError> {
287        let mut rand_bytes = [0u8; N * 4];
288        rng.fill_bytes(&mut rand_bytes)?;
289        for i in 0..N {
290            let r = (u32::from_le_bytes([
291                rand_bytes[4 * i],
292                rand_bytes[4 * i + 1],
293                rand_bytes[4 * i + 2],
294                rand_bytes[4 * i + 3],
295            ]) % (Q as u32)) as i32;
296            self.share0[i] = mod_q(self.share0[i] - r);
297            self.share1[i] = mod_q(self.share1[i] + r);
298        }
299        zeroize_bytes(&mut rand_bytes);
300        Ok(())
301    }
302}
303
304// =====================================================================
305// Linear masked operations
306// =====================================================================
307//
308// The NTT and pointwise-multiplication-by-public-data are linear, so
309// we can apply them to each share independently and the masking
310// invariant is preserved:
311//
312//   NTT(s₀ + s₁) = NTT(s₀) + NTT(s₁)
313//   p · (s₀ + s₁) = p·s₀ + p·s₁
314
315/// Apply the forward NTT to each share independently.
316pub fn masked_ntt(m: &mut MaskedPoly) {
317    ntt::ntt(&mut m.share0);
318    ntt::ntt(&mut m.share1);
319}
320
321/// Apply the inverse NTT to each share independently.
322pub fn masked_ntt_inv(m: &mut MaskedPoly) {
323    ntt::ntt_inv(&mut m.share0);
324    ntt::ntt_inv(&mut m.share1);
325}
326
327/// Pointwise multiply a masked polynomial by a **public** polynomial
328/// in NTT domain. Returns a fresh `MaskedPoly` holding the product.
329///
330/// `c_hat` (the per-iteration challenge polynomial in NTT form) is
331/// public — the verifier recomputes it, and any side-channel
332/// observation of it does not help an attacker recover the secret
333/// shares.
334pub fn masked_pointwise_mul_public(masked: &MaskedPoly, c_hat: &[i32; N]) -> MaskedPoly {
335    MaskedPoly {
336        share0: ntt::pointwise_mul(&masked.share0, c_hat),
337        share1: ntt::pointwise_mul(&masked.share1, c_hat),
338    }
339}
340
341/// Masked matrix-vector multiplication in the NTT domain: for each
342/// output row `i`, compute `sum_j (A_hat[i][j] · y_hat_m[j])` as a
343/// masked accumulator.
344///
345/// Since `A_hat` is public (derived from the public seed `rho`) and
346/// the NTT is linear, each share is multiplied independently by the
347/// same public matrix and accumulated into the corresponding output
348/// share — no secret×secret operation occurs, so first-order shares
349/// remain sufficient.
350///
351/// `y_hat_m` must have length ≥ `l` (the first `l` masked polynomials
352/// are consumed). `out` must have length ≥ `k`.
353pub fn masked_mat_vec_mul(
354    a_hat: &[[[i32; N]; super::params::MAX_L]; super::params::MAX_K],
355    y_hat_m: &[MaskedPoly],
356    k: usize,
357    l: usize,
358    out: &mut [MaskedPoly],
359) {
360    for i in 0..k {
361        out[i].share0 = [0i32; N];
362        out[i].share1 = [0i32; N];
363        for j in 0..l {
364            let prod0 = ntt::pointwise_mul(&a_hat[i][j], &y_hat_m[j].share0);
365            let prod1 = ntt::pointwise_mul(&a_hat[i][j], &y_hat_m[j].share1);
366            for n in 0..N {
367                out[i].share0[n] = mod_q(out[i].share0[n] + prod0[n]);
368                out[i].share1[n] = mod_q(out[i].share1[n] + prod1[n]);
369            }
370        }
371    }
372}
373
374/// Low-memory variant of `masked_mat_vec_mul`: recomputes each
375/// `a_hat[i][j]` polynomial on-the-fly from the public seed `rho`
376/// via SHAKE128 instead of holding the full k×l matrix in memory.
377///
378/// Trade-off identical to `mat_vec_mul_lazy` (dsa.rs): saves up to
379/// 57 KB of stack for the `a_hat` matrix at the cost of repeated
380/// SHAKE128 invocations. Called from `sign_internal` when both
381/// `sca-masked-y` and `low-mem` are enabled.
382pub fn masked_mat_vec_mul_lazy(rho: &[u8; 32], y_hat_m: &[MaskedPoly], k: usize, l: usize, out: &mut [MaskedPoly]) {
383    use super::sample;
384    for i in 0..k {
385        out[i].share0 = [0i32; N];
386        out[i].share1 = [0i32; N];
387        for j in 0..l {
388            let a_ij = sample::rej_ntt_poly(rho, j as u8, i as u8);
389            let prod0 = ntt::pointwise_mul(&a_ij, &y_hat_m[j].share0);
390            let prod1 = ntt::pointwise_mul(&a_ij, &y_hat_m[j].share1);
391            for n in 0..N {
392                out[i].share0[n] = mod_q(out[i].share0[n] + prod0[n]);
393                out[i].share1[n] = mod_q(out[i].share1[n] + prod1[n]);
394            }
395        }
396    }
397}
398
399// =====================================================================
400// Local DSE-resistant zeroization helpers
401// =====================================================================
402
403/// Securely erase a polynomial buffer (write_volatile + compiler fence).
404fn zeroize_poly(p: &mut [i32; N]) {
405    for c in p.iter_mut() {
406        unsafe { core::ptr::write_volatile(c, 0) };
407    }
408    core::sync::atomic::compiler_fence(core::sync::atomic::Ordering::SeqCst);
409}
410
/// Overwrite every byte of `b` with zero.
///
/// Each store is a `write_volatile`, which the compiler may not elide
/// as a dead store, and a `SeqCst` compiler fence follows the loop.
/// An empty slice is a no-op apart from the fence.
fn zeroize_bytes(b: &mut [u8]) {
    use core::sync::atomic::{compiler_fence, Ordering};

    b.iter_mut().for_each(|slot| {
        // SAFETY: `slot` is derived from a live `&mut u8`, so the
        // pointer is valid, aligned and exclusively borrowed for the write.
        unsafe { core::ptr::write_volatile(slot as *mut u8, 0) }
    });
    compiler_fence(Ordering::SeqCst);
}
418
419#[cfg(test)]
420mod tests {
421    use super::super::ntt;
422    use super::*;
423
424    /// Same deterministic test PRNG as in `shuffle.rs::tests`.
425    struct TestRng(u64);
426    impl CryptoRng for TestRng {
427        fn fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), MlDsaError> {
428            for chunk in dest.chunks_mut(8) {
429                let mut x = self.0;
430                x ^= x << 13;
431                x ^= x >> 7;
432                x ^= x << 17;
433                self.0 = x;
434                let bytes = x.to_le_bytes();
435                for (i, b) in chunk.iter_mut().enumerate() {
436                    *b = bytes[i];
437                }
438            }
439            Ok(())
440        }
441    }
442
443    fn fixture_poly() -> [i32; N] {
444        let mut p = [0i32; N];
445        for i in 0..N {
446            p[i] = ((i as i32 * 12345 + 7).rem_euclid(Q)) as i32;
447        }
448        p
449    }
450
451    #[test]
452    fn masked_mat_vec_mul_matches_unmasked() {
453        use super::super::ntt as dsa_ntt;
454        use super::super::params::{MAX_K, MAX_L, MlDsa65, Params as ParamsT};
455        use super::super::sample;
456
457        let k = MlDsa65::K;
458        let l = MlDsa65::L;
459        let rho = [0x17u8; 32];
460
461        // Reference: unmasked A·y
462        let a_hat = sample::expand_a::<MlDsa65>(&rho);
463        let mut y = [[0i32; N]; MAX_L];
464        for j in 0..l {
465            for n in 0..N {
466                y[j][n] = ((j as i32 * 100 + n as i32) % Q).abs();
467            }
468            dsa_ntt::ntt(&mut y[j]);
469        }
470        let mut w_ref = [[0i32; N]; MAX_K];
471        for i in 0..k {
472            for j in 0..l {
473                let prod = dsa_ntt::pointwise_mul(&a_hat[i][j], &y[j]);
474                for n in 0..N {
475                    w_ref[i][n] = mod_q(w_ref[i][n] + prod[n]);
476                }
477            }
478        }
479
480        // Masked path: mask y, call masked_mat_vec_mul, unmask result
481        let mut rng = TestRng(0xFEEDC0DEBADCAFEu64);
482        let mut y_m: [MaskedPoly; MAX_L] = core::array::from_fn(|_| MaskedPoly::zero());
483        for j in 0..l {
484            y_m[j] = MaskedPoly::mask(&y[j], &mut rng).unwrap();
485        }
486        let mut w_m: [MaskedPoly; MAX_K] = core::array::from_fn(|_| MaskedPoly::zero());
487        masked_mat_vec_mul(&a_hat, &y_m, k, l, &mut w_m);
488
489        for i in 0..k {
490            let w_got = w_m[i].unmask();
491            assert_eq!(w_got, w_ref[i], "mismatch at row {}", i);
492        }
493    }
494
495    #[test]
496    fn masked_expand_mask_matches_unmasked_expand_mask() {
497        use super::super::params::MlDsa65;
498        use super::super::params::Params as _;
499        use super::super::sample;
500
501        let rho = [0x42u8; 64];
502        let kappa = 0u16;
503        let gamma1 = MlDsa65::GAMMA1;
504        let bitlen = MlDsa65::BITLEN_GAMMA1_MINUS1;
505
506        // Reference: unmasked ExpandMask for row 0
507        let y_ref = sample::expand_mask::<MlDsa65>(&rho, kappa);
508
509        // Masked path: two shares, unmask to compare
510        let mp = MaskedPoly::sample_expand_mask(&rho, kappa, gamma1, bitlen);
511        let unmasked = mp.unmask();
512
513        // y_ref[0] coefficients are in (-gamma1, gamma1] (centered);
514        // unmasked() returns in [0, q-1]. Normalize both to the centered
515        // range for comparison.
516        for i in 0..N {
517            let r = y_ref[0][i];
518            let u = {
519                let mut v = unmasked[i];
520                if v > Q / 2 {
521                    v -= Q;
522                }
523                v
524            };
525            assert_eq!(r, u, "mismatch at i={}: ref={}, masked={}", i, r, u);
526        }
527    }
528
529    #[test]
530    fn mask_unmask_roundtrip() {
531        let p = fixture_poly();
532        let mut rng = TestRng(0xCAFEF00DDEADBEEF);
533        let mp = MaskedPoly::mask(&p, &mut rng).unwrap();
534        let recovered = mp.unmask();
535        assert_eq!(recovered, p);
536    }
537
538    #[test]
539    fn refresh_preserves_unmasked_value() {
540        let p = fixture_poly();
541        let mut rng = TestRng(0x1234567890ABCDEF);
542        let mut mp = MaskedPoly::mask(&p, &mut rng).unwrap();
543        mp.refresh(&mut rng).unwrap();
544        mp.refresh(&mut rng).unwrap();
545        assert_eq!(mp.unmask(), p);
546    }
547
548    #[test]
549    fn masked_ntt_matches_regular_ntt() {
550        let p = fixture_poly();
551        let mut expected = p;
552        ntt::ntt(&mut expected);
553
554        let mut rng = TestRng(0x0123456789ABCDEF);
555        let mut mp = MaskedPoly::mask(&p, &mut rng).unwrap();
556        masked_ntt(&mut mp);
557        // share0 + share1 mod q must equal NTT(p) coefficient-wise.
558        for i in 0..N {
559            let got = mod_q(mp.share0[i] + mp.share1[i]);
560            assert_eq!(got, expected[i], "mismatch at i={}", i);
561        }
562    }
563
564    #[test]
565    fn masked_pointwise_mul_public_matches_unmasked() {
566        let secret = fixture_poly();
567        // Build a "public" polynomial — anything will do.
568        let mut public = [0i32; N];
569        for i in 0..N {
570            public[i] = ((i as i32 * 991 + 13).rem_euclid(Q)) as i32;
571        }
572        // Bring both into NTT domain.
573        let mut secret_ntt = secret;
574        ntt::ntt(&mut secret_ntt);
575        let mut public_ntt = public;
576        ntt::ntt(&mut public_ntt);
577
578        // Reference: regular pointwise_mul. Output is in /R Montgomery
579        // domain and may be in [-q, q]; normalize for comparison.
580        let mut expected = ntt::pointwise_mul(&secret_ntt, &public_ntt);
581        for c in expected.iter_mut() {
582            *c = mod_q(*c);
583        }
584
585        // Masked path: mask the (NTT-domain) secret, multiply, unmask.
586        // `unmask()` already normalizes to [0, q-1].
587        let mut rng = TestRng(0xFEEDFACE12345678);
588        let mp_secret = MaskedPoly::mask(&secret_ntt, &mut rng).unwrap();
589        let mp_product = masked_pointwise_mul_public(&mp_secret, &public_ntt);
590        let got = mp_product.unmask();
591
592        assert_eq!(got, expected);
593    }
594}