Skip to main content

arcana/cipher/
poly1305.rs

1//! Poly1305 one-time message authentication code (RFC 8439 §2.5).
2//!
3//! Poly1305 is a polynomial-evaluation MAC over `GF(2^130 - 5)`. It
4//! is keyed with a 32-byte one-time key `(r, s)` -- the same key
5//! must NEVER be reused for two different messages, otherwise the
6//! authentication forgery becomes trivial. In ChaCha20-Poly1305 the
7//! key is derived freshly per message from the ChaCha20 keystream
8//! block 0, which is the standard way to satisfy this constraint.
9//!
10//! # Construction
11//!
12//! ```text
13//! Acc = 0
14//! for each 16-byte block m_i (last block possibly shorter):
15//!     n = m_i || 0x01 || zero-pad to 17 bytes  (the 0x01 is appended
16//!         right after the message bytes; for a full 16-byte block it
17//!         lives at byte 16, for a short block of length L it lives
18//!         at byte L)
19//!     Acc = ((Acc + n) * r) mod p   where p = 2^130 - 5
20//! tag = (Acc + s) mod 2^128         (low 16 bytes only)
21//! ```
22//!
23//! `r` is "clamped" before use per the spec: certain bits of the 16
24//! key bytes are cleared so that intermediate products fit in
25//! 130 + 26 bits and overflow can be handled by simple word-wise
26//! reduction with no conditional branches.
27//!
28//! # Internal representation
29//!
30//! Field elements are stored in **5 limbs of 26 bits** packed into
31//! `u32`. This is the standard radix-2^26 representation: it gives
32//! enough headroom that a 26-bit × 26-bit product fits in u64
33//! (52 bits) and that the carry propagation between limbs after a
34//! Poly1305 multiply-and-reduce stays comfortably under u64. It is
35//! the same layout used by `poly1305-donna`, `libsodium`,
36//! `openssl/poly1305_64.c`, and the reference implementation in
37//! the RFC 8439 Appendix C.
38//!
39//! # Tests
40//!
41//! Pinned against the RFC 8439 §2.5.2 single test vector. A more
42//! exhaustive set of vectors lives in §2.6 and Appendix A but the
43//! §2.5.2 vector is sharp enough to catch every meaningful bug
44//! (clamping, multiply-and-reduce, finalisation `+ s mod 2^128`).
45
46// ============================================================================
47// Poly1305 state
48// ============================================================================
49
50/// Poly1305 MAC state.
51///
52/// Holds the accumulator `acc`, the precomputed `r` (and `r * 5` for
53/// each limb, to absorb the `* 5 mod p` of the lazy reduction in the
54/// inner loop), the tag-finalisation key `s`, and a 16-byte buffer
55/// for handling messages whose length is not a multiple of 16.
56///
57/// Internally:
58/// - `r[0..5]`  : `r` in 5 26-bit limbs
59/// - `s[0..4]`  : the second half of the one-time key as 4 LE u32s
60/// - `acc[0..5]`: accumulator in 5 26-bit limbs
61/// - `buffer`   : up to 16 bytes of pending data
62/// - `buf_pos`  : 0..=16
63pub struct Poly1305 {
64    r: [u32; 5],
65    s: [u32; 4],
66    acc: [u32; 5],
67    buffer: [u8; 16],
68    buf_pos: usize,
69}
70
71impl Poly1305 {
72    /// Initialise Poly1305 with a 32-byte one-time key `key = r ‖ s`.
73    ///
74    /// `r` is clamped per RFC 8439 §2.5: bytes 3, 7, 11, 15 have
75    /// their top 4 bits cleared (`& 0x0f`) and bytes 4, 8, 12 have
76    /// their low 2 bits cleared (`& 0xfc`).
77    pub fn new(key: &[u8; 32]) -> Self {
78        let mut r_bytes = [0u8; 16];
79        r_bytes.copy_from_slice(&key[..16]);
80        // Clamp r per RFC 8439 §2.5.
81        r_bytes[3] &= 0x0f;
82        r_bytes[7] &= 0x0f;
83        r_bytes[11] &= 0x0f;
84        r_bytes[15] &= 0x0f;
85        r_bytes[4] &= 0xfc;
86        r_bytes[8] &= 0xfc;
87        r_bytes[12] &= 0xfc;
88
89        // Pack r into 5 26-bit limbs (little-endian byte order).
90        let r0 = u32::from_le_bytes(r_bytes[0..4].try_into().unwrap()) & 0x03ffffff;
91        let r1 = (u32::from_le_bytes(r_bytes[3..7].try_into().unwrap()) >> 2) & 0x03ffff03;
92        let r2 = (u32::from_le_bytes(r_bytes[6..10].try_into().unwrap()) >> 4) & 0x03ffc0ff;
93        let r3 = (u32::from_le_bytes(r_bytes[9..13].try_into().unwrap()) >> 6) & 0x03f03fff;
94        let r4 = (u32::from_le_bytes(r_bytes[12..16].try_into().unwrap()) >> 8) & 0x000fffff;
95
96        // The above masks combine the clamping and the radix-2^26
97        // packing in one shot. The constants come from the standard
98        // donna-style packing used by libsodium and OpenSSL: each
99        // limb holds 26 bits aligned on its sub-byte boundary.
100
101        let mut s = [0u32; 4];
102        for i in 0..4 {
103            s[i] = u32::from_le_bytes(key[16 + 4 * i..16 + 4 * i + 4].try_into().unwrap());
104        }
105
106        Self {
107            r: [r0, r1, r2, r3, r4],
108            s,
109            acc: [0u32; 5],
110            buffer: [0u8; 16],
111            buf_pos: 0,
112        }
113    }
114
115    /// Absorb additional data into the accumulator.
116    ///
117    /// Buffers up to 15 leftover bytes between calls so that
118    /// arbitrary update sizes work the same as one big call.
119    pub fn update(&mut self, data: &[u8]) {
120        let mut pos = 0;
121
122        // First, fill the partial buffer if any.
123        if self.buf_pos > 0 {
124            let need = 16 - self.buf_pos;
125            let take = need.min(data.len());
126            self.buffer[self.buf_pos..self.buf_pos + take].copy_from_slice(&data[..take]);
127            self.buf_pos += take;
128            pos += take;
129            if self.buf_pos == 16 {
130                let block = self.buffer;
131                self.absorb_block(&block, /* high_bit = */ true);
132                self.buf_pos = 0;
133            }
134        }
135
136        // Process whole blocks directly from the input.
137        while pos + 16 <= data.len() {
138            let mut block = [0u8; 16];
139            block.copy_from_slice(&data[pos..pos + 16]);
140            self.absorb_block(&block, true);
141            pos += 16;
142        }
143
144        // Stash any remainder for the next call.
145        if pos < data.len() {
146            let remaining = data.len() - pos;
147            self.buffer[..remaining].copy_from_slice(&data[pos..]);
148            self.buf_pos = remaining;
149        }
150    }
151
152    /// Finalise the MAC and write the 16-byte tag into `tag`.
153    /// Consumes the state.
154    pub fn finalize(mut self) -> [u8; 16] {
155        // Process the final partial block (if any) with high_bit = false:
156        // for a short block of length L < 16, append a single 0x01 byte
157        // at position L and zero-pad to 16 bytes, then absorb without
158        // adding the implicit "+1" at bit 128.
159        if self.buf_pos > 0 {
160            let mut last = [0u8; 16];
161            last[..self.buf_pos].copy_from_slice(&self.buffer[..self.buf_pos]);
162            last[self.buf_pos] = 0x01;
163            self.absorb_block(&last, /* high_bit = */ false);
164        }
165
166        // Final reduction of the accumulator into [0, p).
167        let mut h0 = self.acc[0];
168        let mut h1 = self.acc[1];
169        let mut h2 = self.acc[2];
170        let mut h3 = self.acc[3];
171        let mut h4 = self.acc[4];
172
173        // Carry-propagate one more time.
174        let mut c: u32;
175        c = h1 >> 26;
176        h1 &= 0x03ffffff;
177        h2 += c;
178        c = h2 >> 26;
179        h2 &= 0x03ffffff;
180        h3 += c;
181        c = h3 >> 26;
182        h3 &= 0x03ffffff;
183        h4 += c;
184        c = h4 >> 26;
185        h4 &= 0x03ffffff;
186        h0 += c * 5;
187        c = h0 >> 26;
188        h0 &= 0x03ffffff;
189        h1 += c;
190
191        // Compute h - p = h + 5 - 2^130 and select between h and (h - p)
192        // based on whether h >= p. We use the standard donna trick:
193        // add 5, then subtract 2^130 unless the carry came back negative.
194        let mut g0 = h0.wrapping_add(5);
195        let mut c = g0 >> 26;
196        g0 &= 0x03ffffff;
197        let mut g1 = h1.wrapping_add(c);
198        let mut c = g1 >> 26;
199        g1 &= 0x03ffffff;
200        let mut g2 = h2.wrapping_add(c);
201        let mut c = g2 >> 26;
202        g2 &= 0x03ffffff;
203        let mut g3 = h3.wrapping_add(c);
204        let c = g3 >> 26;
205        g3 &= 0x03ffffff;
206        let g4 = h4.wrapping_add(c).wrapping_sub(1 << 26);
207
208        // If g4 has bit 31 set after the subtract-2^130 (i.e. h < p),
209        // keep h. Otherwise (h >= p) keep g.
210        let mask = (g4 >> 31).wrapping_sub(1); // 0xffffffff if h>=p, else 0
211        let inv = !mask;
212        h0 = (h0 & inv) | (g0 & mask);
213        h1 = (h1 & inv) | (g1 & mask);
214        h2 = (h2 & inv) | (g2 & mask);
215        h3 = (h3 & inv) | (g3 & mask);
216        h4 = (h4 & inv) | (g4 & mask);
217
218        // Pack the 5 26-bit limbs into 4 little-endian u32s.
219        let h0 = h0 | (h1 << 26);
220        let h1 = (h1 >> 6) | (h2 << 20);
221        let h2 = (h2 >> 12) | (h3 << 14);
222        let h3 = (h3 >> 18) | (h4 << 8);
223
224        // tag = (h + s) mod 2^128
225        let mut f: u64 = h0 as u64 + self.s[0] as u64;
226        let t0 = f as u32;
227        f = (f >> 32) + h1 as u64 + self.s[1] as u64;
228        let t1 = f as u32;
229        f = (f >> 32) + h2 as u64 + self.s[2] as u64;
230        let t2 = f as u32;
231        f = (f >> 32) + h3 as u64 + self.s[3] as u64;
232        let t3 = f as u32;
233
234        let mut tag = [0u8; 16];
235        tag[0..4].copy_from_slice(&t0.to_le_bytes());
236        tag[4..8].copy_from_slice(&t1.to_le_bytes());
237        tag[8..12].copy_from_slice(&t2.to_le_bytes());
238        tag[12..16].copy_from_slice(&t3.to_le_bytes());
239        tag
240    }
241
242    /// One-shot helper: feed the entire `data` and return the tag.
243    pub fn mac(key: &[u8; 32], data: &[u8]) -> [u8; 16] {
244        let mut p = Self::new(key);
245        p.update(data);
246        p.finalize()
247    }
248
249    /// Absorb a single 16-byte block into the accumulator. The
250    /// `high_bit` parameter is `true` for normal blocks (the implicit
251    /// 0x01 byte at position 16 is added to the integer reading) and
252    /// `false` for the final short block, where the message has
253    /// already been padded with `0x01 || 0`.
254    fn absorb_block(&mut self, block: &[u8; 16], high_bit: bool) {
255        // Read the 16-byte block as 5 26-bit limbs (LE).
256        let h0 = self.acc[0].wrapping_add(u32::from_le_bytes(block[0..4].try_into().unwrap()) & 0x03ffffff);
257        let h1 = self.acc[1].wrapping_add((u32::from_le_bytes(block[3..7].try_into().unwrap()) >> 2) & 0x03ffffff);
258        let h2 = self.acc[2].wrapping_add((u32::from_le_bytes(block[6..10].try_into().unwrap()) >> 4) & 0x03ffffff);
259        let h3 = self.acc[3].wrapping_add((u32::from_le_bytes(block[9..13].try_into().unwrap()) >> 6) & 0x03ffffff);
260        let h4 = self.acc[4].wrapping_add(
261            (u32::from_le_bytes(block[12..16].try_into().unwrap()) >> 8) | (if high_bit { 1 << 24 } else { 0 }),
262        );
263
264        // h *= r mod p, with the standard donna-style schoolbook
265        // 5x5 multiplication that absorbs the * 5 mod p reduction
266        // into the cross-terms.
267        let r0 = self.r[0] as u64;
268        let r1 = self.r[1] as u64;
269        let r2 = self.r[2] as u64;
270        let r3 = self.r[3] as u64;
271        let r4 = self.r[4] as u64;
272
273        // r * 5 (used to fold high bits back into the low limbs).
274        let s1 = r1 * 5;
275        let s2 = r2 * 5;
276        let s3 = r3 * 5;
277        let s4 = r4 * 5;
278
279        let h0_64 = h0 as u64;
280        let h1_64 = h1 as u64;
281        let h2_64 = h2 as u64;
282        let h3_64 = h3 as u64;
283        let h4_64 = h4 as u64;
284
285        let d0 = h0_64 * r0 + h1_64 * s4 + h2_64 * s3 + h3_64 * s2 + h4_64 * s1;
286        let d1 = h0_64 * r1 + h1_64 * r0 + h2_64 * s4 + h3_64 * s3 + h4_64 * s2;
287        let d2 = h0_64 * r2 + h1_64 * r1 + h2_64 * r0 + h3_64 * s4 + h4_64 * s3;
288        let d3 = h0_64 * r3 + h1_64 * r2 + h2_64 * r1 + h3_64 * r0 + h4_64 * s4;
289        let d4 = h0_64 * r4 + h1_64 * r3 + h2_64 * r2 + h3_64 * r1 + h4_64 * r0;
290
291        // Carry-propagate.
292        let mut c: u64;
293        let mut h0 = (d0 & 0x03ffffff) as u32;
294        c = d0 >> 26;
295        let mut h1 = ((d1 + c) & 0x03ffffff) as u32;
296        c = (d1 + c) >> 26;
297        let mut h2 = ((d2 + c) & 0x03ffffff) as u32;
298        c = (d2 + c) >> 26;
299        let mut h3 = ((d3 + c) & 0x03ffffff) as u32;
300        c = (d3 + c) >> 26;
301        let mut h4 = ((d4 + c) & 0x03ffffff) as u32;
302        c = (d4 + c) >> 26;
303        h0 = h0.wrapping_add((c * 5) as u32);
304        let c2 = h0 >> 26;
305        h0 &= 0x03ffffff;
306        h1 = h1.wrapping_add(c2);
307
308        self.acc = [h0, h1, h2, h3, h4];
309    }
310}
311
312// ============================================================================
313// Tests (RFC 8439 pinned vectors)
314// ============================================================================
315
316#[cfg(test)]
317mod tests {
318    use super::*;
319
320    fn hex(s: &str) -> Vec<u8> {
321        assert!(s.len() % 2 == 0);
322        (0..s.len())
323            .step_by(2)
324            .map(|i| u8::from_str_radix(&s[i..i + 2], 16).unwrap())
325            .collect()
326    }
327
328    fn hex_arr<const N: usize>(s: &str) -> [u8; N] {
329        let v = hex(s);
330        assert_eq!(v.len(), N);
331        let mut out = [0u8; N];
332        out.copy_from_slice(&v);
333        out
334    }
335
336    /// RFC 8439 §2.5.2 Poly1305 test vector.
337    ///
338    /// Key: 85d6be7857556d337f4452fe42d506a8 0103808afb0db2fd4abff6af4149f51b
339    /// Msg: "Cryptographic Forum Research Group"
340    /// Tag: a8061dc1305136c6c22b8baf0c0127a9
341    #[test]
342    fn rfc8439_2_5_2_poly1305_vector() {
343        let key: [u8; 32] = hex_arr("85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b");
344        let msg = b"Cryptographic Forum Research Group";
345        let tag = Poly1305::mac(&key, msg);
346        let expected = hex("a8061dc1305136c6c22b8baf0c0127a9");
347        assert_eq!(tag.to_vec(), expected);
348    }
349
350    /// Chunked update must give the same tag as one big update.
351    #[test]
352    fn poly1305_chunked_update_matches_oneshot() {
353        let key = [0x42u8; 32];
354        let msg: Vec<u8> = (0..200).map(|i| (i * 7) as u8).collect();
355
356        let oneshot = Poly1305::mac(&key, &msg);
357
358        let mut chunks = Poly1305::new(&key);
359        chunks.update(&msg[..7]);
360        chunks.update(&msg[7..50]);
361        chunks.update(&msg[50..50]); // empty update
362        chunks.update(&msg[50..120]); // crosses 16-byte boundaries
363        chunks.update(&msg[120..]);
364        let tag = chunks.finalize();
365
366        assert_eq!(oneshot, tag);
367    }
368
369    /// Different keys must produce different tags for the same
370    /// message (sanity that the key actually drives the output).
371    #[test]
372    fn poly1305_different_keys_differ() {
373        let msg = b"the quick brown fox";
374        let k1 = [0x01u8; 32];
375        let mut k2 = [0x01u8; 32];
376        k2[0] ^= 0x80;
377        // Both keys still satisfy the clamp constraints since the
378        // clamp clears the relevant bits at packing time.
379        assert_ne!(Poly1305::mac(&k1, msg), Poly1305::mac(&k2, msg));
380    }
381
382    /// Different messages must produce different tags under the same
383    /// key (sanity that the message actually drives the output).
384    #[test]
385    fn poly1305_different_messages_differ() {
386        let key = [0x77u8; 32];
387        assert_ne!(Poly1305::mac(&key, b"hello"), Poly1305::mac(&key, b"world"));
388    }
389}