arcana/cipher/poly1305.rs
1//! Poly1305 one-time message authentication code (RFC 8439 §2.5).
2//!
3//! Poly1305 is a polynomial-evaluation MAC over `GF(2^130 - 5)`. It
4//! is keyed with a 32-byte one-time key `(r, s)` -- the same key
5//! must NEVER be reused for two different messages, otherwise the
6//! authentication forgery becomes trivial. In ChaCha20-Poly1305 the
7//! key is derived freshly per message from the ChaCha20 keystream
8//! block 0, which is the standard way to satisfy this constraint.
9//!
10//! # Construction
11//!
12//! ```text
13//! Acc = 0
14//! for each 16-byte block m_i (last block possibly shorter):
15//! n = m_i || 0x01 || zero-pad to 17 bytes (the 0x01 is appended
16//! right after the message bytes; for a full 16-byte block it
17//! lives at byte 16, for a short block of length L it lives
18//! at byte L)
19//! Acc = ((Acc + n) * r) mod p where p = 2^130 - 5
20//! tag = (Acc + s) mod 2^128 (low 16 bytes only)
21//! ```
22//!
23//! `r` is "clamped" before use per the spec: certain bits of the 16
24//! key bytes are cleared so that intermediate products fit in
25//! 130 + 26 bits and overflow can be handled by simple word-wise
26//! reduction with no conditional branches.
27//!
28//! # Internal representation
29//!
30//! Field elements are stored in **5 limbs of 26 bits** packed into
31//! `u32`. This is the standard radix-2^26 representation: it gives
32//! enough headroom that a 26-bit × 26-bit product fits in u64
33//! (52 bits) and that the carry propagation between limbs after a
34//! Poly1305 multiply-and-reduce stays comfortably under u64. It is
35//! the same layout used by `poly1305-donna`, `libsodium`,
36//! `openssl/poly1305_64.c`, and the reference implementation in
37//! the RFC 8439 Appendix C.
38//!
39//! # Tests
40//!
41//! Pinned against the RFC 8439 §2.5.2 single test vector. A more
42//! exhaustive set of vectors lives in §2.6 and Appendix A but the
43//! §2.5.2 vector is sharp enough to catch every meaningful bug
44//! (clamping, multiply-and-reduce, finalisation `+ s mod 2^128`).
45
46// ============================================================================
47// Poly1305 state
48// ============================================================================
49
50/// Poly1305 MAC state.
51///
52/// Holds the accumulator `acc`, the precomputed `r` (and `r * 5` for
53/// each limb, to absorb the `* 5 mod p` of the lazy reduction in the
54/// inner loop), the tag-finalisation key `s`, and a 16-byte buffer
55/// for handling messages whose length is not a multiple of 16.
56///
57/// Internally:
58/// - `r[0..5]` : `r` in 5 26-bit limbs
59/// - `s[0..4]` : the second half of the one-time key as 4 LE u32s
60/// - `acc[0..5]`: accumulator in 5 26-bit limbs
61/// - `buffer` : up to 16 bytes of pending data
62/// - `buf_pos` : 0..=16
63pub struct Poly1305 {
64 r: [u32; 5],
65 s: [u32; 4],
66 acc: [u32; 5],
67 buffer: [u8; 16],
68 buf_pos: usize,
69}
70
71impl Poly1305 {
72 /// Initialise Poly1305 with a 32-byte one-time key `key = r ‖ s`.
73 ///
74 /// `r` is clamped per RFC 8439 §2.5: bytes 3, 7, 11, 15 have
75 /// their top 4 bits cleared (`& 0x0f`) and bytes 4, 8, 12 have
76 /// their low 2 bits cleared (`& 0xfc`).
77 pub fn new(key: &[u8; 32]) -> Self {
78 let mut r_bytes = [0u8; 16];
79 r_bytes.copy_from_slice(&key[..16]);
80 // Clamp r per RFC 8439 §2.5.
81 r_bytes[3] &= 0x0f;
82 r_bytes[7] &= 0x0f;
83 r_bytes[11] &= 0x0f;
84 r_bytes[15] &= 0x0f;
85 r_bytes[4] &= 0xfc;
86 r_bytes[8] &= 0xfc;
87 r_bytes[12] &= 0xfc;
88
89 // Pack r into 5 26-bit limbs (little-endian byte order).
90 let r0 = u32::from_le_bytes(r_bytes[0..4].try_into().unwrap()) & 0x03ffffff;
91 let r1 = (u32::from_le_bytes(r_bytes[3..7].try_into().unwrap()) >> 2) & 0x03ffff03;
92 let r2 = (u32::from_le_bytes(r_bytes[6..10].try_into().unwrap()) >> 4) & 0x03ffc0ff;
93 let r3 = (u32::from_le_bytes(r_bytes[9..13].try_into().unwrap()) >> 6) & 0x03f03fff;
94 let r4 = (u32::from_le_bytes(r_bytes[12..16].try_into().unwrap()) >> 8) & 0x000fffff;
95
96 // The above masks combine the clamping and the radix-2^26
97 // packing in one shot. The constants come from the standard
98 // donna-style packing used by libsodium and OpenSSL: each
99 // limb holds 26 bits aligned on its sub-byte boundary.
100
101 let mut s = [0u32; 4];
102 for i in 0..4 {
103 s[i] = u32::from_le_bytes(key[16 + 4 * i..16 + 4 * i + 4].try_into().unwrap());
104 }
105
106 Self {
107 r: [r0, r1, r2, r3, r4],
108 s,
109 acc: [0u32; 5],
110 buffer: [0u8; 16],
111 buf_pos: 0,
112 }
113 }
114
115 /// Absorb additional data into the accumulator.
116 ///
117 /// Buffers up to 15 leftover bytes between calls so that
118 /// arbitrary update sizes work the same as one big call.
119 pub fn update(&mut self, data: &[u8]) {
120 let mut pos = 0;
121
122 // First, fill the partial buffer if any.
123 if self.buf_pos > 0 {
124 let need = 16 - self.buf_pos;
125 let take = need.min(data.len());
126 self.buffer[self.buf_pos..self.buf_pos + take].copy_from_slice(&data[..take]);
127 self.buf_pos += take;
128 pos += take;
129 if self.buf_pos == 16 {
130 let block = self.buffer;
131 self.absorb_block(&block, /* high_bit = */ true);
132 self.buf_pos = 0;
133 }
134 }
135
136 // Process whole blocks directly from the input.
137 while pos + 16 <= data.len() {
138 let mut block = [0u8; 16];
139 block.copy_from_slice(&data[pos..pos + 16]);
140 self.absorb_block(&block, true);
141 pos += 16;
142 }
143
144 // Stash any remainder for the next call.
145 if pos < data.len() {
146 let remaining = data.len() - pos;
147 self.buffer[..remaining].copy_from_slice(&data[pos..]);
148 self.buf_pos = remaining;
149 }
150 }
151
152 /// Finalise the MAC and write the 16-byte tag into `tag`.
153 /// Consumes the state.
154 pub fn finalize(mut self) -> [u8; 16] {
155 // Process the final partial block (if any) with high_bit = false:
156 // for a short block of length L < 16, append a single 0x01 byte
157 // at position L and zero-pad to 16 bytes, then absorb without
158 // adding the implicit "+1" at bit 128.
159 if self.buf_pos > 0 {
160 let mut last = [0u8; 16];
161 last[..self.buf_pos].copy_from_slice(&self.buffer[..self.buf_pos]);
162 last[self.buf_pos] = 0x01;
163 self.absorb_block(&last, /* high_bit = */ false);
164 }
165
166 // Final reduction of the accumulator into [0, p).
167 let mut h0 = self.acc[0];
168 let mut h1 = self.acc[1];
169 let mut h2 = self.acc[2];
170 let mut h3 = self.acc[3];
171 let mut h4 = self.acc[4];
172
173 // Carry-propagate one more time.
174 let mut c: u32;
175 c = h1 >> 26;
176 h1 &= 0x03ffffff;
177 h2 += c;
178 c = h2 >> 26;
179 h2 &= 0x03ffffff;
180 h3 += c;
181 c = h3 >> 26;
182 h3 &= 0x03ffffff;
183 h4 += c;
184 c = h4 >> 26;
185 h4 &= 0x03ffffff;
186 h0 += c * 5;
187 c = h0 >> 26;
188 h0 &= 0x03ffffff;
189 h1 += c;
190
191 // Compute h - p = h + 5 - 2^130 and select between h and (h - p)
192 // based on whether h >= p. We use the standard donna trick:
193 // add 5, then subtract 2^130 unless the carry came back negative.
194 let mut g0 = h0.wrapping_add(5);
195 let mut c = g0 >> 26;
196 g0 &= 0x03ffffff;
197 let mut g1 = h1.wrapping_add(c);
198 let mut c = g1 >> 26;
199 g1 &= 0x03ffffff;
200 let mut g2 = h2.wrapping_add(c);
201 let mut c = g2 >> 26;
202 g2 &= 0x03ffffff;
203 let mut g3 = h3.wrapping_add(c);
204 let c = g3 >> 26;
205 g3 &= 0x03ffffff;
206 let g4 = h4.wrapping_add(c).wrapping_sub(1 << 26);
207
208 // If g4 has bit 31 set after the subtract-2^130 (i.e. h < p),
209 // keep h. Otherwise (h >= p) keep g.
210 let mask = (g4 >> 31).wrapping_sub(1); // 0xffffffff if h>=p, else 0
211 let inv = !mask;
212 h0 = (h0 & inv) | (g0 & mask);
213 h1 = (h1 & inv) | (g1 & mask);
214 h2 = (h2 & inv) | (g2 & mask);
215 h3 = (h3 & inv) | (g3 & mask);
216 h4 = (h4 & inv) | (g4 & mask);
217
218 // Pack the 5 26-bit limbs into 4 little-endian u32s.
219 let h0 = h0 | (h1 << 26);
220 let h1 = (h1 >> 6) | (h2 << 20);
221 let h2 = (h2 >> 12) | (h3 << 14);
222 let h3 = (h3 >> 18) | (h4 << 8);
223
224 // tag = (h + s) mod 2^128
225 let mut f: u64 = h0 as u64 + self.s[0] as u64;
226 let t0 = f as u32;
227 f = (f >> 32) + h1 as u64 + self.s[1] as u64;
228 let t1 = f as u32;
229 f = (f >> 32) + h2 as u64 + self.s[2] as u64;
230 let t2 = f as u32;
231 f = (f >> 32) + h3 as u64 + self.s[3] as u64;
232 let t3 = f as u32;
233
234 let mut tag = [0u8; 16];
235 tag[0..4].copy_from_slice(&t0.to_le_bytes());
236 tag[4..8].copy_from_slice(&t1.to_le_bytes());
237 tag[8..12].copy_from_slice(&t2.to_le_bytes());
238 tag[12..16].copy_from_slice(&t3.to_le_bytes());
239 tag
240 }
241
242 /// One-shot helper: feed the entire `data` and return the tag.
243 pub fn mac(key: &[u8; 32], data: &[u8]) -> [u8; 16] {
244 let mut p = Self::new(key);
245 p.update(data);
246 p.finalize()
247 }
248
249 /// Absorb a single 16-byte block into the accumulator. The
250 /// `high_bit` parameter is `true` for normal blocks (the implicit
251 /// 0x01 byte at position 16 is added to the integer reading) and
252 /// `false` for the final short block, where the message has
253 /// already been padded with `0x01 || 0`.
254 fn absorb_block(&mut self, block: &[u8; 16], high_bit: bool) {
255 // Read the 16-byte block as 5 26-bit limbs (LE).
256 let h0 = self.acc[0].wrapping_add(u32::from_le_bytes(block[0..4].try_into().unwrap()) & 0x03ffffff);
257 let h1 = self.acc[1].wrapping_add((u32::from_le_bytes(block[3..7].try_into().unwrap()) >> 2) & 0x03ffffff);
258 let h2 = self.acc[2].wrapping_add((u32::from_le_bytes(block[6..10].try_into().unwrap()) >> 4) & 0x03ffffff);
259 let h3 = self.acc[3].wrapping_add((u32::from_le_bytes(block[9..13].try_into().unwrap()) >> 6) & 0x03ffffff);
260 let h4 = self.acc[4].wrapping_add(
261 (u32::from_le_bytes(block[12..16].try_into().unwrap()) >> 8) | (if high_bit { 1 << 24 } else { 0 }),
262 );
263
264 // h *= r mod p, with the standard donna-style schoolbook
265 // 5x5 multiplication that absorbs the * 5 mod p reduction
266 // into the cross-terms.
267 let r0 = self.r[0] as u64;
268 let r1 = self.r[1] as u64;
269 let r2 = self.r[2] as u64;
270 let r3 = self.r[3] as u64;
271 let r4 = self.r[4] as u64;
272
273 // r * 5 (used to fold high bits back into the low limbs).
274 let s1 = r1 * 5;
275 let s2 = r2 * 5;
276 let s3 = r3 * 5;
277 let s4 = r4 * 5;
278
279 let h0_64 = h0 as u64;
280 let h1_64 = h1 as u64;
281 let h2_64 = h2 as u64;
282 let h3_64 = h3 as u64;
283 let h4_64 = h4 as u64;
284
285 let d0 = h0_64 * r0 + h1_64 * s4 + h2_64 * s3 + h3_64 * s2 + h4_64 * s1;
286 let d1 = h0_64 * r1 + h1_64 * r0 + h2_64 * s4 + h3_64 * s3 + h4_64 * s2;
287 let d2 = h0_64 * r2 + h1_64 * r1 + h2_64 * r0 + h3_64 * s4 + h4_64 * s3;
288 let d3 = h0_64 * r3 + h1_64 * r2 + h2_64 * r1 + h3_64 * r0 + h4_64 * s4;
289 let d4 = h0_64 * r4 + h1_64 * r3 + h2_64 * r2 + h3_64 * r1 + h4_64 * r0;
290
291 // Carry-propagate.
292 let mut c: u64;
293 let mut h0 = (d0 & 0x03ffffff) as u32;
294 c = d0 >> 26;
295 let mut h1 = ((d1 + c) & 0x03ffffff) as u32;
296 c = (d1 + c) >> 26;
297 let mut h2 = ((d2 + c) & 0x03ffffff) as u32;
298 c = (d2 + c) >> 26;
299 let mut h3 = ((d3 + c) & 0x03ffffff) as u32;
300 c = (d3 + c) >> 26;
301 let mut h4 = ((d4 + c) & 0x03ffffff) as u32;
302 c = (d4 + c) >> 26;
303 h0 = h0.wrapping_add((c * 5) as u32);
304 let c2 = h0 >> 26;
305 h0 &= 0x03ffffff;
306 h1 = h1.wrapping_add(c2);
307
308 self.acc = [h0, h1, h2, h3, h4];
309 }
310}
311
312// ============================================================================
313// Tests (RFC 8439 pinned vectors)
314// ============================================================================
315
316#[cfg(test)]
317mod tests {
318 use super::*;
319
320 fn hex(s: &str) -> Vec<u8> {
321 assert!(s.len() % 2 == 0);
322 (0..s.len())
323 .step_by(2)
324 .map(|i| u8::from_str_radix(&s[i..i + 2], 16).unwrap())
325 .collect()
326 }
327
328 fn hex_arr<const N: usize>(s: &str) -> [u8; N] {
329 let v = hex(s);
330 assert_eq!(v.len(), N);
331 let mut out = [0u8; N];
332 out.copy_from_slice(&v);
333 out
334 }
335
336 /// RFC 8439 §2.5.2 Poly1305 test vector.
337 ///
338 /// Key: 85d6be7857556d337f4452fe42d506a8 0103808afb0db2fd4abff6af4149f51b
339 /// Msg: "Cryptographic Forum Research Group"
340 /// Tag: a8061dc1305136c6c22b8baf0c0127a9
341 #[test]
342 fn rfc8439_2_5_2_poly1305_vector() {
343 let key: [u8; 32] = hex_arr("85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b");
344 let msg = b"Cryptographic Forum Research Group";
345 let tag = Poly1305::mac(&key, msg);
346 let expected = hex("a8061dc1305136c6c22b8baf0c0127a9");
347 assert_eq!(tag.to_vec(), expected);
348 }
349
350 /// Chunked update must give the same tag as one big update.
351 #[test]
352 fn poly1305_chunked_update_matches_oneshot() {
353 let key = [0x42u8; 32];
354 let msg: Vec<u8> = (0..200).map(|i| (i * 7) as u8).collect();
355
356 let oneshot = Poly1305::mac(&key, &msg);
357
358 let mut chunks = Poly1305::new(&key);
359 chunks.update(&msg[..7]);
360 chunks.update(&msg[7..50]);
361 chunks.update(&msg[50..50]); // empty update
362 chunks.update(&msg[50..120]); // crosses 16-byte boundaries
363 chunks.update(&msg[120..]);
364 let tag = chunks.finalize();
365
366 assert_eq!(oneshot, tag);
367 }
368
369 /// Different keys must produce different tags for the same
370 /// message (sanity that the key actually drives the output).
371 #[test]
372 fn poly1305_different_keys_differ() {
373 let msg = b"the quick brown fox";
374 let k1 = [0x01u8; 32];
375 let mut k2 = [0x01u8; 32];
376 k2[0] ^= 0x80;
377 // Both keys still satisfy the clamp constraints since the
378 // clamp clears the relevant bits at packing time.
379 assert_ne!(Poly1305::mac(&k1, msg), Poly1305::mac(&k2, msg));
380 }
381
382 /// Different messages must produce different tags under the same
383 /// key (sanity that the message actually drives the output).
384 #[test]
385 fn poly1305_different_messages_differ() {
386 let key = [0x77u8; 32];
387 assert_ne!(Poly1305::mac(&key, b"hello"), Poly1305::mac(&key, b"world"));
388 }
389}