Commit 7346b83
Initial AVX-512 port of ChaCha20.
Benchmark results on Zen 4:

AVX2 (Zen 4):
test chacha20_bench1_16b   ... bench:    23.71 ns/iter (+/- 0.89)   = 695 MB/s
test chacha20_bench2_256b  ... bench:    82.98 ns/iter (+/- 7.64)   = 3121 MB/s
test chacha20_bench3_1kib  ... bench:   302.03 ns/iter (+/- 3.59)   = 3390 MB/s
test chacha20_bench4_16kib ... bench: 4,677.58 ns/iter (+/- 161.42) = 3503 MB/s

AVX-512 (Zen 4):
test chacha20_bench1_16b   ... bench:    25.07 ns/iter (+/- 0.90)   = 640 MB/s
test chacha20_bench2_256b  ... bench:    79.66 ns/iter (+/- 1.18)   = 3240 MB/s
test chacha20_bench3_1kib  ... bench:   275.32 ns/iter (+/- 4.13)   = 3723 MB/s
test chacha20_bench4_16kib ... bench: 4,201.84 ns/iter (+/- 24.18)  = 3900 MB/s

Much greater speedups are achievable for long inputs if PAR_BLOCKS is raised to 8, but that also causes a 2x slowdown for short inputs (< 512 bytes). The StreamCipherBackend API does not appear to offer a way to vary the degree of parallelism with the input size. (A sketch of this constraint follows the file summary below.)
1 parent: 0e3296d

4 files changed: 365 additions, 6 deletions
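
The parallelism trade-off described in the commit message is fixed at compile time. As a minimal sketch, assuming the layout used by the new avx512.rs module below (the doubled constants and the U8 binding are hypothetical, not part of this commit):

    // Hypothetical PAR_BLOCKS = 8 variant: each `__m512i` still holds one
    // state row for four blocks, so doubling the block count doubles the
    // number of 512-bit lane groups per row.
    const PAR_BLOCKS: usize = 8;
    const N: usize = PAR_BLOCKS / 4; // = 2 lane groups instead of 1

    // The degree of parallelism exposed to the `cipher` crate is a single
    // associated type, with no input-length-dependent fallback:
    //     impl ParBlocksSizeUser for Backend<R, V> { type ParBlocksSize = U8; }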

chacha20/Cargo.toml

Lines changed: 5 additions & 2 deletions
@@ -3,7 +3,7 @@ name = "chacha20"
 version = "0.10.0-rc.2"
 authors = ["RustCrypto Developers"]
 edition = "2024"
-rust-version = "1.85"
+rust-version = "1.89"
 documentation = "https://docs.rs/chacha20"
 readme = "README.md"
 repository = "https://github.com/RustCrypto/stream-ciphers"
@@ -20,7 +20,9 @@ rand_core-compatible RNGs based on those ciphers.
 
 [dependencies]
 cfg-if = "1"
-cipher = { version = "0.5.0-rc.1", optional = true, features = ["stream-wrapper"] }
+cipher = { version = "0.5.0-rc.1", optional = true, features = [
+    "stream-wrapper",
+] }
 rand_core = { version = "0.10.0-rc.1", optional = true, default-features = false }
 
 # `zeroize` is an explicit dependency because this crate may be used without the `cipher` crate
@@ -51,6 +53,7 @@ check-cfg = [
     'cfg(chacha20_force_soft)',
     'cfg(chacha20_force_sse2)',
     'cfg(chacha20_force_avx2)',
+    'cfg(chacha20_force_avx512)',
 ]
 
 [lints.clippy]
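
Following the existing chacha20_force_* convention, the new chacha20_force_avx512 cfg is presumably meant to be passed via RUSTFLAGS (e.g. --cfg chacha20_force_avx512) to pin backend selection during testing; this hunk only registers it with the check-cfg lint so such builds compile warning-free.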

chacha20/src/backends.rs

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ cfg_if! {
         pub(crate) mod sse2;
     } else {
         pub(crate) mod soft;
+        pub(crate) mod avx512;
         pub(crate) mod avx2;
         pub(crate) mod sse2;
     }

chacha20/src/backends/avx512.rs

Lines changed: 350 additions & 0 deletions
@@ -0,0 +1,350 @@
#![allow(unsafe_op_in_unsafe_fn)]
use crate::{Rounds, Variant};
use core::marker::PhantomData;

#[cfg(feature = "rng")]
use crate::ChaChaCore;

#[cfg(feature = "cipher")]
use crate::{STATE_WORDS, chacha::Block};

#[cfg(feature = "cipher")]
use cipher::{
    BlockSizeUser, ParBlocks, ParBlocksSizeUser, StreamCipherBackend, StreamCipherClosure,
    consts::{U4, U64},
};

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Number of blocks processed in parallel.
const PAR_BLOCKS: usize = 4;
/// Number of `__m512i` to store parallel blocks.
const N: usize = PAR_BLOCKS / 4;
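
// Layout: each `__m512i` packs one 16-byte state row for four blocks, one
// block per 128-bit lane. Rows 0-2 are identical across lanes (broadcast);
// only the counter row is given per-lane block offsets. `N` counts these
// four-block lane groups, so N = 1 for PAR_BLOCKS = 4.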

#[inline]
#[target_feature(enable = "avx512f")]
#[cfg(feature = "cipher")]
pub(crate) unsafe fn inner<R, F, V>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Rounds,
    F: StreamCipherClosure<BlockSize = U64>,
    V: Variant,
{
    let state_ptr = state.as_ptr() as *const __m128i;
    let v = [
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(0))),
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(1))),
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(2))),
    ];
    let mut c = _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(3)));
    c = match size_of::<V::Counter>() {
        4 => _mm512_add_epi32(
            c,
            _mm512_set_epi32(0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0),
        ),
        8 => _mm512_add_epi64(c, _mm512_set_epi64(0, 3, 0, 2, 0, 1, 0, 0)),
        _ => unreachable!(),
    };
    let mut ctr = [c; N];
    for i in 0..N {
        ctr[i] = c;
        c = match size_of::<V::Counter>() {
            4 => _mm512_add_epi32(
                c,
                _mm512_set_epi32(0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4),
            ),
            8 => _mm512_add_epi64(c, _mm512_set_epi64(0, 4, 0, 4, 0, 4, 0, 4)),
            _ => unreachable!(),
        };
    }
    let mut backend = Backend::<R, V> {
        v,
        ctr,
        _pd: PhantomData,
    };

    f.call(&mut backend);

    state[12] = _mm256_extract_epi32::<0>(_mm512_extracti32x8_epi32::<0>(backend.ctr[0])) as u32;
    match size_of::<V::Counter>() {
        4 => {}
        8 => {
            state[13] =
                _mm256_extract_epi32::<1>(_mm512_extracti32x8_epi32::<0>(backend.ctr[0])) as u32
        }
        _ => unreachable!(),
    }
}

#[inline]
#[target_feature(enable = "avx512f")]
#[cfg(feature = "rng")]
pub(crate) unsafe fn rng_inner<R, V>(core: &mut ChaChaCore<R, V>, buffer: &mut [u32; 64])
where
    R: Rounds,
    V: Variant,
{
    let state_ptr = core.state.as_ptr() as *const __m128i;
    let v = [
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(0))),
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(1))),
        _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(2))),
    ];
    // The RNG always uses a 64-bit counter spanning state words 12 and 13.
    let mut c = _mm512_broadcast_i32x4(_mm_loadu_si128(state_ptr.add(3)));
    c = _mm512_add_epi64(c, _mm512_set_epi64(0, 3, 0, 2, 0, 1, 0, 0));
    let mut ctr = [c; N];
    for i in 0..N {
        ctr[i] = c;
        c = _mm512_add_epi64(c, _mm512_set_epi64(0, 4, 0, 4, 0, 4, 0, 4));
    }
    let mut backend = Backend::<R, V> {
        v,
        ctr,
        _pd: PhantomData,
    };

    backend.rng_gen_par_ks_blocks(buffer);

    core.state[12] =
        _mm256_extract_epi32::<0>(_mm512_extracti32x8_epi32::<0>(backend.ctr[0])) as u32;
    core.state[13] =
        _mm256_extract_epi32::<1>(_mm512_extracti32x8_epi32::<0>(backend.ctr[0])) as u32;
}

struct Backend<R: Rounds, V: Variant> {
    v: [__m512i; 3],
    ctr: [__m512i; N],
    _pd: PhantomData<(R, V)>,
}

#[cfg(feature = "cipher")]
impl<R: Rounds, V: Variant> BlockSizeUser for Backend<R, V> {
    type BlockSize = U64;
}

#[cfg(feature = "cipher")]
impl<R: Rounds, V: Variant> ParBlocksSizeUser for Backend<R, V> {
    type ParBlocksSize = U4;
}
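
// `U4` mirrors `PAR_BLOCKS`: the `cipher` crate hands this backend exactly
// four blocks per `gen_par_ks_blocks` call.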

#[cfg(feature = "cipher")]
impl<R: Rounds, V: Variant> StreamCipherBackend for Backend<R, V> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block) {
        unsafe {
            let res = rounds::<R>(&self.v, &self.ctr);
            for c in self.ctr.iter_mut() {
                *c = match size_of::<V::Counter>() {
                    4 => _mm512_add_epi32(
                        *c,
                        _mm512_set_epi32(0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1),
                    ),
                    8 => _mm512_add_epi64(*c, _mm512_set_epi64(0, 1, 0, 1, 0, 1, 0, 1)),
                    _ => unreachable!(),
                };
            }

            let block_ptr = block.as_mut_ptr() as *mut __m128i;

            for i in 0..4 {
                _mm_storeu_si128(block_ptr.add(i), _mm512_extracti32x4_epi32::<0>(res[0][i]));
            }
        }
    }

    #[inline(always)]
    fn gen_par_ks_blocks(&mut self, blocks: &mut ParBlocks<Self>) {
        unsafe {
            let vs = rounds::<R>(&self.v, &self.ctr);

            let pb = PAR_BLOCKS as i32;
            for c in self.ctr.iter_mut() {
                *c = match size_of::<V::Counter>() {
                    4 => _mm512_add_epi32(
                        *c,
                        _mm512_set_epi32(0, 0, 0, pb, 0, 0, 0, pb, 0, 0, 0, pb, 0, 0, 0, pb),
                    ),
                    8 => _mm512_add_epi64(
                        *c,
                        _mm512_set_epi64(0, pb as i64, 0, pb as i64, 0, pb as i64, 0, pb as i64),
                    ),
                    _ => unreachable!(),
                }
            }

            let mut block_ptr = blocks.as_mut_ptr() as *mut __m128i;
            for v in vs {
                let t: [__m128i; 16] = core::mem::transmute(v);
                for i in 0..4 {
                    _mm_storeu_si128(block_ptr.add(i), t[4 * i]);
                    _mm_storeu_si128(block_ptr.add(4 + i), t[4 * i + 1]);
                    _mm_storeu_si128(block_ptr.add(8 + i), t[4 * i + 2]);
                    _mm_storeu_si128(block_ptr.add(12 + i), t[4 * i + 3]);
                }
                block_ptr = block_ptr.add(16);
            }
        }
    }
}

#[cfg(feature = "rng")]
impl<R: Rounds, V: Variant> Backend<R, V> {
    #[inline(always)]
    fn rng_gen_par_ks_blocks(&mut self, blocks: &mut [u32; 64]) {
        unsafe {
            let vs = rounds::<R>(&self.v, &self.ctr);

            let pb = PAR_BLOCKS as i64;
            for c in self.ctr.iter_mut() {
                *c = _mm512_add_epi64(*c, _mm512_set_epi64(0, pb, 0, pb, 0, pb, 0, pb));
            }

            let mut block_ptr = blocks.as_mut_ptr() as *mut __m128i;
            for v in vs {
                let t: [__m128i; 16] = core::mem::transmute(v);
                for i in 0..4 {
                    _mm_storeu_si128(block_ptr.add(i), t[4 * i]);
                    _mm_storeu_si128(block_ptr.add(4 + i), t[4 * i + 1]);
                    _mm_storeu_si128(block_ptr.add(8 + i), t[4 * i + 2]);
                    _mm_storeu_si128(block_ptr.add(12 + i), t[4 * i + 3]);
                }
                block_ptr = block_ptr.add(16);
            }
        }
    }
}

#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn rounds<R: Rounds>(v: &[__m512i; 3], c: &[__m512i; N]) -> [[__m512i; 4]; N] {
    let mut vs: [[__m512i; 4]; N] = [[_mm512_setzero_si512(); 4]; N];
    for i in 0..N {
        vs[i] = [v[0], v[1], v[2], c[i]];
    }
    for _ in 0..R::COUNT {
        double_quarter_round(&mut vs);
    }

    for i in 0..N {
        for j in 0..3 {
            vs[i][j] = _mm512_add_epi32(vs[i][j], v[j]);
        }
        vs[i][3] = _mm512_add_epi32(vs[i][3], c[i]);
    }

    vs
}

#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn double_quarter_round(v: &mut [[__m512i; 4]; N]) {
    add_xor_rot(v);
    rows_to_cols(v);
    add_xor_rot(v);
    cols_to_rows(v);
}

/// The goal of this function is to transform the state words from:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
/// ```
///
/// to:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b1, b2, b3, b0] == [ 5,  6,  7,  4]
/// [c2, c3, c0, c1]    [10, 11,  8,  9]
/// [d3, d0, d1, d2]    [15, 12, 13, 14]
/// ```
///
/// so that we can apply [`add_xor_rot`] to the resulting columns, and have it compute the
/// "diagonal rounds" (as defined in RFC 7539) in parallel. In practice, this shuffle is
/// non-optimal: the last state word to be altered in `add_xor_rot` is `b`, so the shuffle
/// blocks on the result of `b` being calculated.
///
/// We can optimize this by observing that the four quarter rounds in `add_xor_rot` are
/// data-independent: they only access a single column of the state, and thus the order of
/// the columns does not matter. We therefore instead shuffle the other three state words,
/// to obtain the following equivalent layout:
/// ```text
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
/// ```
///
/// See <https://github.com/sneves/blake2-avx2/pull/4> for additional details. The earliest
/// known occurrence of this optimization is in floodyberry's SSE4 ChaCha code from 2014:
/// - <https://github.com/floodyberry/chacha-opt/blob/0ab65cb99f5016633b652edebaf3691ceb4ff753/chacha_blocks_ssse3-64.S#L639-L643>
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn rows_to_cols(vs: &mut [[__m512i; 4]; N]) {
    // c >>>= 32; d >>>= 64; a >>>= 96;
    for [a, _, c, d] in vs {
        *c = _mm512_shuffle_epi32::<0b_00_11_10_01>(*c); // _MM_SHUFFLE(0, 3, 2, 1)
        *d = _mm512_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2)
        *a = _mm512_shuffle_epi32::<0b_10_01_00_11>(*a); // _MM_SHUFFLE(2, 1, 0, 3)
    }
}

/// The goal of this function is to transform the state words from:
/// ```text
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
/// ```
///
/// to:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
/// ```
///
/// reversing the transformation of [`rows_to_cols`].
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn cols_to_rows(vs: &mut [[__m512i; 4]; N]) {
    // c <<<= 32; d <<<= 64; a <<<= 96;
    for [a, _, c, d] in vs {
        *c = _mm512_shuffle_epi32::<0b_10_01_00_11>(*c); // _MM_SHUFFLE(2, 1, 0, 3)
        *d = _mm512_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2)
        *a = _mm512_shuffle_epi32::<0b_00_11_10_01>(*a); // _MM_SHUFFLE(0, 3, 2, 1)
    }
}

#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn add_xor_rot(vs: &mut [[__m512i; 4]; N]) {
    // a += b; d ^= a; d <<<= (16, 16, 16, 16);
    for [a, b, _, d] in vs.iter_mut() {
        *a = _mm512_add_epi32(*a, *b);
        *d = _mm512_xor_si512(*d, *a);
        *d = _mm512_rol_epi32::<16>(*d);
    }

    // c += d; b ^= c; b <<<= (12, 12, 12, 12);
    for [_, b, c, d] in vs.iter_mut() {
        *c = _mm512_add_epi32(*c, *d);
        *b = _mm512_xor_si512(*b, *c);
        *b = _mm512_rol_epi32::<12>(*b);
    }

    // a += b; d ^= a; d <<<= (8, 8, 8, 8);
    for [a, b, _, d] in vs.iter_mut() {
        *a = _mm512_add_epi32(*a, *b);
        *d = _mm512_xor_si512(*d, *a);
        *d = _mm512_rol_epi32::<8>(*d);
    }

    // c += d; b ^= c; b <<<= (7, 7, 7, 7);
    for [_, b, c, d] in vs.iter_mut() {
        *c = _mm512_add_epi32(*c, *d);
        *b = _mm512_xor_si512(*b, *c);
        *b = _mm512_rol_epi32::<7>(*b);
    }
}
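
For reference, a scalar rendering of the quarter-round sequence that add_xor_rot applies lane-wise (RFC 7539, section 2.1). This helper is illustrative only, not part of the commit:

    // One ChaCha quarter round on four state words. `add_xor_rot` performs
    // exactly this sequence on whole rows, for four blocks at once per
    // `__m512i`, using `_mm512_rol_epi32` for the rotations.
    fn quarter_round(a: &mut u32, b: &mut u32, c: &mut u32, d: &mut u32) {
        *a = a.wrapping_add(*b); *d ^= *a; *d = d.rotate_left(16);
        *c = c.wrapping_add(*d); *b ^= *c; *b = b.rotate_left(12);
        *a = a.wrapping_add(*b); *d ^= *a; *d = d.rotate_left(8);
        *c = c.wrapping_add(*d); *b ^= *c; *b = b.rotate_left(7);
    }

The new backend is only reachable through the high-level cipher traits. A typical invocation, sketched with the crate's public API on the assumption that it is unchanged by this commit (key and nonce values are placeholders):

    use chacha20::ChaCha20;
    use chacha20::cipher::{KeyIvInit, StreamCipher};

    fn demo() {
        let key = [0x42u8; 32]; // placeholder key
        let nonce = [0x24u8; 12]; // placeholder nonce
        let mut cipher = ChaCha20::new(&key.into(), &nonce.into());
        let mut buf = [0u8; 1024]; // zeroed input yields the raw keystream
        cipher.apply_keystream(&mut buf); // backend selected by CPU features
    }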
