diff --git a/.github/workflows/chacha20.yml b/.github/workflows/chacha20.yml index d79583e7..ac5cf746 100644 --- a/.github/workflows/chacha20.yml +++ b/.github/workflows/chacha20.yml @@ -20,6 +20,9 @@ defaults: env: CARGO_INCREMENTAL: 0 RUSTFLAGS: "-Dwarnings" + # NOTE: The mirror number changes with each version so keep these in sync + SDE_FULL_VERSION_MIRROR: "859732" + SDE_FULL_VERSION: "9.58.0-2025-06-16" jobs: build: @@ -79,6 +82,43 @@ jobs: - run: cargo check --target ${{ matrix.target }} --all-features - run: cargo hack test --feature-powerset --target ${{ matrix.target }} + # Tests for the AVX-512 backend + avx512: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + rust: stable + RUSTFLAGS: "-Dwarnings --cfg chacha20_avx512" + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: ${{ matrix.RUSTFLAGS }} + steps: + - uses: actions/checkout@v4 + - name: Install Intel SDE + run: | + curl -JLO "https://downloadmirror.intel.com/${{ env.SDE_FULL_VERSION_MIRROR }}/sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz" + tar xvf sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz -C /opt + echo "/opt/sde-external-${{ env.SDE_FULL_VERSION }}-lin" >> $GITHUB_PATH + - uses: RustCrypto/actions/cargo-cache@master + - uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + targets: ${{ matrix.target }} + # NOTE: Write a `.cargo/config.toml` to configure the target for AVX-512 + # NOTE: We use intel-sde as the runner since not all GitHub CI hosts support AVX512 + - name: write .cargo/config.toml + shell: bash + run: | + cd ../chacha20/.. + mkdir -p .cargo + echo '[target.${{ matrix.target }}]' > .cargo/config.toml + echo 'runner = "sde64 -future --"' >> .cargo/config.toml + - run: ${{ matrix.deps }} + - run: cargo test --target ${{ matrix.target }} + - run: cargo test --target ${{ matrix.target }} --all-features + # Tests for the AVX2 backend avx2: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 584a71db..9ea64bb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -58,6 +58,7 @@ dependencies = [ "cfg-if", "cipher", "cpufeatures", + "hex", "hex-literal", "proptest", "rand_chacha", @@ -138,6 +139,12 @@ dependencies = [ "hex-literal", ] +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hex-literal" version = "1.1.0" diff --git a/chacha20/Cargo.toml b/chacha20/Cargo.toml index 6e732524..73255596 100644 --- a/chacha20/Cargo.toml +++ b/chacha20/Cargo.toml @@ -34,6 +34,7 @@ cipher = { version = "0.5.0-rc.1", features = ["dev"] } hex-literal = "1" proptest = "1" rand_chacha = "0.9" +hex = "0.4" [features] default = ["cipher"] @@ -48,9 +49,11 @@ rustdoc-args = ["--cfg", "docsrs"] [lints.rust.unexpected_cfgs] level = "warn" check-cfg = [ + 'cfg(chacha20_avx512)', 'cfg(chacha20_force_soft)', 'cfg(chacha20_force_sse2)', 'cfg(chacha20_force_avx2)', + 'cfg(chacha20_force_avx512)', ] [lints.clippy] diff --git a/chacha20/README.md b/chacha20/README.md index 21a82172..72ce44c6 100644 --- a/chacha20/README.md +++ b/chacha20/README.md @@ -32,6 +32,7 @@ work on stable Rust with the following `RUSTFLAGS`: - `x86` / `x86_64` - `avx2`: (~1.4cpb) `-Ctarget-cpu=haswell -Ctarget-feature=+avx2` - `sse2`: (~1.6cpb) `-Ctarget-feature=+sse2` (on by default on x86 CPUs) + - `avx512`: `-Ctarget-feature=+avx512f,+avx512vl --cfg chacha20_avx512` requires Rust 1.89+ - `aarch64` - `neon` (~2-3x faster than `soft`) requires Rust 1.61+ and the `neon` feature enabled - Portable diff --git a/chacha20/src/backends.rs b/chacha20/src/backends.rs index 936e0b67..44f04c63 100644 --- a/chacha20/src/backends.rs +++ b/chacha20/src/backends.rs @@ -7,12 +7,19 @@ cfg_if! { pub(crate) mod soft; } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if! { - if #[cfg(chacha20_force_avx2)] { + if #[cfg(all(chacha20_avx512, chacha20_force_avx512))] { + pub(crate) mod avx512; + // AVX-2 backend needed for RNG if enabled + #[cfg(feature = "rng")] + pub(crate) mod avx2; + } else if #[cfg(chacha20_force_avx2)] { pub(crate) mod avx2; } else if #[cfg(chacha20_force_sse2)] { pub(crate) mod sse2; } else { pub(crate) mod soft; + #[cfg(chacha20_avx512)] + pub(crate) mod avx512; pub(crate) mod avx2; pub(crate) mod sse2; } diff --git a/chacha20/src/backends/avx2.rs b/chacha20/src/backends/avx2.rs index 7de35100..d5e4418b 100644 --- a/chacha20/src/backends/avx2.rs +++ b/chacha20/src/backends/avx2.rs @@ -27,6 +27,7 @@ const N: usize = PAR_BLOCKS / 2; #[inline] #[target_feature(enable = "avx2")] #[cfg(feature = "cipher")] +#[cfg_attr(chacha20_force_avx512, expect(unused))] pub(crate) unsafe fn inner(state: &mut [u32; STATE_WORDS], f: F) where R: Rounds, diff --git a/chacha20/src/backends/avx512.rs b/chacha20/src/backends/avx512.rs new file mode 100644 index 00000000..d8da59c8 --- /dev/null +++ b/chacha20/src/backends/avx512.rs @@ -0,0 +1,625 @@ +#![allow(unsafe_op_in_unsafe_fn)] +use crate::{Rounds, Variant}; +use core::marker::PhantomData; + +#[cfg(feature = "cipher")] +use crate::{STATE_WORDS, chacha::Block}; + +#[cfg(feature = "cipher")] +use cipher::{ + BlockSizeUser, ParBlocks, ParBlocksSizeUser, StreamCipherBackend, StreamCipherClosure, + consts::{U16, U64}, +}; + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +/// Maximum number of blocks processed in parallel. +/// We also support 8 and 4 in gen_tail_blocks. +const MAX_PAR_BLOCKS: usize = 16; + +/// Divisor to compute `N`, the number of __m512i needed +/// to represent a number of parallel blocks. +const BLOCKS_PER_VECTOR: usize = 4; + +const MAX_N: usize = MAX_PAR_BLOCKS / BLOCKS_PER_VECTOR; + +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg(feature = "cipher")] +pub(crate) unsafe fn inner(state: &mut [u32; STATE_WORDS], f: F) +where + R: Rounds, + F: StreamCipherClosure, + V: Variant, +{ + let simd_state = state.as_mut_ptr().cast::(); + + let mut backend = Backend:: { + state: [ + _mm_loadu_epi32(simd_state), + _mm_loadu_epi32(simd_state.add(4)), + _mm_loadu_epi32(simd_state.add(8)), + ], + ctr: _mm_loadu_epi32(simd_state.add(12)), + _pd: PhantomData, + }; + + f.call(&mut backend); + + // Update counter in the persistent state + state[12] = _mm_extract_epi32::<0>(backend.ctr) as u32; + if size_of::() == 8 { + state[13] = _mm_extract_epi32::<1>(backend.ctr) as u32; + } +} + +struct Backend { + state: [__m128i; 3], + ctr: __m128i, + _pd: PhantomData<(R, V)>, +} + +#[cfg(feature = "cipher")] +impl Backend { + #[inline] + #[target_feature(enable = "avx512f", enable = "avx512vl")] + unsafe fn increment_ctr(&mut self, amount: usize) { + match size_of::() { + 4 => { + self.ctr = _mm_add_epi32(self.ctr, _mm_set_epi32(0, 0, 0, amount as i32)); + } + 8 => { + self.ctr = _mm_add_epi64(self.ctr, _mm_set_epi64x(0, amount as i64)); + } + _ => unreachable!(), + } + } + + /// Generates blocks using the 512-bit-wide dispatch + /// with up to `N` vectors processed in parallel, producing + /// `N * BLOCKS_PER_VECTOR` blocks. + #[inline] + #[target_feature(enable = "avx512f", enable = "avx512vl")] + unsafe fn gen_blocks_fullwidth(&mut self, blocks: &mut [Block]) { + let par_blocks = N * BLOCKS_PER_VECTOR; + assert!(blocks.len() <= par_blocks); + + let mut ctrs = [_mm512_broadcast_i32x4(self.ctr); N]; + for i in 0..ctrs.len() { + match size_of::() { + 4 => { + ctrs[i] = _mm512_add_epi32( + ctrs[i], + _mm512_set_epi32( + 0, + 0, + 0, + (i * BLOCKS_PER_VECTOR + 3) as i32, + 0, + 0, + 0, + (i * BLOCKS_PER_VECTOR + 2) as i32, + 0, + 0, + 0, + (i * BLOCKS_PER_VECTOR + 1) as i32, + 0, + 0, + 0, + (i * BLOCKS_PER_VECTOR) as i32, + ), + ); + } + 8 => { + ctrs[i] = _mm512_add_epi64( + ctrs[i], + _mm512_set_epi64( + 0, + (i * BLOCKS_PER_VECTOR + 3) as i64, + 0, + (i * BLOCKS_PER_VECTOR + 2) as i64, + 0, + (i * BLOCKS_PER_VECTOR + 1) as i64, + 0, + (i * BLOCKS_PER_VECTOR) as i64, + ), + ); + } + _ => unreachable!(), + } + } + + self.increment_ctr(blocks.len()); + + let result = rounds::(&self.state.map(|v| _mm512_broadcast_i32x4(v)), &ctrs); + + for i in 0..N { + let result_vectors = result[i]; + + // We have our data in SIMD vectors in the following layout + // (using a, b, c, and d to indicate the resp. 4 rows of each block, + // and Bn to denote the nth block): + // result_vectors[0]: + // B0a0 B0a1 B0a2 B0a3 + // B1a0 B1a1 B1a2 B1a3 + // ... + // B3a0 B3a1 B3a2 B3a2 + // + // result_vectors[1]: + // B0b0 B0b1 B0b2 B0b3 + // B1b0 B1b1 B1b2 B1b3 + // ... + // B3b0 B3b1 B3b2 B3b2 + // + // and so on for result_vectors[2] (storing c values) and result_vectors[3] (storing d values). + // + // To store to memory, we need to transpose to the following format: + // transposed[0]: + // B0a0 B0a1 B0a2 B0a3 + // B0b0 B0b1 B0b2 B0b3 + // B0c0 B0c1 B0c2 B0c3 + // B0d0 B0d1 B0d2 B0d3 + // + // and so on, such that each 512-bit SIMD vector + // contains a single contiguous block. + // + // We achieve this transposition using the following + // sequence of shuffles. + + let temp_abab_block01 = _mm512_permutex2var_epi64( + result_vectors[0], + _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11), + result_vectors[1], + ); + let temp_abab_block23 = _mm512_permutex2var_epi64( + result_vectors[0], + _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15), + result_vectors[1], + ); + + let temp_cdcd_block01 = _mm512_permutex2var_epi64( + result_vectors[2], + _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11), + result_vectors[3], + ); + let temp_cdcd_block23 = _mm512_permutex2var_epi64( + result_vectors[2], + _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15), + result_vectors[3], + ); + + let block0 = + _mm512_shuffle_i32x4::<0b01_00_01_00>(temp_abab_block01, temp_cdcd_block01); + let block1 = + _mm512_shuffle_i32x4::<0b11_10_11_10>(temp_abab_block01, temp_cdcd_block01); + let block2 = + _mm512_shuffle_i32x4::<0b01_00_01_00>(temp_abab_block23, temp_cdcd_block23); + let block3 = + _mm512_shuffle_i32x4::<0b11_10_11_10>(temp_abab_block23, temp_cdcd_block23); + + for (j, src_block) in [block0, block1, block2, block3].into_iter().enumerate() { + let dst_index = i * BLOCKS_PER_VECTOR + j; + if dst_index < blocks.len() { + _mm512_storeu_si512((&raw mut blocks[dst_index]).cast(), src_block); + } + } + } + } + + /// Generates up to 2 blocks using 256-bit vectors. + #[inline] + #[target_feature(enable = "avx512f", enable = "avx512vl")] + unsafe fn gen_blocks_halfwidth(&mut self, blocks: &mut [Block]) { + assert!(blocks.len() <= 2); + + let mut ctr = _mm256_broadcast_i32x4(self.ctr); + + match size_of::() { + 4 => { + ctr = _mm256_add_epi32(ctr, _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 0)); + } + 8 => { + ctr = _mm256_add_epi64(ctr, _mm256_set_epi64x(0, 1, 0, 0)); + } + _ => unreachable!(), + } + + self.increment_ctr(blocks.len()); + + let block_vectors = rounds_halfwide::([ + _mm256_broadcast_i32x4(self.state[0]), + _mm256_broadcast_i32x4(self.state[1]), + _mm256_broadcast_i32x4(self.state[2]), + ctr, + ]); + + // Similar transpose operation as + // in gen_blocks_fullwidth. + + let block0_ab = _mm256_permutex2var_epi64( + block_vectors[0], + _mm256_setr_epi64x(0, 1, 4, 5), + block_vectors[1], + ); + let block0_cd = _mm256_permutex2var_epi64( + block_vectors[2], + _mm256_setr_epi64x(0, 1, 4, 5), + block_vectors[3], + ); + let block1_ab = _mm256_permutex2var_epi64( + block_vectors[0], + _mm256_setr_epi64x(2, 3, 6, 7), + block_vectors[1], + ); + let block1_cd = _mm256_permutex2var_epi64( + block_vectors[2], + _mm256_setr_epi64x(2, 3, 6, 7), + block_vectors[3], + ); + + for (i, (block_part_ab, block_part_cd)) in [(block0_ab, block0_cd), (block1_ab, block1_cd)] + .into_iter() + .enumerate() + { + if i < blocks.len() { + let dst = (&raw mut blocks[i]).cast::(); + _mm256_storeu_epi32(dst, block_part_ab); + _mm256_storeu_epi32( + dst.add(size_of::() / 2 / size_of::()), + block_part_cd, + ); + } + } + } +} + +#[cfg(feature = "cipher")] +impl BlockSizeUser for Backend { + type BlockSize = U64; +} + +#[cfg(feature = "cipher")] +impl ParBlocksSizeUser for Backend { + type ParBlocksSize = U16; +} + +#[cfg(feature = "cipher")] +impl StreamCipherBackend for Backend { + #[inline] + fn gen_par_ks_blocks(&mut self, blocks: &mut ParBlocks) { + unsafe { self.gen_blocks_fullwidth::(blocks) } + } + + #[inline(always)] + fn gen_ks_block(&mut self, block: &mut Block) { + // Fallback for generating a single block using quarter-width vectors + // (128). + + unsafe { + let state = [self.state[0], self.state[1], self.state[2], self.ctr]; + + self.increment_ctr(1); + + let result = rounds_quarterwide::(state); + + for row in 0..4 { + let dst = block.as_mut_ptr().cast::().add(row * 4); + _mm_storeu_epi32(dst, result[row]); + } + } + } + + #[inline] + fn gen_tail_blocks(&mut self, blocks: &mut [cipher::Block]) { + assert!(blocks.len() < MAX_PAR_BLOCKS); + + if blocks.is_empty() { + return; + } + + // Fallback for generating a number of blocks less than + // MAX_PAR_BLOCKS. + unsafe { + if blocks.len() == 1 { + self.gen_ks_block(&mut blocks[0]); + } else if blocks.len() == 2 { + self.gen_blocks_halfwidth(blocks); + } else if blocks.len() <= 4 { + self.gen_blocks_fullwidth::<1>(blocks); + } else if blocks.len() <= 8 { + self.gen_blocks_fullwidth::<2>(blocks); + } else { + self.gen_blocks_fullwidth::(blocks); + } + } + } +} + +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn rounds( + v: &[__m512i; 3], + c: &[__m512i; N], +) -> [[__m512i; 4]; N] { + let mut vs: [[__m512i; 4]; N] = [[_mm512_setzero_si512(); 4]; N]; + for i in 0..N { + vs[i] = [v[0], v[1], v[2], c[i]]; + } + for _ in 0..R::COUNT { + double_quarter_round(&mut vs); + } + + for i in 0..N { + for j in 0..3 { + vs[i][j] = _mm512_add_epi32(vs[i][j], v[j]); + } + vs[i][3] = _mm512_add_epi32(vs[i][3], c[i]); + } + + vs +} + +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn double_quarter_round(v: &mut [[__m512i; 4]; N]) { + add_xor_rot(v); + rows_to_cols(v); + add_xor_rot(v); + cols_to_rows(v); +} + +/// The goal of this function is to transform the state words from: +/// ```text +/// [a0, a1, a2, a3] [ 0, 1, 2, 3] +/// [b0, b1, b2, b3] == [ 4, 5, 6, 7] +/// [c0, c1, c2, c3] [ 8, 9, 10, 11] +/// [d0, d1, d2, d3] [12, 13, 14, 15] +/// ``` +/// +/// to: +/// ```text +/// [a0, a1, a2, a3] [ 0, 1, 2, 3] +/// [b1, b2, b3, b0] == [ 5, 6, 7, 4] +/// [c2, c3, c0, c1] [10, 11, 8, 9] +/// [d3, d0, d1, d2] [15, 12, 13, 14] +/// ``` +/// +/// so that we can apply [`add_xor_rot`] to the resulting columns, and have it compute the +/// "diagonal rounds" (as defined in RFC 7539) in parallel. In practice, this shuffle is +/// non-optimal: the last state word to be altered in `add_xor_rot` is `b`, so the shuffle +/// blocks on the result of `b` being calculated. +/// +/// We can optimize this by observing that the four quarter rounds in `add_xor_rot` are +/// data-independent: they only access a single column of the state, and thus the order of +/// the columns does not matter. We therefore instead shuffle the other three state words, +/// to obtain the following equivalent layout: +/// ```text +/// [a3, a0, a1, a2] [ 3, 0, 1, 2] +/// [b0, b1, b2, b3] == [ 4, 5, 6, 7] +/// [c1, c2, c3, c0] [ 9, 10, 11, 8] +/// [d2, d3, d0, d1] [14, 15, 12, 13] +/// ``` +/// +/// See https://github.com/sneves/blake2-avx2/pull/4 for additional details. The earliest +/// known occurrence of this optimization is in floodyberry's SSE4 ChaCha code from 2014: +/// - https://github.com/floodyberry/chacha-opt/blob/0ab65cb99f5016633b652edebaf3691ceb4ff753/chacha_blocks_ssse3-64.S#L639-L643 +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn rows_to_cols(vs: &mut [[__m512i; 4]; N]) { + // c >>>= 32; d >>>= 64; a >>>= 96; + for [a, _, c, d] in vs { + *c = _mm512_shuffle_epi32::<0b_00_11_10_01>(*c); // _MM_SHUFFLE(0, 3, 2, 1) + *d = _mm512_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm512_shuffle_epi32::<0b_10_01_00_11>(*a); // _MM_SHUFFLE(2, 1, 0, 3) + } +} + +/// The goal of this function is to transform the state words from: +/// ```text +/// [a3, a0, a1, a2] [ 3, 0, 1, 2] +/// [b0, b1, b2, b3] == [ 4, 5, 6, 7] +/// [c1, c2, c3, c0] [ 9, 10, 11, 8] +/// [d2, d3, d0, d1] [14, 15, 12, 13] +/// ``` +/// +/// to: +/// ```text +/// [a0, a1, a2, a3] [ 0, 1, 2, 3] +/// [b0, b1, b2, b3] == [ 4, 5, 6, 7] +/// [c0, c1, c2, c3] [ 8, 9, 10, 11] +/// [d0, d1, d2, d3] [12, 13, 14, 15] +/// ``` +/// +/// reversing the transformation of [`rows_to_cols`]. +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn cols_to_rows(vs: &mut [[__m512i; 4]; N]) { + // c <<<= 32; d <<<= 64; a <<<= 96; + for [a, _, c, d] in vs { + *c = _mm512_shuffle_epi32::<0b_10_01_00_11>(*c); // _MM_SHUFFLE(2, 1, 0, 3) + *d = _mm512_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm512_shuffle_epi32::<0b_00_11_10_01>(*a); // _MM_SHUFFLE(0, 3, 2, 1) + } +} + +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn add_xor_rot(vs: &mut [[__m512i; 4]; N]) { + // a += b; d ^= a; d <<<= (16, 16, 16, 16); + for [a, b, _, d] in vs.iter_mut() { + *a = _mm512_add_epi32(*a, *b); + *d = _mm512_xor_si512(*d, *a); + *d = _mm512_rol_epi32::<16>(*d); + } + + // c += d; b ^= c; b <<<= (12, 12, 12, 12); + for [_, b, c, d] in vs.iter_mut() { + *c = _mm512_add_epi32(*c, *d); + *b = _mm512_xor_si512(*b, *c); + *b = _mm512_rol_epi32::<12>(*b); + } + + // a += b; d ^= a; d <<<= (8, 8, 8, 8); + for [a, b, _, d] in vs.iter_mut() { + *a = _mm512_add_epi32(*a, *b); + *d = _mm512_xor_si512(*d, *a); + *d = _mm512_rol_epi32::<8>(*d); + } + + // c += d; b ^= c; b <<<= (7, 7, 7, 7); + for [_, b, c, d] in vs.iter_mut() { + *c = _mm512_add_epi32(*c, *d); + *b = _mm512_xor_si512(*b, *c); + *b = _mm512_rol_epi32::<7>(*b); + } +} + +// Below is another implementation of the round application +// that uses 256-bit vectors instead of 512-bit (but, unlike +// the avx2 module, can use new AVX-512 instructions like rotates). +// It is used for tail processing of shorter outputs, +// since 256-bit instructions can be faster and lower latency +// than 512-bit instructions on certain microarchitectures (e.g. Zen 4). + +#[inline] +#[target_feature(enable = "avx512f", enable = "avx512vl")] +unsafe fn rounds_halfwide(v_in: [__m256i; 4]) -> [__m256i; 4] { + let mut v = v_in; + + for _ in 0..R::COUNT { + double_quarter_round_halfwide(&mut v); + } + + for (a, b) in v.iter_mut().zip(v_in) { + *a = _mm256_add_epi32(*a, b); + } + + v +} + +#[inline] +#[target_feature(enable = "avx512f", enable = "avx512vl")] +unsafe fn double_quarter_round_halfwide(v: &mut [__m256i; 4]) { + add_xor_rot_halfwide(v); + rows_to_cols_halfwide(v); + add_xor_rot_halfwide(v); + cols_to_rows_halfwide(v); +} + +#[inline] +#[target_feature(enable = "avx512f", enable = "avx512vl")] +unsafe fn rows_to_cols_halfwide(v: &mut [__m256i; 4]) { + // c >>>= 32; d >>>= 64; a >>>= 96; + let [a, _, c, d] = v; + *c = _mm256_shuffle_epi32::<0b_00_11_10_01>(*c); // _MM_SHUFFLE(0, 3, 2, 1) + *d = _mm256_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm256_shuffle_epi32::<0b_10_01_00_11>(*a); // _MM_SHUFFLE(2, 1, 0, 3) +} + +#[inline] +#[target_feature(enable = "avx512f", enable = "avx512vl")] +unsafe fn cols_to_rows_halfwide(v: &mut [__m256i; 4]) { + // c <<<= 32; d <<<= 64; a <<<= 96; + let [a, _, c, d] = v; + *c = _mm256_shuffle_epi32::<0b_10_01_00_11>(*c); // _MM_SHUFFLE(2, 1, 0, 3) + *d = _mm256_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm256_shuffle_epi32::<0b_00_11_10_01>(*a); // _MM_SHUFFLE(0, 3, 2, 1) +} + +#[inline] +#[target_feature(enable = "avx512f", enable = "avx512vl")] +unsafe fn add_xor_rot_halfwide(v: &mut [__m256i; 4]) { + let [a, b, c, d] = v; + + // a += b; d ^= a; d <<<= (16, 16, 16, 16); + *a = _mm256_add_epi32(*a, *b); + *d = _mm256_xor_si256(*d, *a); + *d = _mm256_rol_epi32::<16>(*d); + + // c += d; b ^= c; b <<<= (12, 12, 12, 12); + *c = _mm256_add_epi32(*c, *d); + *b = _mm256_xor_si256(*b, *c); + *b = _mm256_rol_epi32::<12>(*b); + + // a += b; d ^= a; d <<<= (8, 8, 8, 8); + *a = _mm256_add_epi32(*a, *b); + *d = _mm256_xor_si256(*d, *a); + *d = _mm256_rol_epi32::<8>(*d); + + // c += d; b ^= c; b <<<= (7, 7, 7, 7); + *c = _mm256_add_epi32(*c, *d); + *b = _mm256_xor_si256(*b, *c); + *b = _mm256_rol_epi32::<7>(*b); +} + +// Finally, below is an implementation using 128-bit vectors +// for the case of generating a single block. + +#[inline(always)] +unsafe fn rounds_quarterwide(v_in: [__m128i; 4]) -> [__m128i; 4] { + let mut v = v_in; + + for _ in 0..R::COUNT { + double_quarter_round_quarterwide(&mut v); + } + + for (a, b) in v.iter_mut().zip(v_in) { + *a = _mm_add_epi32(*a, b); + } + + v +} + +#[inline(always)] +unsafe fn double_quarter_round_quarterwide(v: &mut [__m128i; 4]) { + add_xor_rot_quarterwide(v); + rows_to_cols_quarterwide(v); + add_xor_rot_quarterwide(v); + cols_to_rows_quarterwide(v); +} + +#[inline(always)] +unsafe fn rows_to_cols_quarterwide(v: &mut [__m128i; 4]) { + // c >>>= 32; d >>>= 64; a >>>= 96; + let [a, _, c, d] = v; + *c = _mm_shuffle_epi32::<0b_00_11_10_01>(*c); // _MM_SHUFFLE(0, 3, 2, 1) + *d = _mm_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm_shuffle_epi32::<0b_10_01_00_11>(*a); // _MM_SHUFFLE(2, 1, 0, 3) +} + +#[inline(always)] +unsafe fn cols_to_rows_quarterwide(v: &mut [__m128i; 4]) { + // c <<<= 32; d <<<= 64; a <<<= 96; + let [a, _, c, d] = v; + *c = _mm_shuffle_epi32::<0b_10_01_00_11>(*c); // _MM_SHUFFLE(2, 1, 0, 3) + *d = _mm_shuffle_epi32::<0b_01_00_11_10>(*d); // _MM_SHUFFLE(1, 0, 3, 2) + *a = _mm_shuffle_epi32::<0b_00_11_10_01>(*a); // _MM_SHUFFLE(0, 3, 2, 1) +} + +#[inline(always)] +unsafe fn add_xor_rot_quarterwide(v: &mut [__m128i; 4]) { + let [a, b, c, d] = v; + + // a += b; d ^= a; d <<<= (16, 16, 16, 16); + *a = _mm_add_epi32(*a, *b); + *d = _mm_xor_si128(*d, *a); + *d = _mm_rol_epi32::<16>(*d); + + // c += d; b ^= c; b <<<= (12, 12, 12, 12); + *c = _mm_add_epi32(*c, *d); + *b = _mm_xor_si128(*b, *c); + *b = _mm_rol_epi32::<12>(*b); + + // a += b; d ^= a; d <<<= (8, 8, 8, 8); + *a = _mm_add_epi32(*a, *b); + *d = _mm_xor_si128(*d, *a); + *d = _mm_rol_epi32::<8>(*d); + + // c += d; b ^= c; b <<<= (7, 7, 7, 7); + *c = _mm_add_epi32(*c, *d); + *b = _mm_xor_si128(*b, *c); + *b = _mm_rol_epi32::<7>(*b); +} diff --git a/chacha20/src/lib.rs b/chacha20/src/lib.rs index d9846f71..ca226440 100644 --- a/chacha20/src/lib.rs +++ b/chacha20/src/lib.rs @@ -185,7 +185,12 @@ cfg_if! { type Tokens = (); } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if! { - if #[cfg(chacha20_force_avx2)] { + if #[cfg(all(chacha20_avx512, chacha20_force_avx512))] { + #[cfg(not(all(target_feature = "avx512f", target_feature = "avx512vl")))] + compile_error!("You must enable `avx512f` and `avx512vl` target features with \ + `chacha20_force_avx512` configuration option"); + type Tokens = (); + } else if #[cfg(chacha20_force_avx2)] { #[cfg(not(target_feature = "avx2"))] compile_error!("You must enable `avx2` target feature with \ `chacha20_force_avx2` configuration option"); @@ -196,8 +201,13 @@ cfg_if! { `chacha20_force_sse2` configuration option"); type Tokens = (); } else { + #[cfg(chacha20_avx512)] + cpufeatures::new!(avx512_cpuid, "avx512f", "avx512vl"); cpufeatures::new!(avx2_cpuid, "avx2"); cpufeatures::new!(sse2_cpuid, "sse2"); + #[cfg(chacha20_avx512)] + type Tokens = (avx512_cpuid::InitToken, avx2_cpuid::InitToken, sse2_cpuid::InitToken); + #[cfg(not(chacha20_avx512))] type Tokens = (avx2_cpuid::InitToken, sse2_cpuid::InitToken); } } @@ -247,10 +257,14 @@ impl ChaChaCore { let tokens = (); } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if! { - if #[cfg(chacha20_force_avx2)] { + if #[cfg(chacha20_force_avx512)] { + let tokens = (); + } else if #[cfg(chacha20_force_avx2)] { let tokens = (); } else if #[cfg(chacha20_force_sse2)] { let tokens = (); + } else if #[cfg(chacha20_avx512)] { + let tokens = (avx512_cpuid::init(), avx2_cpuid::init(), sse2_cpuid::init()); } else { let tokens = (avx2_cpuid::init(), sse2_cpuid::init()); } @@ -298,7 +312,11 @@ impl StreamCipherCore for ChaChaCore { f.call(&mut backends::soft::Backend(self)); } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if! { - if #[cfg(chacha20_force_avx2)] { + if #[cfg(all(chacha20_avx512, chacha20_force_avx512))] { + unsafe { + backends::avx512::inner::(&mut self.state, f); + } + } else if #[cfg(chacha20_force_avx2)] { unsafe { backends::avx2::inner::(&mut self.state, f); } @@ -307,7 +325,18 @@ impl StreamCipherCore for ChaChaCore { backends::sse2::inner::(&mut self.state, f); } } else { + #[cfg(chacha20_avx512)] + let (avx512_token, avx2_token, sse2_token) = self.tokens; + #[cfg(not(chacha20_avx512))] let (avx2_token, sse2_token) = self.tokens; + + #[cfg(chacha20_avx512)] + if avx512_token.get() { + unsafe { + backends::avx512::inner::(&mut self.state, f); + } + return; + } if avx2_token.get() { unsafe { backends::avx2::inner::(&mut self.state, f); diff --git a/chacha20/src/rng.rs b/chacha20/src/rng.rs index 4c483442..303ac733 100644 --- a/chacha20/src/rng.rs +++ b/chacha20/src/rng.rs @@ -189,7 +189,8 @@ impl ChaChaCore { backends::soft::Backend(self).gen_ks_blocks(buffer); } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if! { - if #[cfg(chacha20_force_avx2)] { + // AVX-512 doesn't support RNG, so use AVX-2 instead + if #[cfg(any(chacha20_force_avx2, chacha20_force_avx512))] { unsafe { backends::avx2::rng_inner::(self, buffer); } @@ -198,7 +199,11 @@ impl ChaChaCore { backends::sse2::rng_inner::(self, buffer); } } else { + #[cfg(chacha20_avx512)] + let (_avx512_token, avx2_token, sse2_token) = self.tokens; + #[cfg(not(chacha20_avx512))] let (avx2_token, sse2_token) = self.tokens; + if avx2_token.get() { unsafe { backends::avx2::rng_inner::(self, buffer); diff --git a/chacha20/tests/data/chacha20_long_ciphertext.txt b/chacha20/tests/data/chacha20_long_ciphertext.txt new file mode 100644 index 00000000..e580bfd2 --- /dev/null +++ b/chacha20/tests/data/chacha20_long_ciphertext.txt @@ -0,0 +1 @@ +d15b418e477b13f6c6ca78bcefb019acd32419c9de046236edfc32a8d4a9c8e474d73f20d92f90b9805c6c71c56e29878d8c369841cfe817cf4af403456f1452c76d748c9c5d08cd2e8cf4c5c7ae2f0ef722817112a0d4b6ba0e867b5fe122013bb3c861b776b7106a19a75fcd60eead7f2315b47905fa376383f674bf7bc7c8a55b4ab6ad8f6315bf9d211e74c16bb7606af8f87ae5e3df9c5b8c52d1044b5bc559e1150d034b44e0886170e355e4a29bb0b21f7126a47b8d0f83eef6e6bb483998c4bb01ae1da66e750e4d1d3cd42c37b3649b8f35efdc650a2beec08883d6f5f84ad30d1e58601ebb228a80d6c95c82abf66a78415a2e48a105b08e2dd1ca52c6ece53a8b131950d04197ec39eb5fd0a17731dc2f223ee6d23c289966d3446ff1c4950ceb30f85c2adf8e5e1f876ffde011e15b8673409d4a349e20d2101b26b82efd950511cd92110bdfd19d0e97d6dcf3264f13fd56d1efc322193cff04f5fed90be5b734a4788450abe949e02b0ea7f5765587808ba41ad22a3d22a22626114cdad4aba17fc7cd617a7be04e2ff5f49f50b28ffdfd273d66c79b20f7f08cb8271ebfa8df83cf153f57cd74bb7dae8f8b39c107e29cc75c7d155acf4df04d4419b1a22439dc438a1cfcaa1047a05488f11d4d65eba77eb796a425bc3832798abf7867e1f8924e40875386ab4b38bccf01c3b0269bb4dd439c9d9ee70e024c7f6ccafc9aefbb1a6bfe761b86e42c967ff3a435d7acdbad4adbaef92acb8044abf1fcbd77baef27b84d96f0f63193467a961e2ca12d63c06d713386a354705bb837470b385a2fb2b57a12d6ec81fd432320c674f94f6dc97a0576ebe05c8e47c493e77bcfba7425db1cd7756be05e342c4536eee23e719a6ab4e1af9be1331f328af13988b7c99764e997840844cc89386d196a8ec60bc7c1a929a58963e6f7c7b646c444c16cb8c1a793e97c659639c2f11efa0e5543f5ec8bdbef6e43c03de0468310e36c0eceac4bd96f80cbe4f13c3d204b3b661b9d11c84616fa6c613a0d5c2b8b31827dad90a0307df6d566f4321430f76649c3d28c68c4ab60c3134e07453df1821c5fc90b265cc8d5c2d8c8956dba77a0d9b528f097736c82b172aa195ca4b20ac85e7eec64851631003c6c3ce41110b27385b7ded00178a508e5ed52fcb64938b543a38ea68d22b924e171548ae558e7b3a2b436652e7dda4f7f96e0732ba81dcf91ec3b064cc04ddb55c31847df992aee116fb91581223b6f8588151d9ade8ed5b4796ea8899028e06676695a307924b2caee25790c03a4322f0035f59e4d02bb26d40c75658166d1a39fcbfade6ae37615efe0bd3cce825caf5fe74a0b1f0ddf4ee23fdd5acefd3a88b38439f8b831f6393d8eee93d4d23efa496c93c2088405786e3a5b8b0de02654df35c6910817b065cb2bbebeab4d6b31250ccb885575aa4880250aa805ec1a80b4093499128b406e3e624b2481347c4292d9db13afbf0a75bad9df5a2111f2f2326d2d7543128706310c9f7517a25aa03aca6503f533446732885ff884ccfc66f5c771e3330a5c22c0b43b191b88262d1bc14833011bbf12a44345e7095ceab0fa824ed55b0a5d2cae20f5047e46e882b3e69a9f65775e6b455545b55c24669a0516d23e6fd96d60698ed5bac8c02df05086cd9f4b3398b087a10f1b5eecbf13ba38a626a5e2cadae279875e7efcfc732b353dc803538fa65cf55eafad56647be291ab5fbe6424ea416b419d77d864a72efdb9930e0cea4ec626f281844374595e08766466863e40be5fbd1d9937fb5d61e9d7387ef98540d564a9359eb21f43d1ac36cfbc7c06c702973b8c6d32bda7e2167185e44d8a7abab372073a63ab898ad5ef5068bdb90685fb83bddb7038c5e8273bf8a957c931d9e30528250017f68b02a1de4098d6acd1430826806eb78fbe8d87b7e07a3fb2aec5a49be204b1a74cb034196224bc3505c005a38954a69fc6328e610677ddd2d420a333e7623a5bd52ef38d8acdafe5f8b34f6fa6a088c47492d775f3930ae97ca3c1ec74807d40e8793fab12136fae771923cd285c60948f1e37ba7fd8c74afd96c4f4240eeb84595384f0fb93254b31ef9a1ad6ea46ecfac12bf2e37dd9b1d4998cdc85ffdcbc67da3d7aa7f207cfb0623e6a9b438d25ce \ No newline at end of file diff --git a/chacha20/tests/data/chacha20_long_plaintext.txt b/chacha20/tests/data/chacha20_long_plaintext.txt new file mode 100644 index 00000000..33089d38 --- /dev/null +++ b/chacha20/tests/data/chacha20_long_plaintext.txt @@ -0,0 +1 @@ +45d146d7d3f7fdb60414ff3c037ef4c086ef27096128e027f73f2038fcb9c273fb498b8ec49c7b61e3b91e7b220de438b1d5eb4fda9e31a4ac649e58cd84ea083289923909ac5cba5ff7b0375741bd6b79183c37dd8366622c21cde394e2623d91d887fd53cf865d0913390578c44829703a22e8e2797c8561e89155f7e643058d8b400e1277419cf8403f453a3c98ea9592fa0827597af9f64742a4742ed72eec3b6d5c9a8f592c485904f0af326c8cdff34d1d4b9456e66eaab2ccf6c20f7a908dec956d686dca4727ee374d68a80e23863cdf9ca2a8359c80596b314d8ad4461957a00aa13c2b05500f5a69cb549ded9939177c53796c94acab5caf369697ec8099295f45cb5cb04fed565b7843c3c47e27720ecea281762237d447d690e5c933894e9ca3a9ce1d4e05bb5e7030f0715db6a96c2ac81e336f1ea918a650ecdb1b101a080e40a52733df4d025009f796977673501fe3587409821157487dc4e2ba1e0be6ce6290bb460b35b5ed7417b281c369ef25482172d4440806a22473b137aecfd6fb80160b66523f212cc09afd327089c6d8725a53ebc9e8272492b281988da2695db40f5e53062ab59016f7f00d608a7100be1c5b8583738bd0f373465a253837a6a6c2d1353beb609a4db6f537462e53eedd6634e35ad853ed27240ba4445eed14fe51d7529c67e6892d1470a229565785d39c2a51c5bf0421a9f311a787f50b06d152640a967b1882aec7a33a1102783a8d517d669e0cd24c4dc7146cf9cc367b838f2ed302a9840a0d73d2603feea40006d03604a4998b098dfd0db44fcfcddbe061f81077c4d2a80bb69abde8bc7a643630b6f285acd21d00ff3284de985db3b9c07c67732a10eff0fc3b2c5f5ec8efa6acafb0d8bf876040236c49723ffdb3f59232f60cfd4444dd56c0152c178f8d9034132f51b4eb1728f7194a2b8aa9f9115e8b344175d59bd0c578dc77b4d3073e9686730a02ef9ccd0299911bbace3eebe3a3122064ef9473df50d1eeaba4b52240c072be25915c729f88a2dde680a41d6ce79a3db8570a4c1ee69b65ff64b84fae1141eea02f3b66bbc86f59b85fda66e813027c2e498523023817067b69782ac3c2b33c1a79ca15f091a4354ce022333b8862854905ea892d3d2c26639005c6d855d447b3456f331e17ba6c100f488d0acc0400f7c8342895aedbe070358f50ab5eb7eaf35f9f00b56391329394ea82ca2f72b04c443abf7ac8705a7b1fc9927530dfa3aeec8451fb6b9c7a488f6ef6bbf299b387f89903631f563b51c9936fb7c14c5ee082a0eda14d7a7b7f9c20d4ca646f63ffb5cf6e6bd2b42c3e4202650a08e9d88cdf84ed98c18356681f6659865c4d663e306c25d2ad090188058b0630a26e2d15548f48c352831e880430fe620171378255a23c13c4434276a8ebd1645a5bd67c39c2e69539816b475662d1767fd86d91a561e6970f16ba865d5975fe84769093563de82dad0faf08b1199fdba4d8cb7624ad75e261195dde028d999eac02bf3923aba780cff354893bf527a0e4acfaac9aeff881f52daec509595de66841600cdf0d99ea44de259d1ababa224026ef9f2e7064581b6f305e966c5b29204689855808a53dd6652297c32c0870db7b065838869d3a164db78ab43b65fc616ff5adb94ece0cf5cc5e2bed92a494e650a611d46f8341dc28c71816f26ba18eeb8a4174bb00b12b86af4e5f8224eb19d61f70b34d404fa631cadfbff5be2fc9e4ed77a2f7a914fce22777ab94f759d792091c236a6a18fbbe728380f6038aa2d18cd8a837772a156506ee6e9b836b7976fe544299054810dd08b6c9c9fe9e41115984e770f90dba9dcc2949b6f76787aad02b9119baaebfd183d732bd10ca6eb2fafcafa954911bbdc570bae72409b88fd308099f532623f22dedf51c64e83fd85952f9764b349d18b503a3ab530b84e242064e56297a932eaa0bbf82b4c1d8b825c75326436c4073091dd5e224b207309e8af373269330b6d1e275318c8ee8c7754c6fcafabb4790be220937b98bffcbfce714e781cfb8a5d6c45065128f8d8a66a0cd769b50bc6187ea48f0910a98b58a6f119cc1f8abddc2272af320f88600ac76201ecb0ad91ccbda22aacf4d1679185f8f954644bab2f78da7873beb1d769509c639e43d8e \ No newline at end of file diff --git a/chacha20/tests/kats.rs b/chacha20/tests/kats.rs index 4e4aa33c..7987fe70 100644 --- a/chacha20/tests/kats.rs +++ b/chacha20/tests/kats.rs @@ -96,6 +96,30 @@ mod chacha20test { } } +// Long input test to check the full parallel AVX-512 implementation. +// Test data generated from random byte strings. +#[cfg(feature = "cipher")] +mod chacha20test_long { + use chacha20::{ChaCha20, KeyIvInit}; + use cipher::StreamCipher; + use hex_literal::hex; + + const KEY: [u8; 32] = hex!("d387cb6ea45656c892a19d3706d1835d8e3cb11865431fa7133a09d1a1fc78da"); + + const IV: [u8; 12] = hex!("689f6a394fe2048a2400e005"); + + #[test] + fn chacha20_encryption() { + let mut cipher = ChaCha20::new(&KEY.into(), &IV.into()); + let mut buf = hex::decode(include_str!("data/chacha20_long_plaintext.txt")).unwrap(); + + cipher.apply_keystream(&mut buf); + + let ciphertext = hex::decode(include_str!("data/chacha20_long_ciphertext.txt")).unwrap(); + assert_eq!(&buf[..], ciphertext); + } +} + #[rustfmt::skip] #[cfg(feature = "xchacha")] mod xchacha20 {