From efbfa0463c793dc1319db10ca4e3b809937b227d Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Tue, 11 Feb 2020 14:13:30 -0500 Subject: [PATCH] integrate assembly implementations into the blake3 crate --- .github/workflows/ci.yml | 26 +++-- Cargo.toml | 19 +++- README.md | 20 ++-- b3sum/Cargo.toml | 4 +- benches/bench.rs | 162 ++++++++++++++------------------ build.rs | 107 ++++++++++++++++----- src/c_avx2.rs | 63 +++++++++++++ src/c_avx512.rs | 3 - src/c_neon.rs | 2 - src/c_sse41.rs | 114 ++++++++++++++++++++++ src/lib.rs | 38 +++++--- src/platform.rs | 89 ++++++++++++------ src/{avx2.rs => rust_avx2.rs} | 0 src/{sse41.rs => rust_sse41.rs} | 0 test_vectors/Cargo.toml | 8 +- test_vectors/cross_test.sh | 2 +- 16 files changed, 465 insertions(+), 192 deletions(-) create mode 100644 src/c_avx2.rs create mode 100644 src/c_sse41.rs rename src/{avx2.rs => rust_avx2.rs} (100%) rename src/{sse41.rs => rust_sse41.rs} (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3da4e5..db7decd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,22 +24,30 @@ jobs: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} profile: minimal override: true - # Default tests. - - run: cargo test - # No-default-features tests. + # Default tests plus Rayon. + - run: cargo test --features=rayon + # no_std tests. - run: cargo test --no-default-features - # More features tests. Note that "c_avx512" participates in dynamic feature - # detection, so it'll be built, but it probably won't run. - - run: cargo test --features=c_avx512,rayon + # Test the x86 assembly implementations. Use -vv to log compiler commands. + - run: cargo test --features=c -vv + # Test the C intrinsics implementations. Use -vv to log compiler commands. + - run: cargo test --features=c,c_prefer_intrinsics -vv # Test release mode. This does more iteratations in test_fuzz_hasher. - run: cargo test --release - # Test benchmarks. Nightly only. 
- - run: cargo test --benches - if: matrix.rust_version == 'nightly' + # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. + - run: cargo test --benches --features=c + env: + RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c + working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c,c_prefer_intrinsics + working-directory: ./test_vectors # Test b3sum. - name: test b3sum run: cargo test diff --git a/Cargo.toml b/Cargo.toml index 4d8e7cf..1a659ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,21 @@ edition = "2018" [features] default = ["std"] -# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU -# feature detection. A binary with "c_avx512" on is still cross-platform. This -# feature has no effect on non-x86. -c_avx512 = [] +# The "c" feature includes C and assembly SIMD implementations of the +# compression function for x86 platforms, called via FFI. (Currently it has no +# effect on other platforms.) This requires a C toolchain on the build machine. +# This is necessary for AVX-512 support, which is not yet stable in Rust, and +# the assembly implementations also perform better than those using Rust/LLVM +# intrinsics. As with the Rust implementations, these C and assembly +# implementations participate in runtime CPU feature detection, and the +# resulting binary is portable. +c = [] +# Normally x86-64 builds prefer assembly implementations over C intrinsics. The +# assembly implementations perform better, perform most consistently across +# compilers, and are much faster to build. However, this feature makes the +# build use the C intrinsics implementations instead. This is mainly for +# testing purposes, and most callers will not want to use it. 
+c_prefer_intrinsics = [] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. diff --git a/README.md b/README.md index 8f881dd..a8ad4c7 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,18 @@ with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which - includes optimized SIMD implementations, with dynamic CPU feature - detection on x86. SSE4.1 and AVX2 support are implemented in Rust, - while AVX-512 and ARM NEON support are imported from the C - implementation and controlled by the `c_avx512` and `c_neon` features. - Multi-threading is implemented with - [Rayon](https://github.com/rayon-rs/rayon) and controlled by the - `rayon` feature. + includes optimized SIMD implementations, with runtime CPU feature + detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c` + feature enables C/assembly implementations and AVX-512 support. The + `c_neon` feature enables ARM NEON support. Multi-threading is also + supported, and the `rayon` feature provides a + [Rayon](https://github.com/rayon-rs/rayon)-based implementation. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. You can install it from [crates.io](https://crates.io/crates/b3sum) with `cargo install - b3sum`. It enables the multi-threading and AVX-512 features of the - `blake3` crate by default. + b3sum`. It enables the `rayon` and `c` features of the `blake3` crate + by default. * The [C implementation](c), which like the Rust implementation includes SIMD code and dynamic CPU feature detection on x86. 
Unlike the Rust @@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage -This repository provides the `b3sum` command line utility and the -`blake3` Rust crate. - ### The `b3sum` utility The `b3sum` utility allows you to process files and data from standard diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index c4c8068..aaa23e9 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -9,8 +9,8 @@ readme = "README.md" edition = "2018" [features] -default = ["c_avx512", "rayon"] -c_avx512 = ["blake3/c_avx512"] +default = ["c", "rayon"] +c = ["blake3/c"] c_neon = ["blake3/c_neon"] rayon = ["blake3/rayon", "memmap"] diff --git a/benches/bench.rs b/benches/bench.rs index 0d73970..70be967 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -4,7 +4,7 @@ extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; -use blake3::platform::MAX_SIMD_DEGREE; +use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use rand::prelude::*; use test::Bencher; @@ -48,173 +48,149 @@ impl RandomInput { } } -type CompressInPlaceFn = - unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { +fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); - unsafe { - b.iter(|| f(&mut state, input, 64 as u8, 0, 0)); - } + b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn(b, blake3::portable::compress_in_place); + bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = 
Platform::sse41() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::sse41::compress_in_place); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_single_compression_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::c_avx512::compress_in_place); } -type HashManyFn = unsafe fn( - inputs: &[&A], - key: &[u32; 8], - counter: u64, - increment_counter: blake3::IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) { +fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::Yes, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::Yes, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::sse41::hash_many, 
blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_chunks_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } #[bench] #[cfg(feature = "c_neon")] fn bench_many_chunks_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. - bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_chunks_fn(b, platform); + } } // TODO: When we get const generics we can unify this with the chunks code. 
-fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) { +fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::No, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::No, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_parents_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } 
#[bench] #[cfg(feature = "c_neon")] fn bench_many_parents_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. - bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_parents_fn(b, platform); + } } fn bench_atonce(b: &mut Bencher, len: usize) { diff --git a/build.rs b/build.rs index 67fe3fc..c5a662d 100644 --- a/build.rs +++ b/build.rs @@ -13,6 +13,11 @@ fn is_x86_64() -> bool { target_components()[0] == "x86_64" } +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + fn is_armv7() -> bool { target_components()[0] == "armv7" } @@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool { && target_components()[3] == "msvc" } +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { @@ -37,16 +49,16 @@ fn new_build() -> cc::Build { } const WINDOWS_MSVC_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your version of the MSVC C compiler does -not support the "/arch:AVX512" flag. If you are building the "b3sum" or -"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" -flag. (Note that this also disables other default features like Rayon-based +The "c" feature is enabled, but your version of the MSVC C compiler does not +support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin" +crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. +(Note that this also disables other default features like Rayon-based multithreading, which you can re-enable with "--features=rayon".) Other crates might or might not support this workaround. 
"#; const GNU_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your C compiler does not support the +The "c" feature is enabled, but your C compiler does not support the "-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also disables other default features like Rayon-based multithreading, which you can @@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) { } fn main() -> Result<(), Box> { - // "c_avx512' is a no-op for non-x86_64 targets. It also participates in - // dynamic CPU feature detection, so it's generally safe to enable. - // However, it probably won't build in some older environments without - // AVX-512 support in the C compiler, and it's disabled by default for that - // reason. - if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() { - let mut build = new_build(); - check_for_avx512_compiler_support(&build); - build.file("c/blake3_avx512.c"); - if is_windows_msvc() { - // Note that a lot of versions of MSVC don't support /arch:AVX512, - // and they'll discard it with a warning, hopefully leading to a - // build error. - build.flag("/arch:AVX512"); + if defined("CARGO_FEATURE_C") { + if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "c_prefer_intrinsics" feature is enabled. 
+ if is_windows_msvc() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx2-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx512-x86_64-windows-msvc.asm"); + build.compile("blake3_asm"); + } else if is_windows_gnu() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-gnu.S"); + build.file("c/blake3-avx2-x86_64-windows-gnu.S"); + build.file("c/blake3-avx512-x86_64-windows-gnu.S"); + build.compile("blake3_asm"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-unix.S"); + build.file("c/blake3-avx2-x86_64-unix.S"); + build.file("c/blake3-avx512-x86_64-unix.S"); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "c_prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. 
+ + let mut sse41_build = new_build(); + sse41_build.file("c/blake3_sse41.c"); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file("c/blake3_avx2.c"); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + check_for_avx512_compiler_support(&avx512_build); + avx512_build.file("c/blake3_avx512.c"); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. + avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); } else { - build.flag("-mavx512f"); - build.flag("-mavx512vl"); + // Currently no effect for non-x86 platforms. } - build.compile("blake3_avx512"); } if defined("CARGO_FEATURE_C_NEON") { diff --git a/src/c_avx2.rs b/src/c_avx2.rs new file mode 100644 index 0000000..d805e86 --- /dev/null +++ b/src/c_avx2.rs @@ -0,0 +1,63 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. 
Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/c_avx512.rs b/src/c_avx512.rs index f20de2c..c1b9f64 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 16; - // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_in_place( cv: &mut CVWords, @@ -91,7 +89,6 @@ pub mod ffi { flags_end: u8, out: *mut u8, ); - } } diff --git a/src/c_neon.rs b/src/c_neon.rs index 34ef074..77b9654 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 4; - // Unsafe because this may only be called on platforms supporting NEON. pub unsafe fn hash_many>( inputs: &[&A], diff --git a/src/c_sse41.rs b/src/c_sse41.rs new file mode 100644 index 0000000..0b64c90 --- /dev/null +++ b/src/c_sse41.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. 
+ assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7fa3510..58d2dbe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -39,24 +39,32 @@ mod test; #[doc(hidden)] pub mod guts; -// These modules are pub for benchmarks only. They are not stable. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod avx2; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod c_avx512; -#[cfg(feature = "c_neon")] -#[doc(hidden)] -pub mod c_neon; +// The platform module is pub for benchmarks only. It is not stable. #[doc(hidden)] pub mod platform; -#[doc(hidden)] -pub mod portable; + +// Platform-specific implementations of the compression function. 
+mod portable; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod sse41; +cfg_if::cfg_if! { + if #[cfg(feature = "c")] { + #[path = "c_sse41.rs"] + mod sse41; + #[path = "c_avx2.rs"] + mod avx2; + #[path = "c_avx512.rs"] + mod avx512; + } else { + #[path = "rust_sse41.rs"] + mod sse41; + #[path = "rust_avx2.rs"] + mod avx2; + // Stable Rust does not currently support AVX-512. + } +} +#[cfg(feature = "c_neon")] +#[path = "c_neon.rs"] +mod neon; pub mod traits; diff --git a/src/platform.rs b/src/platform.rs index b453a6e..163cbbb 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,18 +1,10 @@ use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::c_avx512; -#[cfg(feature = "c_neon")] -use crate::c_neon; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::{avx2, sse41}; - cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; @@ -32,7 +24,7 @@ cfg_if::cfg_if! { cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! 
{ - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; @@ -52,7 +44,7 @@ pub enum Platform { SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(feature = "c_neon")] @@ -64,7 +56,7 @@ impl Platform { pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] { if avx512_detected() { return Platform::AVX512; @@ -93,7 +85,7 @@ impl Platform { Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(feature = "c_neon")] @@ -103,7 +95,7 @@ impl Platform { degree } - pub(crate) fn compress_in_place( + pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], @@ -116,13 +108,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_in_place(cv, block, block_len, counter, flags) + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_in_place(cv, block, block_len, counter, flags) + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. 
#[cfg(feature = "c_neon")] @@ -130,7 +122,7 @@ impl Platform { } } - pub(crate) fn compress_xof( + pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], @@ -143,13 +135,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_xof(cv, block, block_len, counter, flags) + crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_xof(cv, block, block_len, counter, flags) + crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(feature = "c_neon")] @@ -167,7 +159,7 @@ impl Platform { // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. - pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>( + pub fn hash_many<A: arrayvec::Array<Item = u8>>( &self, inputs: &[&A], key: &CVWords, @@ -192,7 +184,7 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { - sse41::hash_many( + crate::sse41::hash_many( inputs, key, counter, @@ -206,7 +198,7 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { - avx2::hash_many( + crate::avx2::hash_many( inputs, key, counter, @@ -218,10 +210,10 @@ impl Platform { ) }, // Safe because detect() checked for platform support. 
- #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::hash_many( + crate::avx512::hash_many( inputs, key, counter, @@ -235,7 +227,7 @@ impl Platform { // Assumed to be safe if the "c_neon" feature is on. #[cfg(feature = "c_neon")] Platform::NEON => unsafe { - c_neon::hash_many( + crate::neon::hash_many( inputs, key, counter, @@ -248,11 +240,52 @@ impl Platform { }, } } + + // Explicit platform constructors, for benchmarks. + + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option<Self> { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option<Self> { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(feature = "c")] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option<Self> { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(feature = "c_neon")] + pub fn neon() -> Option<Self> { + // Assumed to be safe if the "c_neon" feature is on. NEON is an ARM + // extension, so this is gated on the feature alone, not on x86 targets. + Some(Self::NEON) + } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. 
-#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx512_detected() -> bool { diff --git a/src/avx2.rs b/src/rust_avx2.rs similarity index 100% rename from src/avx2.rs rename to src/rust_avx2.rs diff --git a/src/sse41.rs b/src/rust_sse41.rs similarity index 100% rename from src/sse41.rs rename to src/rust_sse41.rs diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml index 007d1c8..2a90e39 100644 --- a/test_vectors/Cargo.toml +++ b/test_vectors/Cargo.toml @@ -3,10 +3,16 @@ name = "test_vectors" version = "0.0.0" edition = "2018" +[features] +default = [] +c = ["blake3/c"] +c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"] +c_neon = ["blake3/c_neon"] + [dependencies] # If you ever change these path dependencies, you'll probably need to update # cross_test.sh, or CI will break. I'm sorry >.< -blake3 = { path = "../", features=["c_avx512"] } +blake3 = { path = "../" } hex = "0.4.0" reference_impl = { path = "../reference_impl" } serde = { version = "1.0", features = ["derive"] } diff --git a/test_vectors/cross_test.sh b/test_vectors/cross_test.sh index 1f6a34b..c4d280c 100755 --- a/test_vectors/cross_test.sh +++ b/test_vectors/cross_test.sh @@ -19,7 +19,7 @@ mv blake3/test_vectors . mv blake3/reference_impl test_vectors mv blake3 test_vectors cd test_vectors -sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml +sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml cross test "$@"