1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-08 19:06:02 +02:00

integrate assembly implementations into the blake3 crate

This commit is contained in:
Jack O'Connor 2020-02-11 14:13:30 -05:00
parent b6b3c27824
commit efbfa0463c
16 changed files with 465 additions and 192 deletions

View File

@ -24,22 +24,30 @@ jobs:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal profile: minimal
override: true override: true
# Default tests. # Default tests plus Rayon.
- run: cargo test - run: cargo test --features=rayon
# No-default-features tests. # no_std tests.
- run: cargo test --no-default-features - run: cargo test --no-default-features
# More features tests. Note that "c_avx512" participates in dynamic feature # Test the x86 assembly implementations. Use -vv to log compiler commands.
# detection, so it'll be built, but it probably won't run. - run: cargo test --features=c -vv
- run: cargo test --features=c_avx512,rayon # Test the C intrinsics implementations. Use -vv to log compiler commands.
- run: cargo test --features=c,c_prefer_intrinsics -vv
# Test release mode. This does more iterations in test_fuzz_hasher. # Test release mode. This does more iterations in test_fuzz_hasher.
- run: cargo test --release - run: cargo test --release
# Test benchmarks. Nightly only. # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
- run: cargo test --benches - run: cargo test --benches --features=c
if: matrix.rust_version == 'nightly' env:
RUSTC_BOOTSTRAP: 1
# Test vectors. # Test vectors.
- name: test vectors - name: test vectors
run: cargo test run: cargo test
working-directory: ./test_vectors working-directory: ./test_vectors
- name: test vectors
run: cargo test --features=c
working-directory: ./test_vectors
- name: test vectors
run: cargo test --features=c,c_prefer_intrinsics
working-directory: ./test_vectors
# Test b3sum. # Test b3sum.
- name: test b3sum - name: test b3sum
run: cargo test run: cargo test

View File

@ -11,10 +11,21 @@ edition = "2018"
[features] [features]
default = ["std"] default = ["std"]
# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU # The "c" feature includes C and assembly SIMD implementations of the
# feature detection. A binary with "c_avx512" on is still cross-platform. This # compression function for x86 platforms, called via FFI. (Currently it has no
# feature has no effect on non-x86. # effect on other platforms.) This requires a C toolchain on the build machine.
c_avx512 = [] # This is necessary for AVX-512 support, which is not yet stable in Rust, and
# the assembly implementations also perform better than those using Rust/LLVM
# intrinsics. As with the Rust implementations, these C and assembly
# implementations participate in runtime CPU feature detection, and the
# resulting binary is portable.
c = []
# Normally x86-64 builds prefer assembly implementations over C intrinsics. The
# assembly implementations perform better, perform most consistently across
# compilers, and are much faster to build. However, this feature makes the
# build use the C intrinsics implementations instead. This is mainly for
# testing purposes, and most callers will not want to use it.
c_prefer_intrinsics = []
# The NEON implementation does not participate in dynamic feature detection, # The NEON implementation does not participate in dynamic feature detection,
# which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note # which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note
# that AArch64 always supports NEON, but support on ARMv7 varies. # that AArch64 always supports NEON, but support on ARMv7 varies.

View File

@ -33,19 +33,18 @@ with BLAKE3.
This repository is the official implementation of BLAKE3. It includes: This repository is the official implementation of BLAKE3. It includes:
* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
includes optimized SIMD implementations, with dynamic CPU feature includes optimized SIMD implementations, with runtime CPU feature
detection on x86. SSE4.1 and AVX2 support are implemented in Rust, detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c`
while AVX-512 and ARM NEON support are imported from the C feature enables C/assembly implementations and AVX-512 support. The
implementation and controlled by the `c_avx512` and `c_neon` features. `c_neon` feature enables ARM NEON support. Multi-threading is also
Multi-threading is implemented with supported, and the `rayon` feature provides a
[Rayon](https://github.com/rayon-rs/rayon) and controlled by the [Rayon](https://github.com/rayon-rs/rayon)-based implementation.
`rayon` feature.
* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
provides a command line interface. You can install it from provides a command line interface. You can install it from
[crates.io](https://crates.io/crates/b3sum) with `cargo install [crates.io](https://crates.io/crates/b3sum) with `cargo install
b3sum`. It enables the multi-threading and AVX-512 features of the b3sum`. It enables the `rayon` and `c` features of the `blake3` crate
`blake3` crate by default. by default.
* The [C implementation](c), which like the Rust implementation includes * The [C implementation](c), which like the Rust implementation includes
SIMD code and dynamic CPU feature detection on x86. Unlike the Rust SIMD code and dynamic CPU feature detection on x86. Unlike the Rust
@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
## Usage ## Usage
This repository provides the `b3sum` command line utility and the
`blake3` Rust crate.
### The `b3sum` utility ### The `b3sum` utility
The `b3sum` utility allows you to process files and data from standard The `b3sum` utility allows you to process files and data from standard

View File

@ -9,8 +9,8 @@ readme = "README.md"
edition = "2018" edition = "2018"
[features] [features]
default = ["c_avx512", "rayon"] default = ["c", "rayon"]
c_avx512 = ["blake3/c_avx512"] c = ["blake3/c"]
c_neon = ["blake3/c_neon"] c_neon = ["blake3/c_neon"]
rayon = ["blake3/rayon", "memmap"] rayon = ["blake3/rayon", "memmap"]

View File

@ -4,7 +4,7 @@ extern crate test;
use arrayref::array_ref; use arrayref::array_ref;
use arrayvec::ArrayVec; use arrayvec::ArrayVec;
use blake3::platform::MAX_SIMD_DEGREE; use blake3::platform::{Platform, MAX_SIMD_DEGREE};
use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use rand::prelude::*; use rand::prelude::*;
use test::Bencher; use test::Bencher;
@ -48,173 +48,149 @@ impl RandomInput {
} }
} }
type CompressInPlaceFn = fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
let mut state = [1u32; 8]; let mut state = [1u32; 8];
let mut r = RandomInput::new(b, 64); let mut r = RandomInput::new(b, 64);
let input = array_ref!(r.get(), 0, 64); let input = array_ref!(r.get(), 0, 64);
unsafe { b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
b.iter(|| f(&mut state, input, 64 as u8, 0, 0));
}
} }
#[bench] #[bench]
fn bench_single_compression_portable(b: &mut Bencher) { fn bench_single_compression_portable(b: &mut Bencher) {
bench_single_compression_fn(b, blake3::portable::compress_in_place); bench_single_compression_fn(b, Platform::portable());
} }
#[bench] #[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) { fn bench_single_compression_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() { if let Some(platform) = Platform::sse41() {
return; bench_single_compression_fn(b, platform);
} }
bench_single_compression_fn(b, blake3::sse41::compress_in_place);
} }
#[bench] #[bench]
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
fn bench_single_compression_avx512(b: &mut Bencher) { fn bench_single_compression_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() { if let Some(platform) = Platform::avx512() {
return; bench_single_compression_fn(b, platform);
} }
bench_single_compression_fn(b, blake3::c_avx512::compress_in_place);
} }
type HashManyFn<A> = unsafe fn( fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
inputs: &[&A], let degree = platform.simd_degree();
key: &[u32; 8],
counter: u64,
increment_counter: blake3::IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
);
fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) {
let mut inputs = Vec::new(); let mut inputs = Vec::new();
for _ in 0..degree { for _ in 0..degree {
inputs.push(RandomInput::new(b, CHUNK_LEN)); inputs.push(RandomInput::new(b, CHUNK_LEN));
} }
unsafe { b.iter(|| {
b.iter(|| { let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs .iter_mut()
.iter_mut() .take(degree)
.take(degree) .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
.map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) .collect();
.collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many(
f( &input_arrays[..],
&input_arrays[..], &[0; 8],
&[0; 8], 0,
0, blake3::IncrementCounter::Yes,
blake3::IncrementCounter::Yes, 0,
0, 0,
0, 0,
0, &mut out,
&mut out, );
); });
});
}
} }
#[bench] #[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) { fn bench_many_chunks_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() { if let Some(platform) = Platform::sse41() {
return; bench_many_chunks_fn(b, platform);
} }
bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
} }
#[bench] #[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx2(b: &mut Bencher) { fn bench_many_chunks_avx2(b: &mut Bencher) {
if !blake3::platform::avx2_detected() { if let Some(platform) = Platform::avx2() {
return; bench_many_chunks_fn(b, platform);
} }
bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
} }
#[bench] #[bench]
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
fn bench_many_chunks_avx512(b: &mut Bencher) { fn bench_many_chunks_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() { if let Some(platform) = Platform::avx512() {
return; bench_many_chunks_fn(b, platform);
} }
bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
} }
#[bench] #[bench]
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
fn bench_many_chunks_neon(b: &mut Bencher) { fn bench_many_chunks_neon(b: &mut Bencher) {
// When "c_neon" is on, NEON support is assumed. if let Some(platform) = Platform::neon() {
bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); bench_many_chunks_fn(b, platform);
}
} }
// TODO: When we get const generics we can unify this with the chunks code. // TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) { fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new(); let mut inputs = Vec::new();
for _ in 0..degree { for _ in 0..degree {
inputs.push(RandomInput::new(b, BLOCK_LEN)); inputs.push(RandomInput::new(b, BLOCK_LEN));
} }
unsafe { b.iter(|| {
b.iter(|| { let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs .iter_mut()
.iter_mut() .take(degree)
.take(degree) .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
.map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) .collect();
.collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many(
f( &input_arrays[..],
&input_arrays[..], &[0; 8],
&[0; 8], 0,
0, blake3::IncrementCounter::No,
blake3::IncrementCounter::No, 0,
0, 0,
0, 0,
0, &mut out,
&mut out, );
); });
});
}
} }
#[bench] #[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) { fn bench_many_parents_sse41(b: &mut Bencher) {
if !blake3::platform::sse41_detected() { if let Some(platform) = Platform::sse41() {
return; bench_many_parents_fn(b, platform);
} }
bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
} }
#[bench] #[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx2(b: &mut Bencher) { fn bench_many_parents_avx2(b: &mut Bencher) {
if !blake3::platform::avx2_detected() { if let Some(platform) = Platform::avx2() {
return; bench_many_parents_fn(b, platform);
} }
bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
} }
#[bench] #[bench]
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
fn bench_many_parents_avx512(b: &mut Bencher) { fn bench_many_parents_avx512(b: &mut Bencher) {
if !blake3::platform::avx512_detected() { if let Some(platform) = Platform::avx512() {
return; bench_many_parents_fn(b, platform);
} }
bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
} }
#[bench] #[bench]
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
fn bench_many_parents_neon(b: &mut Bencher) { fn bench_many_parents_neon(b: &mut Bencher) {
// When "c_neon" is on, NEON support is assumed. if let Some(platform) = Platform::neon() {
bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); bench_many_parents_fn(b, platform);
}
} }
fn bench_atonce(b: &mut Bencher, len: usize) { fn bench_atonce(b: &mut Bencher, len: usize) {

107
build.rs
View File

@ -13,6 +13,11 @@ fn is_x86_64() -> bool {
target_components()[0] == "x86_64" target_components()[0] == "x86_64"
} }
// Reports whether the first component of the target triple is one of the
// 32-bit x86 architecture names this build script recognizes.
fn is_x86_32() -> bool {
    let arch = &target_components()[0];
    ["i386", "i586", "i686"].iter().any(|&name| arch == name)
}
fn is_armv7() -> bool { fn is_armv7() -> bool {
target_components()[0] == "armv7" target_components()[0] == "armv7"
} }
@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool {
&& target_components()[3] == "msvc" && target_components()[3] == "msvc"
} }
fn is_windows_gnu() -> bool {
// Some targets are only two components long, so check in steps.
target_components()[1] == "pc"
&& target_components()[2] == "windows"
&& target_components()[3] == "gnu"
}
fn new_build() -> cc::Build { fn new_build() -> cc::Build {
let mut build = cc::Build::new(); let mut build = cc::Build::new();
if !is_windows_msvc() { if !is_windows_msvc() {
@ -37,16 +49,16 @@ fn new_build() -> cc::Build {
} }
const WINDOWS_MSVC_ERROR: &str = r#" const WINDOWS_MSVC_ERROR: &str = r#"
The "c_avx512" feature is enabled, but your version of the MSVC C compiler does The "c" feature is enabled, but your version of the MSVC C compiler does not
not support the "/arch:AVX512" flag. If you are building the "b3sum" or support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin"
"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" crates, you can disable AVX-512 with Cargo's "--no-default-features" flag.
flag. (Note that this also disables other default features like Rayon-based (Note that this also disables other default features like Rayon-based
multithreading, which you can re-enable with "--features=rayon".) Other crates multithreading, which you can re-enable with "--features=rayon".) Other crates
might or might not support this workaround. might or might not support this workaround.
"#; "#;
const GNU_ERROR: &str = r#" const GNU_ERROR: &str = r#"
The "c_avx512" feature is enabled, but your C compiler does not support the The "c" feature is enabled, but your C compiler does not support the
"-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can "-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can
disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also
disables other default features like Rayon-based multithreading, which you can disables other default features like Rayon-based multithreading, which you can
@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) {
} }
fn main() -> Result<(), Box<dyn std::error::Error>> { fn main() -> Result<(), Box<dyn std::error::Error>> {
// "c_avx512' is a no-op for non-x86_64 targets. It also participates in if defined("CARGO_FEATURE_C") {
// dynamic CPU feature detection, so it's generally safe to enable. if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") {
// However, it probably won't build in some older environments without // On 64-bit, use the assembly implementations, unless the
// AVX-512 support in the C compiler, and it's disabled by default for that // "c_prefer_intrinsics" feature is enabled.
// reason. if is_windows_msvc() {
if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() { let mut build = new_build();
let mut build = new_build(); build.file("c/blake3-sse41-x86_64-windows-msvc.asm");
check_for_avx512_compiler_support(&build); build.file("c/blake3-avx2-x86_64-windows-msvc.asm");
build.file("c/blake3_avx512.c"); build.file("c/blake3-avx512-x86_64-windows-msvc.asm");
if is_windows_msvc() { build.compile("blake3_asm");
// Note that a lot of versions of MSVC don't support /arch:AVX512, } else if is_windows_gnu() {
// and they'll discard it with a warning, hopefully leading to a let mut build = new_build();
// build error. build.file("c/blake3-sse41-x86_64-windows-gnu.S");
build.flag("/arch:AVX512"); build.file("c/blake3-avx2-x86_64-windows-gnu.S");
build.file("c/blake3-avx512-x86_64-windows-gnu.S");
build.compile("blake3_asm");
} else {
// All non-Windows implementations are assumed to support
// Linux-style assembly. These files do contain a small
// explicit workaround for macOS also.
let mut build = new_build();
build.file("c/blake3-sse41-x86_64-unix.S");
build.file("c/blake3-avx2-x86_64-unix.S");
build.file("c/blake3-avx512-x86_64-unix.S");
build.compile("blake3_asm");
}
} else if is_x86_64() || is_x86_32() {
// Assembly implementations are only for 64-bit. On 32-bit, or if
// the "c_prefer_intrinsics" feature is enabled, use the
// intrinsics-based C implementations. These each need to be
// compiled separately, with the corresponding instruction set
// extension explicitly enabled in the compiler.
let mut sse41_build = new_build();
sse41_build.file("c/blake3_sse41.c");
if is_windows_msvc() {
// /arch:SSE2 is the default on x86 and undefined on x86_64:
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
// It also includes SSE4.1 intrinsics:
// https://stackoverflow.com/a/32183222/823869
} else {
sse41_build.flag("-msse4.1");
}
sse41_build.compile("blake3_sse41");
let mut avx2_build = new_build();
avx2_build.file("c/blake3_avx2.c");
if is_windows_msvc() {
avx2_build.flag("/arch:AVX2");
} else {
avx2_build.flag("-mavx2");
}
avx2_build.compile("blake3_avx2");
let mut avx512_build = new_build();
check_for_avx512_compiler_support(&avx512_build);
avx512_build.file("c/blake3_avx512.c");
if is_windows_msvc() {
// Note that a lot of versions of MSVC don't support /arch:AVX512,
// and they'll discard it with a warning, hopefully leading to a
// build error.
avx512_build.flag("/arch:AVX512");
} else {
avx512_build.flag("-mavx512f");
avx512_build.flag("-mavx512vl");
}
avx512_build.compile("blake3_avx512");
} else { } else {
build.flag("-mavx512f"); // Currently no effect for non-x86 platforms.
build.flag("-mavx512vl");
} }
build.compile("blake3_avx512");
} }
if defined("CARGO_FEATURE_C_NEON") { if defined("CARGO_FEATURE_C_NEON") {

63
src/c_avx2.rs Normal file
View File

@ -0,0 +1,63 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Note that there is no AVX2 implementation of compress_in_place or
// compress_xof.
/// Forwards a batch of inputs to the AVX2 `hash_many` routine exposed over
/// the C FFI, passing `out` as the output buffer.
///
/// # Safety
///
/// This may only be called on platforms supporting AVX2.
///
/// # Panics
///
/// Panics if `out` is shorter than `inputs.len() * OUT_LEN` bytes.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    inputs: &[&A],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // The Rust hash_many implementations bounds-check the `out` array, but
    // the C implementations don't. Even though this function is unsafe,
    // enforce the bound before crossing the FFI boundary.
    let num_inputs = inputs.len();
    assert!(out.len() >= num_inputs * OUT_LEN);

    let input_ptrs = inputs.as_ptr() as *const *const u8;
    let blocks_per_input = A::CAPACITY / BLOCK_LEN;
    ffi::blake3_hash_many_avx2(
        input_ptrs,
        num_inputs,
        blocks_per_input,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
}
// Raw declaration of the AVX2 hash_many routine. The symbol is resolved at
// link time from non-Rust code, so this signature has to stay in sync with
// that implementation.
pub mod ffi {
    extern "C" {
        // Arguments as supplied by the `hash_many` wrapper in this file:
        // `inputs`/`num_inputs` are the pointer and length of the slice of
        // input references, `blocks` is `A::CAPACITY / BLOCK_LEN`, `key`
        // points at the `CVWords` key, `increment_counter` is the result of
        // `IncrementCounter::yes()`, and `out` points at the output buffer.
        // The wrapper asserts `out.len() >= num_inputs * OUT_LEN` because the
        // C side does no bounds checking of its own.
        pub fn blake3_hash_many_avx2(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
#[cfg(test)]
mod test {
    use super::*;

    // Runs the shared hash_many test helper against this module's wrapper,
    // but only when the running CPU reports AVX2 support; otherwise the test
    // is a no-op.
    #[test]
    fn test_hash_many() {
        if crate::platform::avx2_detected() {
            crate::test::test_hash_many_fn(hash_many, hash_many);
        }
    }
}

View File

@ -1,7 +1,5 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
pub const DEGREE: usize = 16;
// Unsafe because this may only be called on platforms supporting AVX-512. // Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_in_place( pub unsafe fn compress_in_place(
cv: &mut CVWords, cv: &mut CVWords,
@ -91,7 +89,6 @@ pub mod ffi {
flags_end: u8, flags_end: u8,
out: *mut u8, out: *mut u8,
); );
} }
} }

View File

@ -1,7 +1,5 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
pub const DEGREE: usize = 4;
// Unsafe because this may only be called on platforms supporting NEON. // Unsafe because this may only be called on platforms supporting NEON.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A], inputs: &[&A],

114
src/c_sse41.rs Normal file
View File

@ -0,0 +1,114 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
/// Calls the SSE4.1 `compress_in_place` routine exposed over the C FFI,
/// handing it a mutable pointer to `cv` and a pointer to `block`.
///
/// # Safety
///
/// This may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let cv_ptr = cv.as_mut_ptr();
    let block_ptr = block.as_ptr();
    ffi::blake3_compress_in_place_sse41(cv_ptr, block_ptr, block_len, counter, flags)
}
/// Calls the SSE4.1 `compress_xof` routine exposed over the C FFI and returns
/// the 64-byte buffer it was given to fill.
///
/// # Safety
///
/// This may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    // Zero-initialized buffer handed to the FFI call as its output argument.
    let mut output = [0u8; 64];
    let output_ptr = output.as_mut_ptr();
    ffi::blake3_compress_xof_sse41(
        cv.as_ptr(),
        block.as_ptr(),
        block_len,
        counter,
        flags,
        output_ptr,
    );
    output
}
/// Forwards a batch of inputs to the SSE4.1 `hash_many` routine exposed over
/// the C FFI, passing `out` as the output buffer.
///
/// # Safety
///
/// This may only be called on platforms supporting SSE4.1.
///
/// # Panics
///
/// Panics if `out` is shorter than `inputs.len() * OUT_LEN` bytes.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    inputs: &[&A],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // The Rust hash_many implementations bounds-check the `out` array, but
    // the C implementations don't. Even though this function is unsafe,
    // enforce the bound before crossing the FFI boundary.
    let num_inputs = inputs.len();
    assert!(out.len() >= num_inputs * OUT_LEN);

    let input_ptrs = inputs.as_ptr() as *const *const u8;
    let blocks_per_input = A::CAPACITY / BLOCK_LEN;
    ffi::blake3_hash_many_sse41(
        input_ptrs,
        num_inputs,
        blocks_per_input,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
}
// Raw declarations of the SSE4.1 routines. The symbols are resolved at link
// time from non-Rust code, so these signatures have to stay in sync with that
// implementation.
pub mod ffi {
    extern "C" {
        // Called by `compress_in_place` in this file with a mutable pointer
        // to the caller's `CVWords` and a pointer to a `BLOCK_LEN`-byte block.
        pub fn blake3_compress_in_place_sse41(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        // Called by `compress_xof` in this file; `out` there points at a
        // 64-byte array that the wrapper returns to its caller.
        pub fn blake3_compress_xof_sse41(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        // Arguments as supplied by the `hash_many` wrapper in this file:
        // `inputs`/`num_inputs` are the pointer and length of the slice of
        // input references, `blocks` is `A::CAPACITY / BLOCK_LEN`, and
        // `increment_counter` is the result of `IncrementCounter::yes()`.
        // The wrapper asserts `out.len() >= num_inputs * OUT_LEN` because the
        // C side does no bounds checking of its own.
        pub fn blake3_hash_many_sse41(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
#[cfg(test)]
mod test {
    use super::*;

    // Both tests are no-ops unless the running CPU reports SSE4.1 support.

    #[test]
    fn test_compress() {
        if crate::platform::sse41_detected() {
            crate::test::test_compress_fn(compress_in_place, compress_xof);
        }
    }

    #[test]
    fn test_hash_many() {
        if crate::platform::sse41_detected() {
            crate::test::test_hash_many_fn(hash_many, hash_many);
        }
    }
}

View File

@ -39,24 +39,32 @@ mod test;
#[doc(hidden)] #[doc(hidden)]
pub mod guts; pub mod guts;
// These modules are pub for benchmarks only. They are not stable. // The platform module is pub for benchmarks only. It is not stable.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[doc(hidden)]
pub mod avx2;
#[cfg(feature = "c_avx512")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[doc(hidden)]
pub mod c_avx512;
#[cfg(feature = "c_neon")]
#[doc(hidden)]
pub mod c_neon;
#[doc(hidden)] #[doc(hidden)]
pub mod platform; pub mod platform;
#[doc(hidden)]
pub mod portable; // Platform-specific implementations of the compression function.
mod portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[doc(hidden)] cfg_if::cfg_if! {
pub mod sse41; if #[cfg(feature = "c")] {
#[path = "c_sse41.rs"]
mod sse41;
#[path = "c_avx2.rs"]
mod avx2;
#[path = "c_avx512.rs"]
mod avx512;
} else {
#[path = "rust_sse41.rs"]
mod sse41;
#[path = "rust_avx2.rs"]
mod avx2;
// Stable Rust does not currently support AVX-512.
}
}
#[cfg(feature = "c_neon")]
#[path = "c_neon.rs"]
mod neon;
pub mod traits; pub mod traits;

View File

@ -1,18 +1,10 @@
use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref}; use arrayref::{array_mut_ref, array_ref};
#[cfg(feature = "c_avx512")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::c_avx512;
#[cfg(feature = "c_neon")]
use crate::c_neon;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::{avx2, sse41};
cfg_if::cfg_if! { cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! { cfg_if::cfg_if! {
if #[cfg(feature = "c_avx512")] { if #[cfg(feature = "c")] {
pub const MAX_SIMD_DEGREE: usize = 16; pub const MAX_SIMD_DEGREE: usize = 16;
} else { } else {
pub const MAX_SIMD_DEGREE: usize = 8; pub const MAX_SIMD_DEGREE: usize = 8;
@ -32,7 +24,7 @@ cfg_if::cfg_if! {
cfg_if::cfg_if! { cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! { cfg_if::cfg_if! {
if #[cfg(feature = "c_avx512")] { if #[cfg(feature = "c")] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 16; pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
} else { } else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 8; pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
@ -52,7 +44,7 @@ pub enum Platform {
SSE41, SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2, AVX2,
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX512, AVX512,
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
@ -64,7 +56,7 @@ impl Platform {
pub fn detect() -> Self { pub fn detect() -> Self {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ {
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
{ {
if avx512_detected() { if avx512_detected() {
return Platform::AVX512; return Platform::AVX512;
@ -93,7 +85,7 @@ impl Platform {
Platform::SSE41 => 4, Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8, Platform::AVX2 => 8,
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => 16, Platform::AVX512 => 16,
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
@ -103,7 +95,7 @@ impl Platform {
degree degree
} }
pub(crate) fn compress_in_place( pub fn compress_in_place(
&self, &self,
cv: &mut CVWords, cv: &mut CVWords,
block: &[u8; BLOCK_LEN], block: &[u8; BLOCK_LEN],
@ -116,13 +108,13 @@ impl Platform {
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe { Platform::SSE41 | Platform::AVX2 => unsafe {
sse41::compress_in_place(cv, block, block_len, counter, flags) crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
}, },
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe { Platform::AVX512 => unsafe {
c_avx512::compress_in_place(cv, block, block_len, counter, flags) crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
}, },
// No NEON compress_in_place() implementation yet. // No NEON compress_in_place() implementation yet.
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
@ -130,7 +122,7 @@ impl Platform {
} }
} }
pub(crate) fn compress_xof( pub fn compress_xof(
&self, &self,
cv: &CVWords, cv: &CVWords,
block: &[u8; BLOCK_LEN], block: &[u8; BLOCK_LEN],
@ -143,13 +135,13 @@ impl Platform {
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe { Platform::SSE41 | Platform::AVX2 => unsafe {
sse41::compress_xof(cv, block, block_len, counter, flags) crate::sse41::compress_xof(cv, block, block_len, counter, flags)
}, },
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe { Platform::AVX512 => unsafe {
c_avx512::compress_xof(cv, block, block_len, counter, flags) crate::avx512::compress_xof(cv, block, block_len, counter, flags)
}, },
// No NEON compress_xof() implementation yet. // No NEON compress_xof() implementation yet.
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
@ -167,7 +159,7 @@ impl Platform {
// after every block, there's a small but measurable performance loss. // after every block, there's a small but measurable performance loss.
// Compressing chunks with a dedicated loop avoids this. // Compressing chunks with a dedicated loop avoids this.
pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>( pub fn hash_many<A: arrayvec::Array<Item = u8>>(
&self, &self,
inputs: &[&A], inputs: &[&A],
key: &CVWords, key: &CVWords,
@ -192,7 +184,7 @@ impl Platform {
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => unsafe { Platform::SSE41 => unsafe {
sse41::hash_many( crate::sse41::hash_many(
inputs, inputs,
key, key,
counter, counter,
@ -206,7 +198,7 @@ impl Platform {
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => unsafe { Platform::AVX2 => unsafe {
avx2::hash_many( crate::avx2::hash_many(
inputs, inputs,
key, key,
counter, counter,
@ -218,10 +210,10 @@ impl Platform {
) )
}, },
// Safe because detect() checked for platform support. // Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe { Platform::AVX512 => unsafe {
c_avx512::hash_many( crate::avx512::hash_many(
inputs, inputs,
key, key,
counter, counter,
@ -235,7 +227,7 @@ impl Platform {
// Assumed to be safe if the "c_neon" feature is on. // Assumed to be safe if the "c_neon" feature is on.
#[cfg(feature = "c_neon")] #[cfg(feature = "c_neon")]
Platform::NEON => unsafe { Platform::NEON => unsafe {
c_neon::hash_many( crate::neon::hash_many(
inputs, inputs,
key, key,
counter, counter,
@ -248,11 +240,52 @@ impl Platform {
}, },
} }
} }
// Explicit platform constructors, for benchmarks.
/// Returns the `Portable` platform variant unconditionally; no CPU feature
/// detection is performed. Intended for callers (such as benchmarks) that
/// need to select a specific implementation explicitly.
pub fn portable() -> Self {
Self::Portable
}
/// Explicit constructor for the SSE4.1 platform.
///
/// Returns `Some(Platform::SSE41)` when `sse41_detected()` reports support,
/// and `None` otherwise. Only compiled on x86 and x86_64 targets.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41() -> Option<Self> {
    // Bail out first so the success path reads straight through.
    if !sse41_detected() {
        return None;
    }
    Some(Self::SSE41)
}
/// Explicit constructor for the AVX2 platform.
///
/// Returns `Some(Platform::AVX2)` when `avx2_detected()` reports support,
/// and `None` otherwise. Only compiled on x86 and x86_64 targets.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx2() -> Option<Self> {
    match avx2_detected() {
        true => Some(Self::AVX2),
        false => None,
    }
}
/// Explicit constructor for the AVX-512 platform.
///
/// Returns `Some(Platform::AVX512)` when `avx512_detected()` reports
/// support, and `None` otherwise. Only compiled when the `c` feature is
/// enabled on x86 and x86_64 targets.
#[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx512() -> Option<Self> {
    // Bail out first so the success path reads straight through.
    if !avx512_detected() {
        return None;
    }
    Some(Self::AVX512)
}
/// Explicit constructor for the NEON platform.
///
/// No runtime detection is performed: enabling the `c_neon` feature is
/// treated as the builder's assertion that NEON is available, so this always
/// returns `Some(Platform::NEON)`.
///
/// This constructor is gated on the `c_neon` feature alone, matching the
/// other `Platform::NEON` code paths in this impl. It must not carry an
/// x86/x86_64 `target_arch` gate, because that would compile it out on the
/// ARM targets where NEON actually exists.
#[cfg(feature = "c_neon")]
pub fn neon() -> Option<Self> {
    // Assumed to be safe if the "c_neon" feature is on.
    Some(Self::NEON)
}
} }
// Note that AVX-512 is divided into multiple featuresets, and we use two of // Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL. // them, F and VL.
#[cfg(feature = "c_avx512")] #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)] #[inline(always)]
pub fn avx512_detected() -> bool { pub fn avx512_detected() -> bool {

View File

@ -3,10 +3,16 @@ name = "test_vectors"
version = "0.0.0" version = "0.0.0"
edition = "2018" edition = "2018"
[features]
default = []
c = ["blake3/c"]
c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"]
c_neon = ["blake3/c_neon"]
[dependencies] [dependencies]
# If you ever change these path dependencies, you'll probably need to update # If you ever change these path dependencies, you'll probably need to update
# cross_test.sh, or CI will break. I'm sorry >.< # cross_test.sh, or CI will break. I'm sorry >.<
blake3 = { path = "../", features=["c_avx512"] } blake3 = { path = "../" }
hex = "0.4.0" hex = "0.4.0"
reference_impl = { path = "../reference_impl" } reference_impl = { path = "../reference_impl" }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }

View File

@ -19,7 +19,7 @@ mv blake3/test_vectors .
mv blake3/reference_impl test_vectors mv blake3/reference_impl test_vectors
mv blake3 test_vectors mv blake3 test_vectors
cd test_vectors cd test_vectors
sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
cross test "$@" cross test "$@"