integrate assembly implementations into the blake3 crate

2024-05-08 19:06:02 +02:00 · 2020-02-11 14:13:30 -05:00 · 2020-02-11 14:13:30 -05:00 · efbfa0463c
parent b6b3c27824
commit efbfa0463c
16 changed files with 465 additions and 192 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -24,22 +24,30 @@ jobs:
        toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
        profile: minimal
        override: true
-    # Default tests.
+    # Default tests plus Rayon.
-    - run: cargo test
+    - run: cargo test --features=rayon
-    # No-default-features tests.
+    # no_std tests.
    - run: cargo test --no-default-features
-    # More features tests. Note that "c_avx512" participates in dynamic feature
+    # Test the x86 assembly implementations. Use -vv to log compiler commands.
-    # detection, so it'll be built, but it probably won't run.
+    - run: cargo test --features=c -vv
-    - run: cargo test --features=c_avx512,rayon
+    # Test the C intrinsics implementations. Use -vv to log compiler commands.
    - run: cargo test --features=c,c_prefer_intrinsics -vv
    # Test release mode. This does more iterations in test_fuzz_hasher.
    - run: cargo test --release
-    # Test benchmarks. Nightly only.
+    # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
-    - run: cargo test --benches
+    - run: cargo test --benches --features=c
-      if: matrix.rust_version == 'nightly'
+      env:
        RUSTC_BOOTSTRAP: 1
    # Test vectors.
    - name: test vectors
      run: cargo test
      working-directory: ./test_vectors
    - name: test vectors
      run: cargo test --features=c
      working-directory: ./test_vectors
    - name: test vectors
      run: cargo test --features=c,c_prefer_intrinsics
      working-directory: ./test_vectors
    # Test b3sum.
    - name: test b3sum
      run: cargo test
--- a/Cargo.toml
+++ b/Cargo.toml
@ -11,10 +11,21 @@ edition = "2018"
 [features]
 default = ["std"]
-# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU
+# The "c" feature includes C and assembly SIMD implementations of the
-# feature detection. A binary with "c_avx512" on is still cross-platform. This
+# compression function for x86 platforms, called via FFI. (Currently it has no
-# feature has no effect on non-x86.
+# effect on other platforms.) This requires a C toolchain on the build machine.
-c_avx512 = []
+# This is necessary for AVX-512 support, which is not yet stable in Rust, and
 # the assembly implementations also perform better than those using Rust/LLVM
 # intrinsics. As with the Rust implementations, these C and assembly
 # implementations participate in runtime CPU feature detection, and the
 # resulting binary is portable.
 c = []
 # Normally x86-64 builds prefer assembly implementations over C intrinsics. The
 # assembly implementations perform better, perform most consistently across
 # compilers, and are much faster to build. However, this feature makes the
 # build use the C intrinsics implementations instead. This is mainly for
 # testing purposes, and most callers will not want to use it.
 c_prefer_intrinsics = []
 # The NEON implementation does not participate in dynamic feature detection,
 # which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note
 # that AArch64 always supports NEON, but support on ARMv7 varies.
--- a/README.md
+++ b/README.md
@ -33,19 +33,18 @@ with BLAKE3.
 This repository is the official implementation of BLAKE3. It includes:
 * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
-  includes optimized SIMD implementations, with dynamic CPU feature
+  includes optimized SIMD implementations, with runtime CPU feature
-  detection on x86. SSE4.1 and AVX2 support are implemented in Rust,
+  detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c`
-  while AVX-512 and ARM NEON support are imported from the C
+  feature enables C/assembly implementations and AVX-512 support. The
-  implementation and controlled by the `c_avx512` and `c_neon` features.
+  `c_neon` feature enables ARM NEON support. Multi-threading is also
-  Multi-threading is implemented with
+  supported, and the `rayon` feature provides a
-  [Rayon](https://github.com/rayon-rs/rayon) and controlled by the
+  [Rayon](https://github.com/rayon-rs/rayon)-based implementation.
  `rayon` feature. 
 * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
  provides a command line interface. You can install it from
  [crates.io](https://crates.io/crates/b3sum) with `cargo install
-  b3sum`. It enables the multi-threading and AVX-512 features of the
+  b3sum`. It enables the `rayon` and `c` features of the `blake3` crate
-  `blake3` crate by default.
+  by default.
 * The [C implementation](c), which like the Rust implementation includes
  SIMD code and dynamic CPU feature detection on x86. Unlike the Rust
@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
 ## Usage
 This repository provides the `b3sum` command line utility and the
 `blake3` Rust crate.
 ### The `b3sum` utility
 The `b3sum` utility allows you to process files and data from standard
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@ -9,8 +9,8 @@ readme = "README.md"
 edition = "2018"
 [features]
-default = ["c_avx512", "rayon"]
+default = ["c", "rayon"]
-c_avx512 = ["blake3/c_avx512"]
+c = ["blake3/c"]
 c_neon = ["blake3/c_neon"]
 rayon = ["blake3/rayon", "memmap"]
--- a/benches/bench.rs
+++ b/benches/bench.rs
@ -4,7 +4,7 @@ extern crate test;
 use arrayref::array_ref;
 use arrayvec::ArrayVec;
-use blake3::platform::MAX_SIMD_DEGREE;
+use blake3::platform::{Platform, MAX_SIMD_DEGREE};
 use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
 use rand::prelude::*;
 use test::Bencher;
@ -48,173 +48,149 @@ impl RandomInput {
    }
 }
-type CompressInPlaceFn =
+fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
    unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
 fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
    let mut state = [1u32; 8];
    let mut r = RandomInput::new(b, 64);
    let input = array_ref!(r.get(), 0, 64);
-    unsafe {
+    b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
        b.iter(|| f(&mut state, input, 64 as u8, 0, 0));
    }
 }
 #[bench]
 fn bench_single_compression_portable(b: &mut Bencher) {
-    bench_single_compression_fn(b, blake3::portable::compress_in_place);
+    bench_single_compression_fn(b, Platform::portable());
 }
 #[bench]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn bench_single_compression_sse41(b: &mut Bencher) {
-    if !blake3::platform::sse41_detected() {
+    if let Some(platform) = Platform::sse41() {
-        return;
+        bench_single_compression_fn(b, platform);
    }
    bench_single_compression_fn(b, blake3::sse41::compress_in_place);
 }
 #[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
 fn bench_single_compression_avx512(b: &mut Bencher) {
-    if !blake3::platform::avx512_detected() {
+    if let Some(platform) = Platform::avx512() {
-        return;
+        bench_single_compression_fn(b, platform);
    }
    bench_single_compression_fn(b, blake3::c_avx512::compress_in_place);
 }
-type HashManyFn<A> = unsafe fn(
+fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
-    inputs: &[&A],
+    let degree = platform.simd_degree();
    key: &[u32; 8],
    counter: u64,
    increment_counter: blake3::IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
 );
 fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) {
    let mut inputs = Vec::new();
    for _ in 0..degree {
        inputs.push(RandomInput::new(b, CHUNK_LEN));
    }
-    unsafe {
+    b.iter(|| {
-        b.iter(|| {
+        let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
-            let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
+            .iter_mut()
-                .iter_mut()
+            .take(degree)
-                .take(degree)
+            .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
-                .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
+            .collect();
-                .collect();
+        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-            let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
+        platform.hash_many(
-            f(
+            &input_arrays[..],
-                &input_arrays[..],
+            &[0; 8],
-                &[0; 8],
+            0,
-                0,
+            blake3::IncrementCounter::Yes,
-                blake3::IncrementCounter::Yes,
+            0,
-                0,
+            0,
-                0,
+            0,
-                0,
+            &mut out,
-                &mut out,
+        );
-            );
+    });
        });
    }
 }
 #[bench]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn bench_many_chunks_sse41(b: &mut Bencher) {
-    if !blake3::platform::sse41_detected() {
+    if let Some(platform) = Platform::sse41() {
-        return;
+        bench_many_chunks_fn(b, platform);
    }
    bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
 }
 #[bench]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn bench_many_chunks_avx2(b: &mut Bencher) {
-    if !blake3::platform::avx2_detected() {
+    if let Some(platform) = Platform::avx2() {
-        return;
+        bench_many_chunks_fn(b, platform);
    }
    bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
 }
 #[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
 fn bench_many_chunks_avx512(b: &mut Bencher) {
-    if !blake3::platform::avx512_detected() {
+    if let Some(platform) = Platform::avx512() {
-        return;
+        bench_many_chunks_fn(b, platform);
    }
    bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
 }
 #[bench]
 #[cfg(feature = "c_neon")]
 fn bench_many_chunks_neon(b: &mut Bencher) {
-    // When "c_neon" is on, NEON support is assumed.
+    if let Some(platform) = Platform::neon() {
-    bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
+        bench_many_chunks_fn(b, platform);
    }
 }
 // TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) {
+fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
    let degree = platform.simd_degree();
    let mut inputs = Vec::new();
    for _ in 0..degree {
        inputs.push(RandomInput::new(b, BLOCK_LEN));
    }
-    unsafe {
+    b.iter(|| {
-        b.iter(|| {
+        let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
-            let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
+            .iter_mut()
-                .iter_mut()
+            .take(degree)
-                .take(degree)
+            .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
-                .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
+            .collect();
-                .collect();
+        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-            let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
+        platform.hash_many(
-            f(
+            &input_arrays[..],
-                &input_arrays[..],
+            &[0; 8],
-                &[0; 8],
+            0,
-                0,
+            blake3::IncrementCounter::No,
-                blake3::IncrementCounter::No,
+            0,
-                0,
+            0,
-                0,
+            0,
-                0,
+            &mut out,
-                &mut out,
+        );
-            );
+    });
        });
    }
 }
 #[bench]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn bench_many_parents_sse41(b: &mut Bencher) {
-    if !blake3::platform::sse41_detected() {
+    if let Some(platform) = Platform::sse41() {
-        return;
+        bench_many_parents_fn(b, platform);
    }
    bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
 }
 #[bench]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn bench_many_parents_avx2(b: &mut Bencher) {
-    if !blake3::platform::avx2_detected() {
+    if let Some(platform) = Platform::avx2() {
-        return;
+        bench_many_parents_fn(b, platform);
    }
    bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
 }
 #[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
 fn bench_many_parents_avx512(b: &mut Bencher) {
-    if !blake3::platform::avx512_detected() {
+    if let Some(platform) = Platform::avx512() {
-        return;
+        bench_many_parents_fn(b, platform);
    }
    bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
 }
 #[bench]
 #[cfg(feature = "c_neon")]
 fn bench_many_parents_neon(b: &mut Bencher) {
-    // When "c_neon" is on, NEON support is assumed.
+    if let Some(platform) = Platform::neon() {
-    bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
+        bench_many_parents_fn(b, platform);
    }
 }
 fn bench_atonce(b: &mut Bencher, len: usize) {
--- a/build.rs
+++ b/build.rs
@ -13,6 +13,11 @@ fn is_x86_64() -> bool {
    target_components()[0] == "x86_64"
 }
 // Reports whether the first component of the build target names one of the
 // 32-bit x86 architectures recognized here (i386, i586 or i686).
 fn is_x86_32() -> bool {
    let components = target_components();
    let arch = &components[0];
    ["i386", "i586", "i686"].iter().any(|&name| arch == name)
 }
 fn is_armv7() -> bool {
    target_components()[0] == "armv7"
 }
@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool {
        && target_components()[3] == "msvc"
 }
 // Reports whether target components 1..=3 are "pc", "windows" and "gnu".
 fn is_windows_gnu() -> bool {
    // Some targets are only two components long, so check in steps: bail out
    // at the first mismatch instead of indexing further components.
    if target_components()[1] != "pc" {
        return false;
    }
    if target_components()[2] != "windows" {
        return false;
    }
    target_components()[3] == "gnu"
 }
 fn new_build() -> cc::Build {
    let mut build = cc::Build::new();
    if !is_windows_msvc() {
@ -37,16 +49,16 @@ fn new_build() -> cc::Build {
 }
 const WINDOWS_MSVC_ERROR: &str = r#"
-The "c_avx512" feature is enabled, but your version of the MSVC C compiler does
+The "c" feature is enabled, but your version of the MSVC C compiler does not
-not support the "/arch:AVX512" flag. If you are building the "b3sum" or
+support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin"
-"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features"
+crates, you can disable AVX-512 with Cargo's "--no-default-features" flag.
-flag. (Note that this also disables other default features like Rayon-based
+(Note that this also disables other default features like Rayon-based
 multithreading, which you can re-enable with "--features=rayon".) Other crates
 might or might not support this workaround.
 "#;
 const GNU_ERROR: &str = r#"
-The "c_avx512" feature is enabled, but your C compiler does not support the
+The "c" feature is enabled, but your C compiler does not support the
 "-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can
 disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also
 disables other default features like Rayon-based multithreading, which you can
@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) {
 }
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // "c_avx512' is a no-op for non-x86_64 targets. It also participates in
+    if defined("CARGO_FEATURE_C") {
-    // dynamic CPU feature detection, so it's generally safe to enable.
+        if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") {
-    // However, it probably won't build in some older environments without
+            // On 64-bit, use the assembly implementations, unless the
-    // AVX-512 support in the C compiler, and it's disabled by default for that
+            // "c_prefer_intrinsics" feature is enabled.
-    // reason.
+            if is_windows_msvc() {
-    if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() {
+                let mut build = new_build();
-        let mut build = new_build();
+                build.file("c/blake3-sse41-x86_64-windows-msvc.asm");
-        check_for_avx512_compiler_support(&build);
+                build.file("c/blake3-avx2-x86_64-windows-msvc.asm");
-        build.file("c/blake3_avx512.c");
+                build.file("c/blake3-avx512-x86_64-windows-msvc.asm");
-        if is_windows_msvc() {
+                build.compile("blake3_asm");
-            // Note that a lot of versions of MSVC don't support /arch:AVX512,
+            } else if is_windows_gnu() {
-            // and they'll discard it with a warning, hopefully leading to a
+                let mut build = new_build();
-            // build error.
+                build.file("c/blake3-sse41-x86_64-windows-gnu.S");
-            build.flag("/arch:AVX512");
+                build.file("c/blake3-avx2-x86_64-windows-gnu.S");
                build.file("c/blake3-avx512-x86_64-windows-gnu.S");
                build.compile("blake3_asm");
            } else {
                // All non-Windows implementations are assumed to support
                // Linux-style assembly. These files do contain a small
                // explicit workaround for macOS also.
                let mut build = new_build();
                build.file("c/blake3-sse41-x86_64-unix.S");
                build.file("c/blake3-avx2-x86_64-unix.S");
                build.file("c/blake3-avx512-x86_64-unix.S");
                build.compile("blake3_asm");
            }
        } else if is_x86_64() || is_x86_32() {
            // Assembly implementations are only for 64-bit. On 32-bit, or if
            // the "c_prefer_intrinsics" feature is enabled, use the
            // intrinsics-based C implementations. These each need to be
            // compiled separately, with the corresponding instruction set
            // extension explicitly enabled in the compiler.
            let mut sse41_build = new_build();
            sse41_build.file("c/blake3_sse41.c");
            if is_windows_msvc() {
                // /arch:SSE2 is the default on x86 and undefined on x86_64:
                // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
                // It also includes SSE4.1 intrinsics:
                // https://stackoverflow.com/a/32183222/823869
            } else {
                sse41_build.flag("-msse4.1");
            }
            sse41_build.compile("blake3_sse41");
            let mut avx2_build = new_build();
            avx2_build.file("c/blake3_avx2.c");
            if is_windows_msvc() {
                avx2_build.flag("/arch:AVX2");
            } else {
                avx2_build.flag("-mavx2");
            }
            avx2_build.compile("blake3_avx2");
            let mut avx512_build = new_build();
            check_for_avx512_compiler_support(&avx512_build);
            avx512_build.file("c/blake3_avx512.c");
            if is_windows_msvc() {
                // Note that a lot of versions of MSVC don't support /arch:AVX512,
                // and they'll discard it with a warning, hopefully leading to a
                // build error.
                avx512_build.flag("/arch:AVX512");
            } else {
                avx512_build.flag("-mavx512f");
                avx512_build.flag("-mavx512vl");
            }
            avx512_build.compile("blake3_avx512");
        } else {
-            build.flag("-mavx512f");
+            // Currently no effect for non-x86 platforms.
            build.flag("-mavx512vl");
        }
        build.compile("blake3_avx512");
    }
    if defined("CARGO_FEATURE_C_NEON") {
--- a/src/c_avx2.rs
+++ b/src/c_avx2.rs
@ -0,0 +1,63 @@
 use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 // Note that there is no AVX2 implementation of compress_in_place or
 // compress_xof.
 // Forwards a batch of inputs to the `blake3_hash_many_avx2` FFI routine,
 // passing `out` as the output pointer.
 //
 // Unsafe because this may only be called on platforms supporting AVX2.
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    inputs: &[&A],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
 ) {
    // Unlike the Rust hash_many implementations, the C implementations do no
    // bounds checking on the `out` array. Even though this function is
    // already unsafe, make sure there is room for OUT_LEN bytes per input.
    let num_inputs = inputs.len();
    assert!(out.len() >= num_inputs * OUT_LEN);
    // How many BLOCK_LEN-sized blocks fit in one input array.
    let blocks = A::CAPACITY / BLOCK_LEN;
    let inputs_ptr = inputs.as_ptr() as *const *const u8;
    ffi::blake3_hash_many_avx2(
        inputs_ptr,
        num_inputs,
        blocks,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
 }
 // Raw `extern "C"` declaration of the AVX2 routine called by the safe-ish
 // wrapper above. The symbol is implemented outside of Rust.
 pub mod ffi {
    extern "C" {
        // `inputs` points to `num_inputs` input pointers and `blocks` is the
        // per-input block count; the wrapper in this file asserts that `out`
        // has room for OUT_LEN bytes per input before calling.
        pub fn blake3_hash_many_avx2(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
 }
 // Unit tests for the AVX2 FFI wrapper.
 #[cfg(test)]
 mod test {
    use super::*;
    #[test]
    fn test_hash_many() {
        // Calling hash_many without AVX2 support would violate its safety
        // contract, so the test is silently skipped on such machines.
        if !crate::platform::avx2_detected() {
            return;
        }
        // The same function is passed for both arguments of the shared test
        // helper (presumably the chunks and parents variants -- confirm
        // against crate::test).
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
 }
--- a/src/c_avx512.rs
+++ b/src/c_avx512.rs
@ -1,7 +1,5 @@
 use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 pub const DEGREE: usize = 16;
 // Unsafe because this may only be called on platforms supporting AVX-512.
 pub unsafe fn compress_in_place(
    cv: &mut CVWords,
@ -91,7 +89,6 @@ pub mod ffi {
            flags_end: u8,
            out: *mut u8,
        );
    }
 }
--- a/src/c_neon.rs
+++ b/src/c_neon.rs
@ -1,7 +1,5 @@
 use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 pub const DEGREE: usize = 4;
 // Unsafe because this may only be called on platforms supporting NEON.
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    inputs: &[&A],
--- a/src/c_sse41.rs
+++ b/src/c_sse41.rs
@ -0,0 +1,114 @@
 use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 // Forwards to the `blake3_compress_in_place_sse41` FFI routine, handing it a
 // mutable pointer to `cv`.
 //
 // Unsafe because this may only be called on platforms supporting SSE4.1.
 pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
 ) {
    let cv_ptr = cv.as_mut_ptr();
    let block_ptr = block.as_ptr();
    ffi::blake3_compress_in_place_sse41(cv_ptr, block_ptr, block_len, counter, flags)
 }
 // Calls the `blake3_compress_xof_sse41` FFI routine with a zero-initialized
 // 64-byte buffer as its output pointer and returns that buffer.
 //
 // Unsafe because this may only be called on platforms supporting SSE4.1.
 pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
 ) -> [u8; 64] {
    let mut xof_block = [0u8; 64];
    let out_ptr = xof_block.as_mut_ptr();
    ffi::blake3_compress_xof_sse41(
        cv.as_ptr(),
        block.as_ptr(),
        block_len,
        counter,
        flags,
        out_ptr,
    );
    xof_block
 }
 // Forwards a batch of inputs to the `blake3_hash_many_sse41` FFI routine,
 // passing `out` as the output pointer.
 //
 // Unsafe because this may only be called on platforms supporting SSE4.1.
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    inputs: &[&A],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
 ) {
    // Unlike the Rust hash_many implementations, the C implementations do no
    // bounds checking on the `out` array. Even though this function is
    // already unsafe, make sure there is room for OUT_LEN bytes per input.
    let num_inputs = inputs.len();
    assert!(out.len() >= num_inputs * OUT_LEN);
    // How many BLOCK_LEN-sized blocks fit in one input array.
    let blocks = A::CAPACITY / BLOCK_LEN;
    let inputs_ptr = inputs.as_ptr() as *const *const u8;
    ffi::blake3_hash_many_sse41(
        inputs_ptr,
        num_inputs,
        blocks,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
 }
 // Raw `extern "C"` declarations of the SSE4.1 routines called by the
 // wrappers above. The symbols are implemented outside of Rust.
 pub mod ffi {
    extern "C" {
        // Takes `cv` as a mutable pointer; no separate output argument.
        pub fn blake3_compress_in_place_sse41(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        // Takes `cv` as a const pointer plus a separate `out` pointer; the
        // wrapper in this file supplies a 64-byte buffer for `out`.
        pub fn blake3_compress_xof_sse41(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        // `inputs` points to `num_inputs` input pointers and `blocks` is the
        // per-input block count; the wrapper in this file asserts that `out`
        // has room for OUT_LEN bytes per input before calling.
        pub fn blake3_hash_many_sse41(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
 }
 // Unit tests for the SSE4.1 FFI wrappers.
 #[cfg(test)]
 mod test {
    use super::*;
    #[test]
    fn test_compress() {
        // Calling the wrappers without SSE4.1 support would violate their
        // safety contract, so the test is silently skipped on such machines.
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }
    #[test]
    fn test_hash_many() {
        // Same skip-if-unsupported guard as test_compress.
        if !crate::platform::sse41_detected() {
            return;
        }
        // The same function is passed for both arguments of the shared test
        // helper (presumably the chunks and parents variants -- confirm
        // against crate::test).
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -39,24 +39,32 @@ mod test;
 #[doc(hidden)]
 pub mod guts;
-// These modules are pub for benchmarks only. They are not stable.
+// The platform module is pub for benchmarks only. It is not stable.
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[doc(hidden)]
 pub mod avx2;
 #[cfg(feature = "c_avx512")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[doc(hidden)]
 pub mod c_avx512;
 #[cfg(feature = "c_neon")]
 #[doc(hidden)]
 pub mod c_neon;
 #[doc(hidden)]
 pub mod platform;
-#[doc(hidden)]
+
-pub mod portable;
+// Platform-specific implementations of the compression function.
 mod portable;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[doc(hidden)]
+cfg_if::cfg_if! {
-pub mod sse41;
+    if #[cfg(feature = "c")] {
        #[path = "c_sse41.rs"]
        mod sse41;
        #[path = "c_avx2.rs"]
        mod avx2;
        #[path = "c_avx512.rs"]
        mod avx512;
    } else {
        #[path = "rust_sse41.rs"]
        mod sse41;
        #[path = "rust_avx2.rs"]
        mod avx2;
        // Stable Rust does not currently support AVX-512.
    }
 }
 #[cfg(feature = "c_neon")]
 #[path = "c_neon.rs"]
 mod neon;
 pub mod traits;
--- a/src/platform.rs
+++ b/src/platform.rs
@ -1,18 +1,10 @@
 use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
 use arrayref::{array_mut_ref, array_ref};
 #[cfg(feature = "c_avx512")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 use crate::c_avx512;
 #[cfg(feature = "c_neon")]
 use crate::c_neon;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 use crate::{avx2, sse41};
 cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
-            if #[cfg(feature = "c_avx512")] {
+            if #[cfg(feature = "c")] {
                pub const MAX_SIMD_DEGREE: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE: usize = 8;
@ -32,7 +24,7 @@ cfg_if::cfg_if! {
 cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
-            if #[cfg(feature = "c_avx512")] {
+            if #[cfg(feature = "c")] {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
@ -52,7 +44,7 @@ pub enum Platform {
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
-    #[cfg(feature = "c_avx512")]
+    #[cfg(feature = "c")]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX512,
    #[cfg(feature = "c_neon")]
@ -64,7 +56,7 @@ impl Platform {
    pub fn detect() -> Self {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
-            #[cfg(feature = "c_avx512")]
+            #[cfg(feature = "c")]
            {
                if avx512_detected() {
                    return Platform::AVX512;
@ -93,7 +85,7 @@ impl Platform {
            Platform::SSE41 => 4,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => 8,
-            #[cfg(feature = "c_avx512")]
+            #[cfg(feature = "c")]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => 16,
            #[cfg(feature = "c_neon")]
@ -103,7 +95,7 @@ impl Platform {
        degree
    }
-    pub(crate) fn compress_in_place(
+    pub fn compress_in_place(
        &self,
        cv: &mut CVWords,
        block: &[u8; BLOCK_LEN],
@ -116,13 +108,13 @@ impl Platform {
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
-                sse41::compress_in_place(cv, block, block_len, counter, flags)
+                crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
-            #[cfg(feature = "c_avx512")]
+            #[cfg(feature = "c")]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
-                c_avx512::compress_in_place(cv, block, block_len, counter, flags)
+                crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
            },
            // No NEON compress_in_place() implementation yet.
            #[cfg(feature = "c_neon")]
@ -130,7 +122,7 @@ impl Platform {
        }
    }
-    pub(crate) fn compress_xof(
+    pub fn compress_xof(
        &self,
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
@ -143,13 +135,13 @@ impl Platform {
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
-                sse41::compress_xof(cv, block, block_len, counter, flags)
+                crate::sse41::compress_xof(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
-            #[cfg(feature = "c_avx512")]
+            #[cfg(feature = "c")]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
-                c_avx512::compress_xof(cv, block, block_len, counter, flags)
+                crate::avx512::compress_xof(cv, block, block_len, counter, flags)
            },
            // No NEON compress_xof() implementation yet.
            #[cfg(feature = "c_neon")]
@ -167,7 +159,7 @@ impl Platform {
    // after every block, there's a small but measurable performance loss.
    // Compressing chunks with a dedicated loop avoids this.
-    pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>(
+    pub fn hash_many<A: arrayvec::Array<Item = u8>>(
        &self,
        inputs: &[&A],
        key: &CVWords,
@ -192,7 +184,7 @@ impl Platform {
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => unsafe {
-                sse41::hash_many(
+                crate::sse41::hash_many(
                    inputs,
                    key,
                    counter,
@ -206,7 +198,7 @@ impl Platform {
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe {
-                avx2::hash_many(
+                crate::avx2::hash_many(
                    inputs,
                    key,
                    counter,
@ -218,10 +210,10 @@ impl Platform {
                )
            },
            // Safe because detect() checked for platform support.
-            #[cfg(feature = "c_avx512")]
+            #[cfg(feature = "c")]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
-                c_avx512::hash_many(
+                crate::avx512::hash_many(
                    inputs,
                    key,
                    counter,
@ -235,7 +227,7 @@ impl Platform {
            // Assumed to be safe if the "c_neon" feature is on.
            #[cfg(feature = "c_neon")]
            Platform::NEON => unsafe {
-                c_neon::hash_many(
+                crate::neon::hash_many(
                    inputs,
                    key,
                    counter,
@ -248,11 +240,52 @@ impl Platform {
            },
        }
    }
    // Explicit platform constructors, for benchmarks.
    pub fn portable() -> Self {
        Self::Portable
    }
    // Yields the SSE4.1 platform only when sse41_detected() reports support;
    // otherwise None.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn sse41() -> Option<Self> {
        if !sse41_detected() {
            return None;
        }
        Some(Self::SSE41)
    }
    // Yields the AVX2 platform only when avx2_detected() reports support;
    // otherwise None.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx2() -> Option<Self> {
        if !avx2_detected() {
            return None;
        }
        Some(Self::AVX2)
    }
    // Yields the AVX-512 platform only when avx512_detected() reports
    // support; otherwise None. Only compiled with the "c" feature on x86.
    #[cfg(feature = "c")]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx512() -> Option<Self> {
        if !avx512_detected() {
            return None;
        }
        Some(Self::AVX512)
    }
    // Explicit NEON constructor. Always returns Some, because there is no
    // runtime NEON detection: support is assumed whenever the "c_neon"
    // feature is enabled.
    //
    // This is gated on the "c_neon" feature only, matching the gate on the
    // `Platform::NEON` variant. It must not additionally require an x86
    // target: NEON is an ARM extension, and an x86-only gate would compile
    // this constructor out on exactly the targets where it is needed.
    #[cfg(feature = "c_neon")]
    pub fn neon() -> Option<Self> {
        // Assumed to be safe if the "c_neon" feature is on.
        Some(Self::NEON)
    }
 }
 // Note that AVX-512 is divided into multiple featuresets, and we use two of
 // them, F and VL.
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[inline(always)]
 pub fn avx512_detected() -> bool {
--- a/src/rust_avx2.rs
+++ b/src/rust_avx2.rs
--- a/src/rust_sse41.rs
+++ b/src/rust_sse41.rs
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@ -3,10 +3,16 @@ name = "test_vectors"
 version = "0.0.0"
 edition = "2018"
 [features]
 default = []
 c = ["blake3/c"]
 c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"]
 c_neon = ["blake3/c_neon"]
 [dependencies]
 # If you ever change these path dependencies, you'll probably need to update
 # cross_test.sh, or CI will break. I'm sorry >.<
-blake3 = { path = "../", features=["c_avx512"] }
+blake3 = { path = "../" }
 hex = "0.4.0"
 reference_impl = { path = "../reference_impl" }
 serde = { version = "1.0", features = ["derive"] }
--- a/test_vectors/cross_test.sh
+++ b/test_vectors/cross_test.sh
@ -19,7 +19,7 @@ mv blake3/test_vectors .
 mv blake3/reference_impl test_vectors
 mv blake3 test_vectors
 cd test_vectors
-sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml
+sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
 sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
 cross test "$@"