From e06a0f255ae22449e96b62d0d733943c6a86cb71 Mon Sep 17 00:00:00 2001
From: Jack O'Connor <oconnor663@gmail.com>
Date: Sat, 28 Mar 2020 17:27:31 -0400
Subject: [PATCH] refactor the Cargo feature set

The biggest change here is that assembly implementations are enabled by
default.

Added features:
- "pure" (Pure Rust, with no C or assembly implementations.)

Removed features:
- "c" (Now basically the default.)

Renamed features:
- "c_prefer_intrinsics" -> "prefer_intrinsics"
- "c_neon" -> "neon"

Unchanged:
- "rayon"
- "std" (Still the only feature on by default.)
---
 .github/workflows/ci.yml           | 24 +++++-----
 Cargo.toml                         | 59 ++++++++++++++----------
 README.md                          | 31 ++++++-------
 b3sum/Cargo.toml                   |  6 +--
 benches/bench.rs                   | 10 ++--
 build.rs                           | 74 ++++++++++--------------------
 c/blake3_c_rust_bindings/build.rs  |  4 +-
 src/{c_avx2.rs => ffi_avx2.rs}     |  0
 src/{c_avx512.rs => ffi_avx512.rs} |  0
 src/{c_neon.rs => ffi_neon.rs}     |  2 +-
 src/{c_sse41.rs => ffi_sse41.rs}   |  0
 src/lib.rs                         | 49 +++++++++++++-------
 src/platform.rs                    | 52 ++++++++++-----------
 test_vectors/Cargo.toml            |  7 ++-
 14 files changed, 155 insertions(+), 163 deletions(-)
 rename src/{c_avx2.rs => ffi_avx2.rs} (100%)
 rename src/{c_avx512.rs => ffi_avx512.rs} (100%)
 rename src/{c_neon.rs => ffi_neon.rs} (96%)
 rename src/{c_sse41.rs => ffi_sse41.rs} (100%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f9162cb..4cdd644 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,27 +30,27 @@ jobs:
     - run: cargo test --features=rayon
     # no_std tests.
     - run: cargo test --no-default-features
-    # Test the x86 assembly implementations. Use -vv to log compiler commands.
-    - run: cargo test --features=c -vv
-    # Test the C intrinsics implementations. Use -vv to log compiler commands.
-    - run: cargo test --features=c,c_prefer_intrinsics -vv
+    # Test the intrinsics implementations.
+    - run: cargo test --features=prefer_intrinsics
+    # Test the pure Rust build.
+    - run: cargo test --features=pure
     # Test release mode. This does more iteratations in test_fuzz_hasher.
     - run: cargo test --release
-    - run: cargo test --release --features=c
-    - run: cargo test --release --features=c,c_prefer_intrinsics
+    - run: cargo test --release --features=prefer_intrinsics
+    - run: cargo test --release --features=pure
     # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
-    - run: cargo test --benches --features=c
+    - run: cargo test --benches
       env:
         RUSTC_BOOTSTRAP: 1
     # Test vectors.
     - name: test vectors
       run: cargo test
       working-directory: ./test_vectors
-    - name: test vectors C assembly
-      run: cargo test --features=c
+    - name: test vectors intrinsics
+      run: cargo test --features=prefer_intrinsics
       working-directory: ./test_vectors
-    - name: test vectors C intrinsics
-      run: cargo test --features=c,c_prefer_intrinsics
+    - name: test vectors pure
+      run: cargo test --features=pure
       working-directory: ./test_vectors
     # Test b3sum.
     - name: test b3sum
@@ -93,7 +93,7 @@ jobs:
     # Test the portable implementation on everything.
     - run: cross test --target ${{ matrix.arch }}
     # Test the NEON implementation on ARM targets.
-    - run: cross test --target ${{ matrix.arch }} --features=c_neon
+    - run: cross test --target ${{ matrix.arch }} --features=neon
       if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-')
     # Test vectors. Note that this uses a hacky script due to path dependency limitations.
     - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }}
diff --git a/Cargo.toml b/Cargo.toml
index d9440fa..dffaf7c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,27 +11,43 @@ edition = "2018"
 
 [features]
 default = ["std"]
-# The "c" feature includes C and assembly SIMD implementations of the
-# compression function for x86 platforms, called via FFI. (Currently it has no
-# effect on other platforms.) This requires a C toolchain on the build machine.
-# This is necessary for AVX-512 support, which is not yet stable in Rust, and
-# the assembly implementations also perform better than those using Rust/LLVM
-# intrinsics. As with the Rust implementations, these C and assembly
-# implementations participate in runtime CPU feature detection, and the
-# resulting binary is portable.
-c = []
-# Normally x86-64 builds prefer assembly implementations over C intrinsics. The
-# assembly implementations perform better, perform most consistently across
-# compilers, and are much faster to build. However, this feature makes the
-# build use the C intrinsics implementations instead. This is mainly for
-# testing purposes, and most callers will not want to use it.
-c_prefer_intrinsics = []
+
+# By default on x86_64, this crate uses Samuel Neves' hand-written assembly
+# implementations for SSE4.1, AVX2, and AVX-512. (These provide both the best
+# runtime performance, and the fastest build times.) And by default on 32-bit
+# x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and
+# a C intrinsics implementation for AVX-512. Enabling the "pure" feature
+# disables all FFI to C and assembly implementations, leaving only the Rust
+# intrinsics implementations for SSE4.1 and AVX2. This removes the dependency
+# on a C compiler/assembler, which can be helpful for certain applications.
+# Library crates should generally avoid this feature, so that each binary crate
+# is free to make its own decision about build dependencies.
+pure = []
+
+# As described above, on x86_64 this crate uses assembly implementations by
+# default. Enabling the "prefer_intrinsics" feature makes this crate use
+# intrinsics implementations on both 32-bit and 64-bit x86. This is mainly for
+# testing, and calling crates should not need it.
+prefer_intrinsics = []
+
 # The NEON implementation does not participate in dynamic feature detection,
-# which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note
-# that AArch64 always supports NEON, but support on ARMv7 varies.
-c_neon = []
+# which is currently x86-only. If "neon" is on, NEON support is assumed. Note
+# that AArch64 always supports NEON, but support on ARMv7 varies. The NEON
+# implementation uses C intrinsics and requires a C compiler.
+neon = []
+
+# This crate uses libstd for std::io trait implementations, and also for
+# runtime CPU feature detection. This feature is enabled by default. If you use
+# --no-default-features, the only way to use the SIMD implementations in this
+# crate is to enable the corresponding instruction sets statically for the
+# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
 std = ["digest/std"]
 
+# The "rayon" feature (defined below as an optional dependency) enables the
+# join::RayonJoin type, which can be used with Hasher::update_with_join to
+# perform multi-threaded hashing. However, even if this feature is enabled, all
+# other APIs remain single-threaded.
+
 [package.metadata.docs.rs]
 # Document blake3::join::RayonJoin on docs.rs.
 features = ["rayon"]
@@ -40,13 +56,6 @@ features = ["rayon"]
 arrayref = "0.3.5"
 arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] }
 constant_time_eq = "0.1.5"
-# A performance note for the "rayon" feature: Multi-threading can have
-# significant overhead for small inputs, particularly on x86 where individual
-# cores are very fast. On the other hand, on slower platforms like ARM,
-# multi-threading can be beneficial for all inputs. There's no one input size
-# threshold that would work well everywhere, and this crate doesn't try to be
-# clever. If you're going to enable the "rayon" feature, you should benchmark
-# it for your specific use case.
 rayon = { version = "1.2.1", optional = true }
 cfg-if = "0.1.10"
 digest = "0.8.1"
diff --git a/README.md b/README.md
index ab2e36c..1cb6f87 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a>
+# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a>&ensp;[![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions)
 
 BLAKE3 is a cryptographic hash function that is:
 
@@ -33,23 +33,19 @@ with BLAKE3.
 This repository is the official implementation of BLAKE3. It includes:
 
 * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
-  includes optimized SIMD implementations, with runtime CPU feature
-  detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c`
-  feature enables C/assembly implementations and AVX-512 support. The
-  `c_neon` feature enables ARM NEON support. Multi-threading is also
-  supported, and the `rayon` feature provides a
-  [Rayon](https://github.com/rayon-rs/rayon)-based implementation.
+  includes optimized SIMD implementations for SSE4.1, AVX2, AVX-512, and
+  NEON, with automatic runtime CPU feature detection on x86. The
+  optional `rayon` feature also enables multi-threading.
 
 * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
-  provides a command line interface. You can install it from
-  [crates.io](https://crates.io/crates/b3sum) with `cargo install
-  b3sum`. It enables the `rayon` and `c` features of the `blake3` crate
-  by default.
+  provides a command line interface. It uses multi-threading by default,
+  making it an order of magnitude faster than e.g. `sha256sum` on
+  typical desktop hardware.
 
 * The [C implementation](c), which like the Rust implementation includes
-  SIMD code and dynamic CPU feature detection on x86. Unlike the Rust
-  implementation, it's not currently multi-threaded. The
-  [README](c/README.md) provides build examples.
+  SIMD code and runtime CPU feature detection on x86. Unlike the Rust
+  implementation, it's not currently multi-threaded. See
+  [`c/README.md`](c/README.md).
 
 * The [reference implementation](reference_impl/reference_impl.rs),
   which is discussed in Section 5.1 of the [BLAKE3
@@ -59,9 +55,6 @@ This repository is the official implementation of BLAKE3. It includes:
   port that doesn't need multi-threading or SIMD optimizations, start
   here.
 
-* [![Actions
-  Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions)
-
 BLAKE3 was designed by:
 
 * [@oconnor663 ](https://github.com/oconnor663) (Jack O'Connor)
@@ -108,7 +101,9 @@ time b3sum /tmp/bigfile
 ### The `blake3` crate
 
 To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to
-your `Cargo.toml`. Here's an example of hashing some input bytes:
+your `Cargo.toml`. Note that by default, unless the `pure` feature is
+enabled, building `blake3` requires a C compiler. Here's an example of
+hashing some input bytes:
 
 ```rust
 // Hash an input all at once.
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index a8b83f4..0e12bfb 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -9,9 +9,9 @@ readme = "README.md"
 edition = "2018"
 
 [features]
-default = ["c"]
-c = ["blake3/c"]
-c_neon = ["blake3/c_neon"]
+neon = ["blake3/neon"]
+prefer_intrinsics = ["blake3/prefer_intrinsics"]
+pure = ["blake3/pure"]
 
 [dependencies]
 anyhow = "1.0.25"
diff --git a/benches/bench.rs b/benches/bench.rs
index 263f81e..a6cd97a 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -69,7 +69,7 @@ fn bench_single_compression_sse41(b: &mut Bencher) {
 }
 
 #[bench]
-#[cfg(feature = "c")]
+#[cfg(not(feature = "pure"))]
 fn bench_single_compression_avx512(b: &mut Bencher) {
     if let Some(platform) = Platform::avx512() {
         bench_single_compression_fn(b, platform);
@@ -119,7 +119,7 @@ fn bench_many_chunks_avx2(b: &mut Bencher) {
 }
 
 #[bench]
-#[cfg(feature = "c")]
+#[cfg(not(feature = "pure"))]
 fn bench_many_chunks_avx512(b: &mut Bencher) {
     if let Some(platform) = Platform::avx512() {
         bench_many_chunks_fn(b, platform);
@@ -127,7 +127,7 @@ fn bench_many_chunks_avx512(b: &mut Bencher) {
 }
 
 #[bench]
-#[cfg(feature = "c_neon")]
+#[cfg(feature = "neon")]
 fn bench_many_chunks_neon(b: &mut Bencher) {
     if let Some(platform) = Platform::neon() {
         bench_many_chunks_fn(b, platform);
@@ -178,7 +178,7 @@ fn bench_many_parents_avx2(b: &mut Bencher) {
 }
 
 #[bench]
-#[cfg(feature = "c")]
+#[cfg(not(feature = "pure"))]
 fn bench_many_parents_avx512(b: &mut Bencher) {
     if let Some(platform) = Platform::avx512() {
         bench_many_parents_fn(b, platform);
@@ -186,7 +186,7 @@ fn bench_many_parents_avx512(b: &mut Bencher) {
 }
 
 #[bench]
-#[cfg(feature = "c_neon")]
+#[cfg(feature = "neon")]
 fn bench_many_parents_neon(b: &mut Bencher) {
     if let Some(platform) = Platform::neon() {
         bench_many_parents_fn(b, platform);
diff --git a/build.rs b/build.rs
index b7f7f7b..5459e84 100644
--- a/build.rs
+++ b/build.rs
@@ -49,21 +49,16 @@ fn new_build() -> cc::Build {
 }
 
 const WINDOWS_MSVC_ERROR: &str = r#"
-The "c" feature is enabled, but your version of the MSVC C compiler does not
-support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin"
-crates, you can disable AVX-512 with Cargo's "--no-default-features" flag.
-(Note that this also disables other default features like Rayon-based
-multithreading, which you can re-enable with "--features=rayon".) Other crates
-might or might not support this workaround.
+Your version of the MSVC C compiler does not support the "/arch:AVX512" flag.
+If you're building the "b3sum" or "bao_bin" crates, you can disable AVX-512
+with "--features=pure". Other crates might or might not support this
+workaround.
 "#;
 
 const GNU_ERROR: &str = r#"
-The "c" feature is enabled, but your C compiler does not support the
-"-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can
-disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also
-disables other default features like Rayon-based multithreading, which you can
-re-enable with "--features=rayon".) Other crates might or might not support
-this workaround.
+Your C compiler does not support the "-mavx512f" flag. If you are building the
+"b3sum" or "bao_bin" crates, you can disable AVX-512 with "--features=pure".
+Other crates might or might not support this workaround.
 "#;
 
 fn check_for_avx512_compiler_support() {
@@ -82,11 +77,15 @@ fn check_for_avx512_compiler_support() {
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    if defined("CARGO_FEATURE_C") {
+    if defined("CARGO_FEATURE_PURE") && defined("CARGO_FEATURE_NEON") {
+        panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
+    }
+
+    if (is_x86_64() || is_x86_32()) && !defined("CARGO_FEATURE_PURE") {
         check_for_avx512_compiler_support();
-        if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") {
+        if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") {
             // On 64-bit, use the assembly implementations, unless the
-            // "c_prefer_intrinsics" feature is enabled.
+            // "prefer_intrinsics" feature is enabled.
             if is_windows_msvc() {
                 let mut build = new_build();
                 build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
@@ -109,40 +108,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                 build.file("c/blake3_avx512_x86-64_unix.S");
                 build.compile("blake3_asm");
             }
-        } else if is_x86_64() || is_x86_32() {
-            // Assembly implementations are only for 64-bit. On 32-bit, or if
-            // the "c_prefer_intrinsics" feature is enabled, use the
-            // intrinsics-based C implementations. These each need to be
-            // compiled separately, with the corresponding instruction set
-            // extension explicitly enabled in the compiler.
-
-            let mut sse41_build = new_build();
-            sse41_build.file("c/blake3_sse41.c");
-            if is_windows_msvc() {
-                // /arch:SSE2 is the default on x86 and undefined on x86_64:
-                // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
-                // It also includes SSE4.1 intrisincs:
-                // https://stackoverflow.com/a/32183222/823869
-            } else {
-                sse41_build.flag("-msse4.1");
-            }
-            sse41_build.compile("blake3_sse41");
-
-            let mut avx2_build = new_build();
-            avx2_build.file("c/blake3_avx2.c");
-            if is_windows_msvc() {
-                avx2_build.flag("/arch:AVX2");
-            } else {
-                avx2_build.flag("-mavx2");
-            }
-            avx2_build.compile("blake3_avx2");
-
+        } else {
+            // Assembly implementations are only for x86_64. On 32-bit x86, or
+            // if the "prefer_intrinsics" feature is enabled, use the Rust
+            // intrinsics implementations for SSE4.1 and AVX2, and the C
+            // intrinsics implementation for AVX-512. (Stable Rust does not yet
+            // support AVX-512.)
             let mut avx512_build = new_build();
             avx512_build.file("c/blake3_avx512.c");
             if is_windows_msvc() {
-                // Note that a lot of versions of MSVC don't support /arch:AVX512,
-                // and they'll discard it with a warning, hopefully leading to a
-                // build error.
                 avx512_build.flag("/arch:AVX512");
             } else {
                 avx512_build.flag("-mavx512f");
@@ -153,16 +127,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                 avx512_build.flag("-fno-asynchronous-unwind-tables");
             }
             avx512_build.compile("blake3_avx512");
-        } else {
-            // Currently no effect for non-x86 platforms.
         }
     }
 
-    if defined("CARGO_FEATURE_C_NEON") {
+    if defined("CARGO_FEATURE_NEON") {
         let mut build = new_build();
         // Note that blake3_neon.c normally depends on the blake3_portable.c
         // for the single-instance compression function, but we expose
-        // portable.rs over FFI instead. See c_neon.rs.
+        // portable.rs over FFI instead. See ffi_neon.rs.
         build.file("c/blake3_neon.c");
         // ARMv7 platforms that support NEON generally need the following
         // flags. AArch64 supports NEON by default and does not support -mpfu.
@@ -173,7 +145,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         build.compile("blake3_neon");
     }
 
-    // The `cc` crate does not automatically emit rerun-if directives for the
+    // The `cc` crate doesn't automatically emit rerun-if directives for the
     // environment variables it supports, in particular for $CC. We expect to
     // do a lot of benchmarking across different compilers, so we explicitly
     // add the variables that we're likely to need.
diff --git a/c/blake3_c_rust_bindings/build.rs b/c/blake3_c_rust_bindings/build.rs
index 125f3f7..85d8170 100644
--- a/c/blake3_c_rust_bindings/build.rs
+++ b/c/blake3_c_rust_bindings/build.rs
@@ -53,7 +53,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     base_build.file("../blake3.c");
     base_build.file("../blake3_dispatch.c");
     base_build.file("../blake3_portable.c");
-    base_build.compile("blake3_c_base");
+    base_build.compile("blake3_base");
 
     if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") {
         // On 64-bit, use the assembly implementations, unless the
@@ -134,7 +134,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             neon_build.flag("-mfpu=neon-vfpv4");
             neon_build.flag("-mfloat-abi=hard");
         }
-        neon_build.compile("blake3_c_neon");
+        neon_build.compile("blake3_neon");
     }
 
     // The `cc` crate does not automatically emit rerun-if directives for the
diff --git a/src/c_avx2.rs b/src/ffi_avx2.rs
similarity index 100%
rename from src/c_avx2.rs
rename to src/ffi_avx2.rs
diff --git a/src/c_avx512.rs b/src/ffi_avx512.rs
similarity index 100%
rename from src/c_avx512.rs
rename to src/ffi_avx512.rs
diff --git a/src/c_neon.rs b/src/ffi_neon.rs
similarity index 96%
rename from src/c_neon.rs
rename to src/ffi_neon.rs
index 77b9654..8899742 100644
--- a/src/c_neon.rs
+++ b/src/ffi_neon.rs
@@ -75,7 +75,7 @@ mod test {
 
     #[test]
     fn test_hash_many() {
-        // This entire file is gated on feature="c_neon", so NEON support is
+        // This entire file is gated on feature="neon", so NEON support is
         // assumed here.
         crate::test::test_hash_many_fn(hash_many, hash_many);
     }
diff --git a/src/c_sse41.rs b/src/ffi_sse41.rs
similarity index 100%
rename from src/c_sse41.rs
rename to src/ffi_sse41.rs
diff --git a/src/lib.rs b/src/lib.rs
index c0915ee..0a0d640 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,13 +29,15 @@
 //!
 //! # Cargo Features
 //!
-//! The `c` feature provides optimized assembly implementations and also
-//! AVX-512 support. It is off by default. If activated, a C compiler for the
-//! target platform is required.
-//!
 //! The `rayon` feature provides [Rayon]-based multi-threading, in particular
 //! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It
-//! is also off by default, but on for [docs.rs].
+//! is disabled by default, but enabled for [docs.rs].
+//!
+//! The `pure` feature disables all FFI to C and assembly implementations,
+//! leaving only the Rust intrinsics implementations for SSE4.1 and AVX2. This
+//! removes the dependency on a C compiler/assembler. Library crates should
+//! generally avoid this feature, so that each binary crate is free to make
+//! its own decision about build dependencies.
 //!
 //! [BLAKE3]: https://blake3.io
 //! [Rayon]: https://github.com/rayon-rs/rayon
@@ -63,23 +65,38 @@ pub mod platform;
 mod portable;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 cfg_if::cfg_if! {
-    if #[cfg(feature = "c")] {
-        #[path = "c_sse41.rs"]
-        mod sse41;
-        #[path = "c_avx2.rs"]
-        mod avx2;
-        #[path = "c_avx512.rs"]
-        mod avx512;
-    } else {
+    if #[cfg(feature = "pure")] {
+        // When "pure" is enabled, use only Rust intrinsics. Stable Rust
+        // doesn't currently support AVX-512.
         #[path = "rust_sse41.rs"]
         mod sse41;
         #[path = "rust_avx2.rs"]
         mod avx2;
-        // Stable Rust does not currently support AVX-512.
+    } else if #[cfg(any(target_arch = "x86", feature = "prefer_intrinsics"))] {
+        // When "prefer_intrinsics" is enabled, or on 32-bit x86 (which our
+        // assembly implementations don't support), use Rust intrinsics for
+        // SSE4.1 and AVX2, and use C intrinsics for AVX-512. In this case,
+        // build.rs will compile and link c/blake3_avx512.c.
+        #[path = "rust_sse41.rs"]
+        mod sse41;
+        #[path = "rust_avx2.rs"]
+        mod avx2;
+        #[path = "ffi_avx512.rs"]
+        mod avx512;
+    } else {
+        // Otherwise on x86_64, use assembly implementations for everything. In
+        // this case, build.rs will compile and link all the assembly files for
+        // the target platform (Unix, Windows MSVC, or Windows GNU).
+        #[path = "ffi_sse41.rs"]
+        mod sse41;
+        #[path = "ffi_avx2.rs"]
+        mod avx2;
+        #[path = "ffi_avx512.rs"]
+        mod avx512;
     }
 }
-#[cfg(feature = "c_neon")]
-#[path = "c_neon.rs"]
+#[cfg(feature = "neon")]
+#[path = "ffi_neon.rs"]
 mod neon;
 
 pub mod traits;
diff --git a/src/platform.rs b/src/platform.rs
index 163cbbb..b1b9dad 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -4,13 +4,13 @@ use arrayref::{array_mut_ref, array_ref};
 cfg_if::cfg_if! {
     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
         cfg_if::cfg_if! {
-            if #[cfg(feature = "c")] {
-                pub const MAX_SIMD_DEGREE: usize = 16;
-            } else {
+            if #[cfg(feature = "pure")] {
                 pub const MAX_SIMD_DEGREE: usize = 8;
+            } else {
+                pub const MAX_SIMD_DEGREE: usize = 16;
             }
         }
-    } else if #[cfg(feature = "c_neon")] {
+    } else if #[cfg(feature = "neon")] {
         pub const MAX_SIMD_DEGREE: usize = 4;
     } else {
         pub const MAX_SIMD_DEGREE: usize = 1;
@@ -24,13 +24,13 @@ cfg_if::cfg_if! {
 cfg_if::cfg_if! {
     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
         cfg_if::cfg_if! {
-            if #[cfg(feature = "c")] {
-                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
-            } else {
+            if #[cfg(feature = "pure")] {
                 pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
+            } else {
+                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
             }
         }
-    } else if #[cfg(feature = "c_neon")] {
+    } else if #[cfg(feature = "neon")] {
         pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
     } else {
         pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
@@ -44,10 +44,10 @@ pub enum Platform {
     SSE41,
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     AVX2,
-    #[cfg(feature = "c")]
+    #[cfg(not(feature = "pure"))]
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     AVX512,
-    #[cfg(feature = "c_neon")]
+    #[cfg(feature = "neon")]
     NEON,
 }
 
@@ -56,7 +56,7 @@ impl Platform {
     pub fn detect() -> Self {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
-            #[cfg(feature = "c")]
+            #[cfg(not(feature = "pure"))]
             {
                 if avx512_detected() {
                     return Platform::AVX512;
@@ -69,9 +69,9 @@ impl Platform {
                 return Platform::SSE41;
             }
         }
-        // We don't use dynamic feature detection for NEON. If the "c_neon"
+        // We don't use dynamic feature detection for NEON. If the "neon"
         // feature is on, NEON is assumed to be supported.
-        #[cfg(feature = "c_neon")]
+        #[cfg(feature = "neon")]
         {
             return Platform::NEON;
         }
@@ -85,10 +85,10 @@ impl Platform {
             Platform::SSE41 => 4,
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX2 => 8,
-            #[cfg(feature = "c")]
+            #[cfg(not(feature = "pure"))]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => 16,
-            #[cfg(feature = "c_neon")]
+            #[cfg(feature = "neon")]
             Platform::NEON => 4,
         };
         debug_assert!(degree <= MAX_SIMD_DEGREE);
@@ -111,13 +111,13 @@ impl Platform {
                 crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
             },
             // Safe because detect() checked for platform support.
-            #[cfg(feature = "c")]
+            #[cfg(not(feature = "pure"))]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
                 crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
             },
             // No NEON compress_in_place() implementation yet.
-            #[cfg(feature = "c_neon")]
+            #[cfg(feature = "neon")]
             Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
         }
     }
@@ -138,13 +138,13 @@ impl Platform {
                 crate::sse41::compress_xof(cv, block, block_len, counter, flags)
             },
             // Safe because detect() checked for platform support.
-            #[cfg(feature = "c")]
+            #[cfg(not(feature = "pure"))]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
                 crate::avx512::compress_xof(cv, block, block_len, counter, flags)
             },
             // No NEON compress_xof() implementation yet.
-            #[cfg(feature = "c_neon")]
+            #[cfg(feature = "neon")]
             Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
         }
     }
@@ -210,7 +210,7 @@ impl Platform {
                 )
             },
             // Safe because detect() checked for platform support.
-            #[cfg(feature = "c")]
+            #[cfg(not(feature = "pure"))]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
                 crate::avx512::hash_many(
@@ -224,8 +224,8 @@ impl Platform {
                     out,
                 )
             },
-            // Assumed to be safe if the "c_neon" feature is on.
-            #[cfg(feature = "c_neon")]
+            // Assumed to be safe if the "neon" feature is on.
+            #[cfg(feature = "neon")]
             Platform::NEON => unsafe {
                 crate::neon::hash_many(
                     inputs,
@@ -265,7 +265,7 @@ impl Platform {
         }
     }
 
-    #[cfg(feature = "c")]
+    #[cfg(not(feature = "pure"))]
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     pub fn avx512() -> Option<Self> {
         if avx512_detected() {
@@ -275,17 +275,17 @@ impl Platform {
         }
     }
 
-    #[cfg(feature = "c_neon")]
+    #[cfg(feature = "neon")]
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     pub fn neon() -> Option<Self> {
-        // Assumed to be safe if the "c_neon" feature is on.
+        // Assumed to be safe if the "neon" feature is on.
         Some(Self::NEON)
     }
 }
 
 // Note that AVX-512 is divided into multiple featuresets, and we use two of
 // them, F and VL.
-#[cfg(feature = "c")]
+#[cfg(not(feature = "pure"))]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[inline(always)]
 pub fn avx512_detected() -> bool {
diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml
index 2a90e39..cd74a9d 100644
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@@ -4,10 +4,9 @@ version = "0.0.0"
 edition = "2018"
 
 [features]
-default = []
-c = ["blake3/c"]
-c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"]
-c_neon = ["blake3/c_neon"]
+neon = ["blake3/neon"]
+prefer_intrinsics = ["blake3/prefer_intrinsics"]
+pure = ["blake3/pure"]
 
 [dependencies]
 # If you ever change these path dependencies, you'll probably need to update