factor out the blake3 crate changes from the guts_api branch

This commit and the branch that it starts are unlikely to land as-is, but I want to maintain them while I flesh out the new `blake3_guts` sub-crate.
2024-09-27 01:30:43 +02:00 · 2024-01-21 21:15:45 -08:00 · 2024-01-21 21:15:45 -08:00 · bd160e33a2
commit bd160e33a2
parent 5558fa4623
10 changed files with 339 additions and 825 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -92,8 +92,8 @@ no_neon = []
 features = ["mmap", "rayon", "serde", "zeroize"]

 [dependencies]
-arrayref = "0.3.5"
 arrayvec = { version = "0.7.4", default-features = false }
+blake3_guts = { path = "rust/guts" }
 constant_time_eq = "0.3.0"
 cfg-if = "1.0.0"
 digest = { version = "0.10.1", features = [ "mac" ], optional = true }
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@ -56,12 +56,6 @@ version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"

-[[package]]
-name = "arrayref"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
-
 [[package]]
 name = "arrayvec"
 version = "0.7.4"
@ -98,8 +92,8 @@ checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
 name = "blake3"
 version = "1.5.0"
 dependencies = [
- "arrayref",
 "arrayvec",
+ "blake3_guts",
 "cc",
 "cfg-if",
 "constant_time_eq",
@ -107,6 +101,10 @@ dependencies = [
 "rayon",
 ]

+[[package]]
+name = "blake3_guts"
+version = "0.0.0"
+
 [[package]]
 name = "cc"
 version = "1.0.83"
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@ -186,7 +186,7 @@ fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()>
    // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
    // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF.
    let mut len = args.len();
-    let mut block = [0; blake3::guts::BLOCK_LEN];
+    let mut block = [0; 64];
    while len > 0 {
        output.fill(&mut block);
        let hex_str = hex::encode(&block[..]);
--- a/benches/bench.rs
+++ b/benches/bench.rs
@ -2,11 +2,7 @@

 extern crate test;

-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
-use blake3::platform::{Platform, MAX_SIMD_DEGREE};
-use blake3::OUT_LEN;
+use blake3_guts::BLOCK_LEN;
 use rand::prelude::*;
 use test::Bencher;

@ -49,175 +45,6 @@ impl RandomInput {
    }
 }

-fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
-    let mut state = [1u32; 8];
-    let mut r = RandomInput::new(b, 64);
-    let input = array_ref!(r.get(), 0, 64);
-    b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
-}
-
-#[bench]
-fn bench_single_compression_portable(b: &mut Bencher) {
-    bench_single_compression_fn(b, Platform::portable());
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_single_compression_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
-    let degree = platform.simd_degree();
-    let mut inputs = Vec::new();
-    for _ in 0..degree {
-        inputs.push(RandomInput::new(b, CHUNK_LEN));
-    }
-    b.iter(|| {
-        let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs
-            .iter_mut()
-            .take(degree)
-            .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
-            .collect();
-        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-        platform.hash_many(
-            &input_arrays[..],
-            &[0; 8],
-            0,
-            blake3::IncrementCounter::Yes,
-            0,
-            0,
-            0,
-            &mut out,
-        );
-    });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_avx2(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx2() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_chunks_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_chunks_neon(b: &mut Bencher) {
-    if let Some(platform) = Platform::neon() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
-    let degree = platform.simd_degree();
-    let mut inputs = Vec::new();
-    for _ in 0..degree {
-        inputs.push(RandomInput::new(b, BLOCK_LEN));
-    }
-    b.iter(|| {
-        let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs
-            .iter_mut()
-            .take(degree)
-            .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
-            .collect();
-        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-        platform.hash_many(
-            &input_arrays[..],
-            &[0; 8],
-            0,
-            blake3::IncrementCounter::No,
-            0,
-            0,
-            0,
-            &mut out,
-        );
-    });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_avx2(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx2() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_parents_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_parents_neon(b: &mut Bencher) {
-    if let Some(platform) = Platform::neon() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
 fn bench_atonce(b: &mut Bencher, len: usize) {
    let mut input = RandomInput::new(b, len);
    b.iter(|| blake3::hash(input.get()));
--- a/src/lib.rs
+++ b/src/lib.rs
@ -88,109 +88,29 @@
 #[cfg(test)]
 mod test;

-// The guts module is for incremental use cases like the `bao` crate that need
-// to explicitly compute chunk and parent chaining values. It is semi-stable
-// and likely to keep working, but largely undocumented and not intended for
-// widespread use.
-#[doc(hidden)]
-pub mod guts;
-
-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-pub mod platform;
-
-// Platform-specific implementations of the compression function. These
-// BLAKE3-specific cfg flags are set in build.rs.
-#[cfg(blake3_avx2_rust)]
-#[path = "rust_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx2_ffi)]
-#[path = "ffi_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx512_ffi)]
-#[path = "ffi_avx512.rs"]
-mod avx512;
-#[cfg(blake3_neon)]
-#[path = "ffi_neon.rs"]
-mod neon;
-mod portable;
-#[cfg(blake3_sse2_rust)]
-#[path = "rust_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse2_ffi)]
-#[path = "ffi_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse41_rust)]
-#[path = "rust_sse41.rs"]
-mod sse41;
-#[cfg(blake3_sse41_ffi)]
-#[path = "ffi_sse41.rs"]
-mod sse41;
-
 #[cfg(feature = "traits-preview")]
 pub mod traits;

 mod io;
 mod join;

-use arrayref::{array_mut_ref, array_ref};
 use arrayvec::{ArrayString, ArrayVec};
 use core::cmp;
 use core::fmt;
-use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};

-/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
+use blake3_guts as guts;
+use guts::{
+    BlockBytes, CVBytes, BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT,
+    DERIVE_KEY_MATERIAL, IV_BYTES, KEYED_HASH, PARENT, ROOT,
+};
+
+/// The number of bytes in a [`Hash`](struct.Hash.html), 32
 pub const OUT_LEN: usize = 32;

-/// The number of bytes in a key, 32.
+/// The number of bytes in a key, 32
 pub const KEY_LEN: usize = 32;

 const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
-use guts::{BLOCK_LEN, CHUNK_LEN};
-
-// While iterating the compression function within a chunk, the CV is
-// represented as words, to avoid doing two extra endianness conversions for
-// each compression in the portable implementation. But the hash_many interface
-// needs to hash both input bytes and parent nodes, so its better for its
-// output CVs to be represented as bytes.
-type CVWords = [u32; 8];
-type CVBytes = [u8; 32]; // little-endian
-
-const IV: &CVWords = &[
-    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
-];
-
-const MSG_SCHEDULE: [[usize; 16]; 7] = [
-    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-    [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
-    [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
-    [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
-    [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
-    [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
-    [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
-];
-
-// These are the internal flags that we use to domain separate root/non-root,
-// chunk/parent, and chunk beginning/middle/end. These get set at the high end
-// of the block flags word in the compression function, so their values start
-// high and go down.
-const CHUNK_START: u8 = 1 << 0;
-const CHUNK_END: u8 = 1 << 1;
-const PARENT: u8 = 1 << 2;
-const ROOT: u8 = 1 << 3;
-const KEYED_HASH: u8 = 1 << 4;
-const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
-const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
-
-#[inline]
-fn counter_low(counter: u64) -> u32 {
-    counter as u32
-}
-
-#[inline]
-fn counter_high(counter: u64) -> u32 {
-    (counter >> 32) as u32
-}

 /// An output of the default size, 32 bytes, which provides constant-time
 /// equality checking.
@ -219,19 +139,19 @@ fn counter_high(counter: u64) -> u32 {
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
 #[derive(Clone, Copy, Hash)]
-pub struct Hash([u8; OUT_LEN]);
+pub struct Hash(CVBytes);

 impl Hash {
    /// The raw bytes of the `Hash`. Note that byte arrays don't provide
    /// constant-time equality checking, so if  you need to compare hashes,
    /// prefer the `Hash` type.
    #[inline]
-    pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
+    pub const fn as_bytes(&self) -> &CVBytes {
        &self.0
    }

    /// Create a `Hash` from its raw bytes representation.
-    pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
+    pub const fn from_bytes(bytes: CVBytes) -> Self {
        Self(bytes)
    }

@ -275,7 +195,7 @@ impl Hash {
        if hex_bytes.len() != OUT_LEN * 2 {
            return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
        }
-        let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN];
+        let mut hash_bytes: CVBytes = [0; OUT_LEN];
        for i in 0..OUT_LEN {
            hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?;
        }
@ -283,14 +203,14 @@ impl Hash {
    }
 }

-impl From<[u8; OUT_LEN]> for Hash {
+impl From<CVBytes> for Hash {
    #[inline]
-    fn from(bytes: [u8; OUT_LEN]) -> Self {
+    fn from(bytes: CVBytes) -> Self {
        Self::from_bytes(bytes)
    }
 }

-impl From<Hash> for [u8; OUT_LEN] {
+impl From<Hash> for CVBytes {
    #[inline]
    fn from(hash: Hash) -> Self {
        hash.0
@ -314,9 +234,9 @@ impl PartialEq for Hash {
 }

 /// This implementation is constant-time.
-impl PartialEq<[u8; OUT_LEN]> for Hash {
+impl PartialEq<CVBytes> for Hash {
    #[inline]
-    fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
+    fn eq(&self, other: &CVBytes) -> bool {
        constant_time_eq::constant_time_eq_32(&self.0, other)
    }
 }
@ -395,70 +315,56 @@ impl std::error::Error for HexError {}
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 #[derive(Clone)]
 struct Output {
-    input_chaining_value: CVWords,
-    block: [u8; 64],
+    input_chaining_value: CVBytes,
+    block: BlockBytes,
    block_len: u8,
    counter: u64,
    flags: u8,
-    #[cfg_attr(feature = "zeroize", zeroize(skip))]
-    platform: Platform,
 }

 impl Output {
    fn chaining_value(&self) -> CVBytes {
-        let mut cv = self.input_chaining_value;
-        self.platform.compress_in_place(
-            &mut cv,
+        guts::DETECTED_IMPL.compress(
            &self.block,
-            self.block_len,
+            self.block_len as u32,
+            &self.input_chaining_value,
            self.counter,
-            self.flags,
-        );
-        platform::le_bytes_from_words_32(&cv)
+            self.flags as u32,
+        )
    }

    fn root_hash(&self) -> Hash {
        debug_assert_eq!(self.counter, 0);
-        let mut cv = self.input_chaining_value;
-        self.platform
-            .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
-        Hash(platform::le_bytes_from_words_32(&cv))
-    }
-
-    fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
-        self.platform.compress_xof(
-            &self.input_chaining_value,
+        Hash(guts::DETECTED_IMPL.compress(
            &self.block,
-            self.block_len,
-            self.counter,
-            self.flags | ROOT,
-        )
+            self.block_len as u32,
+            &self.input_chaining_value,
+            0,
+            self.flags as u32 | ROOT,
+        ))
    }
 }

 #[derive(Clone)]
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 struct ChunkState {
-    cv: CVWords,
+    cv: CVBytes,
    chunk_counter: u64,
-    buf: [u8; BLOCK_LEN],
+    buf: BlockBytes,
    buf_len: u8,
    blocks_compressed: u8,
    flags: u8,
-    #[cfg_attr(feature = "zeroize", zeroize(skip))]
-    platform: Platform,
 }

 impl ChunkState {
-    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
+    fn new(key: &CVBytes, chunk_counter: u64, flags: u32) -> Self {
        Self {
            cv: *key,
            chunk_counter,
            buf: [0; BLOCK_LEN],
            buf_len: 0,
            blocks_compressed: 0,
-            flags,
-            platform,
+            flags: flags as u8,
        }
    }

@ -474,7 +380,7 @@ impl ChunkState {
        *input = &input[take..];
    }

-    fn start_flag(&self) -> u8 {
+    fn start_flag(&self) -> u32 {
        if self.blocks_compressed == 0 {
            CHUNK_START
        } else {
@ -489,13 +395,12 @@ impl ChunkState {
            self.fill_buf(&mut input);
            if !input.is_empty() {
                debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
-                let block_flags = self.flags | self.start_flag(); // borrowck
-                self.platform.compress_in_place(
-                    &mut self.cv,
+                self.cv = guts::DETECTED_IMPL.compress(
                    &self.buf,
-                    BLOCK_LEN as u8,
+                    BLOCK_LEN as u32,
+                    &self.cv,
                    self.chunk_counter,
-                    block_flags,
+                    self.flags as u32 | self.start_flag(),
                );
                self.buf_len = 0;
                self.buf = [0; BLOCK_LEN];
@ -505,13 +410,12 @@ impl ChunkState {

        while input.len() > BLOCK_LEN {
            debug_assert_eq!(self.buf_len, 0);
-            let block_flags = self.flags | self.start_flag(); // borrowck
-            self.platform.compress_in_place(
-                &mut self.cv,
-                array_ref!(input, 0, BLOCK_LEN),
-                BLOCK_LEN as u8,
+            self.cv = guts::DETECTED_IMPL.compress(
+                input[..BLOCK_LEN].try_into().unwrap(),
+                BLOCK_LEN as u32,
+                &self.cv,
                self.chunk_counter,
-                block_flags,
+                self.flags as u32 | self.start_flag(),
            );
            self.blocks_compressed += 1;
            input = &input[BLOCK_LEN..];
@ -524,14 +428,12 @@ impl ChunkState {
    }

    fn output(&self) -> Output {
-        let block_flags = self.flags | self.start_flag() | CHUNK_END;
        Output {
            input_chaining_value: self.cv,
            block: self.buf,
            block_len: self.buf_len,
            counter: self.chunk_counter,
-            flags: block_flags,
-            platform: self.platform,
+            flags: self.flags | self.start_flag() as u8 | CHUNK_END as u8,
        }
    }
 }
@ -543,7 +445,6 @@ impl fmt::Debug for ChunkState {
            .field("len", &self.len())
            .field("chunk_counter", &self.chunk_counter)
            .field("flags", &self.flags)
-            .field("platform", &self.platform)
            .finish()
    }
 }
@ -563,131 +464,6 @@ impl fmt::Debug for ChunkState {
 //   use full-width SIMD vectors for parent hashing. Without parallel parent
 //   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.

-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-#[derive(Clone, Copy)]
-pub enum IncrementCounter {
-    Yes,
-    No,
-}
-
-impl IncrementCounter {
-    #[inline]
-    fn yes(&self) -> bool {
-        match self {
-            IncrementCounter::Yes => true,
-            IncrementCounter::No => false,
-        }
-    }
-}
-
-// The largest power of two less than or equal to `n`, used for left_len()
-// immediately below, and also directly in Hasher::update().
-fn largest_power_of_two_leq(n: usize) -> usize {
-    ((n / 2) + 1).next_power_of_two()
-}
-
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-fn left_len(content_len: usize) -> usize {
-    debug_assert!(content_len > CHUNK_LEN);
-    // Subtract 1 to reserve at least one byte for the right side.
-    let full_chunks = (content_len - 1) / CHUNK_LEN;
-    largest_power_of_two_leq(full_chunks) * CHUNK_LEN
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
-// on a single thread. Write out the chunk chaining values and return the
-// number of chunks hashed. These chunks are never the root and never empty;
-// those cases use a different codepath.
-fn compress_chunks_parallel(
-    input: &[u8],
-    key: &CVWords,
-    chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
-) -> usize {
-    debug_assert!(!input.is_empty(), "empty chunks below the root");
-    debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
-
-    let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
-    let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
-    for chunk in &mut chunks_exact {
-        chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
-    }
-    platform.hash_many(
-        &chunks_array,
-        key,
-        chunk_counter,
-        IncrementCounter::Yes,
-        flags,
-        CHUNK_START,
-        CHUNK_END,
-        out,
-    );
-
-    // Hash the remaining partial chunk, if there is one. Note that the empty
-    // chunk (meaning the empty message) is a different codepath.
-    let chunks_so_far = chunks_array.len();
-    if !chunks_exact.remainder().is_empty() {
-        let counter = chunk_counter + chunks_so_far as u64;
-        let mut chunk_state = ChunkState::new(key, counter, flags, platform);
-        chunk_state.update(chunks_exact.remainder());
-        *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
-            chunk_state.output().chaining_value();
-        chunks_so_far + 1
-    } else {
-        chunks_so_far
-    }
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
-// on a single thread. Write out the parent chaining values and return the
-// number of parents hashed. (If there's an odd input chaining value left over,
-// return it as an additional output.) These parents are never the root and
-// never empty; those cases use a different codepath.
-fn compress_parents_parallel(
-    child_chaining_values: &[u8],
-    key: &CVWords,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
-) -> usize {
-    debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes");
-    let num_children = child_chaining_values.len() / OUT_LEN;
-    debug_assert!(num_children >= 2, "not enough children");
-    debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many");
-
-    let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
-    // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
-    // the requirements of compress_subtree_wide().
-    let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new();
-    for parent in &mut parents_exact {
-        parents_array.push(array_ref!(parent, 0, BLOCK_LEN));
-    }
-    platform.hash_many(
-        &parents_array,
-        key,
-        0, // Parents always use counter 0.
-        IncrementCounter::No,
-        flags | PARENT,
-        0, // Parents have no start flags.
-        0, // Parents have no end flags.
-        out,
-    );
-
-    // If there's an odd child left over, it becomes an output.
-    let parents_so_far = parents_array.len();
-    if !parents_exact.remainder().is_empty() {
-        out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
-        parents_so_far + 1
-    } else {
-        parents_so_far
-    }
-}
-
 // The wide helper function returns (writes out) an array of chaining values
 // and returns the length of that array. The number of chaining values returned
 // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
@ -707,66 +483,41 @@ fn compress_parents_parallel(
 // multithreading parallelism for that update().
 fn compress_subtree_wide<J: join::Join>(
    input: &[u8],
-    key: &CVWords,
+    key: &CVBytes,
    chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
+    flags: u32,
+    out: guts::TransposedSplit,
 ) -> usize {
    // Note that the single chunk case does *not* bump the SIMD degree up to 2
    // when it is 1. This allows Rayon the option of multithreading even the
    // 2-chunk case, which can help performance on smaller platforms.
-    if input.len() <= platform.simd_degree() * CHUNK_LEN {
-        return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
+    let degree = guts::DETECTED_IMPL.degree();
+    if input.len() <= degree * CHUNK_LEN {
+        return guts::DETECTED_IMPL.hash_chunks(input, key, chunk_counter, flags, out);
    }

    // With more than simd_degree chunks, we need to recurse. Start by dividing
    // the input into left and right subtrees. (Note that this is only optimal
    // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
    // of 3 or something, we'll need a more complicated strategy.)
-    debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
-    let (left, right) = input.split_at(left_len(input.len()));
+    debug_assert_eq!(degree.count_ones(), 1, "power of 2");
+    let (left, right) = input.split_at(guts::left_len(input.len()));
    let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;

-    // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
-    // account for the special case of returning 2 outputs when the SIMD degree
-    // is 1.
-    let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
-    let degree = if left.len() == CHUNK_LEN {
-        // The "simd_degree=1 and we're at the leaf nodes" case.
-        debug_assert_eq!(platform.simd_degree(), 1);
-        1
-    } else {
-        cmp::max(platform.simd_degree(), 2)
-    };
-    let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);
+    let mut transposed_cvs = guts::TransposedVectors::new();
+    let (left_cvs, right_cvs) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);

    // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple
    // threads.
    let (left_n, right_n) = J::join(
-        || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
-        || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
+        || compress_subtree_wide::<J>(left, key, chunk_counter, flags, left_cvs),
+        || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, right_cvs),
    );

-    // The special case again. If simd_degree=1, then we'll have left_n=1 and
-    // right_n=1. Rather than compressing them into a single output, return
-    // them directly, to make sure we always have at least two outputs.
-    debug_assert_eq!(left_n, degree);
-    debug_assert!(right_n >= 1 && right_n <= left_n);
-    if left_n == 1 {
-        out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]);
-        return 2;
-    }
-
-    // Otherwise, do one layer of parent node compression.
-    let num_children = left_n + right_n;
-    compress_parents_parallel(
-        &cv_array[..num_children * OUT_LEN],
-        key,
-        flags,
-        platform,
-        out,
-    )
+    // Do one layer of parent node compression. The SIMD degree is always at least 2, so we're
+    // guaranteed that this isn't the root compression.
+    let num_cvs = left_n + right_n;
+    guts::DETECTED_IMPL.hash_parents(&mut transposed_cvs, num_cvs, key, flags, out)
 }

 // Hash a subtree with compress_subtree_wide(), and then condense the resulting
@ -781,50 +532,41 @@ fn compress_subtree_wide<J: join::Join>(
 // chunk or less. That's a different codepath.
 fn compress_subtree_to_parent_node<J: join::Join>(
    input: &[u8],
-    key: &CVWords,
+    key: &CVBytes,
    chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-) -> [u8; BLOCK_LEN] {
+    flags: u32,
+) -> BlockBytes {
    debug_assert!(input.len() > CHUNK_LEN);
-    let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
-    let mut num_cvs =
-        compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
+    let mut transposed_cvs = guts::TransposedVectors::new();
+    let (left_cvs, _) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);
+    let mut num_cvs = compress_subtree_wide::<J>(input, &key, chunk_counter, flags, left_cvs);
    debug_assert!(num_cvs >= 2);

    // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
    // compress_subtree_wide() returns more than 2 chaining values. Condense
    // them into 2 by forming parent nodes repeatedly.
-    let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2];
    while num_cvs > 2 {
-        let cv_slice = &cv_array[..num_cvs * OUT_LEN];
-        num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array);
-        cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
+        num_cvs = guts::DETECTED_IMPL.reduce_parents(&mut transposed_cvs, num_cvs, key, flags);
    }
-    *array_ref!(cv_array, 0, 2 * OUT_LEN)
+    transposed_cvs.extract_parent_node(0)
 }

 // Hash a complete input all at once. Unlike compress_subtree_wide() and
 // compress_subtree_to_parent_node(), this function handles the 1 chunk case.
-fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
-    let platform = Platform::detect();
-
+fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVBytes, flags: u32) -> Output {
    // If the whole subtree is one chunk, hash it directly with a ChunkState.
    if input.len() <= CHUNK_LEN {
-        return ChunkState::new(key, 0, flags, platform)
-            .update(input)
-            .output();
+        return ChunkState::new(key, 0, flags).update(input).output();
    }

    // Otherwise construct an Output object from the parent node returned by
    // compress_subtree_to_parent_node().
    Output {
        input_chaining_value: *key,
-        block: compress_subtree_to_parent_node::<J>(input, key, 0, flags, platform),
+        block: compress_subtree_to_parent_node::<J>(input, key, 0, flags),
        block_len: BLOCK_LEN as u8,
        counter: 0,
-        flags: flags | PARENT,
-        platform,
+        flags: flags as u8 | PARENT as u8,
    }
 }

@ -839,7 +581,7 @@ fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Ou
 /// This function is always single-threaded. For multithreading support, see
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
 pub fn hash(input: &[u8]) -> Hash {
-    hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
+    hash_all_at_once::<join::SerialJoin>(input, &IV_BYTES, 0).root_hash()
 }

 /// The keyed hash function.
@ -856,9 +598,8 @@ pub fn hash(input: &[u8]) -> Hash {
 /// This function is always single-threaded. For multithreading support, see
 /// [`Hasher::new_keyed`] and
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
-pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
-    let key_words = platform::words_from_le_bytes_32(key);
-    hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
+pub fn keyed_hash(key: &CVBytes, input: &[u8]) -> Hash {
+    hash_all_at_once::<join::SerialJoin>(input, key, KEYED_HASH).root_hash()
 }

 /// The key derivation function.
@ -896,12 +637,11 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
 ///
 /// [Argon2]: https://en.wikipedia.org/wiki/Argon2
-pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
+pub fn derive_key(context: &str, key_material: &[u8]) -> CVBytes {
    let context_key =
-        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
            .root_hash();
-    let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
-    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+    hash_all_at_once::<join::SerialJoin>(key_material, context_key.as_bytes(), DERIVE_KEY_MATERIAL)
        .root_hash()
        .0
 }
@ -909,9 +649,8 @@ pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
 fn parent_node_output(
    left_child: &CVBytes,
    right_child: &CVBytes,
-    key: &CVWords,
-    flags: u8,
-    platform: Platform,
+    key: &CVBytes,
+    flags: u32,
 ) -> Output {
    let mut block = [0; BLOCK_LEN];
    block[..32].copy_from_slice(left_child);
@ -921,8 +660,7 @@ fn parent_node_output(
        block,
        block_len: BLOCK_LEN as u8,
        counter: 0,
-        flags: flags | PARENT,
-        platform,
+        flags: (flags | PARENT) as u8,
    }
 }

@ -963,7 +701,7 @@ fn parent_node_output(
 #[derive(Clone)]
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 pub struct Hasher {
-    key: CVWords,
+    key: CVBytes,
    chunk_state: ChunkState,
    // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
    // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
@ -974,26 +712,25 @@ pub struct Hasher {
 }

 impl Hasher {
-    fn new_internal(key: &CVWords, flags: u8) -> Self {
+    fn new_internal(key: &CVBytes, flags: u32) -> Self {
        Self {
            key: *key,
-            chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+            chunk_state: ChunkState::new(key, 0, flags),
            cv_stack: ArrayVec::new(),
        }
    }

    /// Construct a new `Hasher` for the regular hash function.
    pub fn new() -> Self {
-        Self::new_internal(IV, 0)
+        Self::new_internal(&IV_BYTES, 0)
    }

    /// Construct a new `Hasher` for the keyed hash function. See
    /// [`keyed_hash`].
    ///
    /// [`keyed_hash`]: fn.keyed_hash.html
-    pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
-        let key_words = platform::words_from_le_bytes_32(key);
-        Self::new_internal(&key_words, KEYED_HASH)
+    pub fn new_keyed(key: &CVBytes) -> Self {
+        Self::new_internal(key, KEYED_HASH)
    }

    /// Construct a new `Hasher` for the key derivation function. See
@ -1003,10 +740,9 @@ impl Hasher {
    /// [`derive_key`]: fn.derive_key.html
    pub fn new_derive_key(context: &str) -> Self {
        let context_key =
-            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
                .root_hash();
-        let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
-        Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
+        Self::new_internal(context_key.as_bytes(), DERIVE_KEY_MATERIAL)
    }

    /// Reset the `Hasher` to its initial state.
@ -1014,12 +750,7 @@ impl Hasher {
    /// This is functionally the same as overwriting the `Hasher` with a new
    /// one, using the same key or context string if any.
    pub fn reset(&mut self) -> &mut Self {
-        self.chunk_state = ChunkState::new(
-            &self.key,
-            0,
-            self.chunk_state.flags,
-            self.chunk_state.platform,
-        );
+        self.chunk_state = ChunkState::new(&self.key, 0, self.chunk_state.flags as u32);
        self.cv_stack.clear();
        self
    }
@ -1044,8 +775,7 @@ impl Hasher {
                &left_child,
                &right_child,
                &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
            );
            self.cv_stack.push(parent_output.chaining_value());
        }
@ -1118,8 +848,7 @@ impl Hasher {
                self.chunk_state = ChunkState::new(
                    &self.key,
                    self.chunk_state.chunk_counter + 1,
-                    self.chunk_state.flags,
-                    self.chunk_state.platform,
+                    self.chunk_state.flags as u32,
                );
            } else {
                return self;
@ -1142,7 +871,7 @@ impl Hasher {
        while input.len() > CHUNK_LEN {
            debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
            debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
-            let mut subtree_len = largest_power_of_two_leq(input.len());
+            let mut subtree_len = guts::largest_power_of_two_leq(input.len());
            let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
            // Shrink the subtree_len until it evenly divides the count so far.
            // We know that subtree_len itself is a power of 2, so we can use a
@ -1174,8 +903,7 @@ impl Hasher {
                    &ChunkState::new(
                        &self.key,
                        self.chunk_state.chunk_counter,
-                        self.chunk_state.flags,
-                        self.chunk_state.platform,
+                        self.chunk_state.flags as u32,
                    )
                    .update(&input[..subtree_len])
                    .output()
@ -1189,11 +917,10 @@ impl Hasher {
                    &input[..subtree_len],
                    &self.key,
                    self.chunk_state.chunk_counter,
-                    self.chunk_state.flags,
-                    self.chunk_state.platform,
+                    self.chunk_state.flags as u32,
                );
-                let left_cv = array_ref!(cv_pair, 0, 32);
-                let right_cv = array_ref!(cv_pair, 32, 32);
+                let left_cv = cv_pair[..32].try_into().unwrap();
+                let right_cv = cv_pair[32..].try_into().unwrap();
                // Push the two CVs we received into the CV stack in order. Because
                // the stack merges lazily, this guarantees we aren't merging the
                // root.
@ -1256,8 +983,7 @@ impl Hasher {
                &self.cv_stack[num_cvs_remaining - 2],
                &self.cv_stack[num_cvs_remaining - 1],
                &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
            );
            num_cvs_remaining -= 2;
        }
@ -1266,8 +992,7 @@ impl Hasher {
                &self.cv_stack[num_cvs_remaining - 1],
                &output.chaining_value(),
                &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
            );
            num_cvs_remaining -= 1;
        }
@ -1481,7 +1206,6 @@ impl fmt::Debug for Hasher {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Hasher")
            .field("flags", &self.chunk_state.flags)
-            .field("platform", &self.chunk_state.platform)
            .finish()
    }
 }
@ -1546,6 +1270,62 @@ impl OutputReader {
        }
    }

+    // There's some nontrivial logic here to handle partial blocks, and I don't want to copy-paste
+    // it between the xof and xof_xor cases.
+    #[inline(always)]
+    fn fill_inner(&mut self, mut buf: &mut [u8], xor: bool) {
+        debug_assert!(self.position_within_block < BLOCK_LEN as u8);
+        let xof_fn = if xor {
+            guts::Implementation::xof_xor
+        } else {
+            guts::Implementation::xof
+        };
+        if self.position_within_block != 0 {
+            // The xof() and xof_xor() APIs can handle a partial block at the end but not a partial
+            // block at the beginning. We handle the beginning case here. Start by computing the
+            // complete block that we need part of.
+            let mut partial_block = [0u8; 64];
+            xof_fn(
+                &guts::DETECTED_IMPL,
+                &self.inner.block,
+                self.inner.block_len as u32,
+                &self.inner.input_chaining_value,
+                self.inner.counter,
+                self.inner.flags as u32,
+                &mut partial_block,
+            );
+            let output_bytes = &partial_block[self.position_within_block as usize..];
+            let take = cmp::min(buf.len(), output_bytes.len());
+            if xor {
+                for byte_index in 0..take {
+                    buf[byte_index] ^= output_bytes[byte_index];
+                }
+            } else {
+                buf[..take].copy_from_slice(&output_bytes[..take]);
+            }
+            buf = &mut buf[take..];
+            self.position_within_block += take as u8;
+            if self.position_within_block == BLOCK_LEN as u8 {
+                self.position_within_block = 0;
+                self.inner.counter += 1;
+            } else {
+                debug_assert!(buf.is_empty());
+                return;
+            }
+        }
+        xof_fn(
+            &guts::DETECTED_IMPL,
+            &self.inner.block,
+            self.inner.block_len as u32,
+            &self.inner.input_chaining_value,
+            self.inner.counter,
+            self.inner.flags as u32,
+            buf,
+        );
+        self.inner.counter += (buf.len() / BLOCK_LEN) as u64;
+        self.position_within_block = (buf.len() % BLOCK_LEN) as u8;
+    }
+
    /// Fill a buffer with output bytes and advance the position of the
    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
    /// doesn't return a `Result`. Both methods always fill the entire buffer.
@ -1561,19 +1341,12 @@ impl OutputReader {
    /// reading further, the behavior is unspecified.
    ///
    /// [`Read::read`]: #method.read
-    pub fn fill(&mut self, mut buf: &mut [u8]) {
-        while !buf.is_empty() {
-            let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
-            let output_bytes = &block[self.position_within_block as usize..];
-            let take = cmp::min(buf.len(), output_bytes.len());
-            buf[..take].copy_from_slice(&output_bytes[..take]);
-            buf = &mut buf[take..];
-            self.position_within_block += take as u8;
-            if self.position_within_block == BLOCK_LEN as u8 {
-                self.inner.counter += 1;
-                self.position_within_block = 0;
-            }
-        }
+    pub fn fill(&mut self, buf: &mut [u8]) {
+        self.fill_inner(buf, false);
+    }
+
+    pub fn fill_xor(&mut self, buf: &mut [u8]) {
+        self.fill_inner(buf, true);
    }

    /// Return the current read position in the output stream. This is
--- a/src/portable.rs
+++ b/src/portable.rs
@ -181,10 +181,10 @@ pub fn hash_many<const N: usize>(
 pub mod test {
    use super::*;

-    // This is basically testing the portable implementation against itself,
-    // but it also checks that compress_in_place and compress_xof are
-    // consistent. And there are tests against the reference implementation and
-    // against hardcoded test vectors elsewhere.
+    // These are basically testing the portable implementation against itself, but we also check
+    // that compress_in_place and compress_xof are consistent. And there are tests against the
+    // reference implementation and against hardcoded test vectors elsewhere.
+
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
--- a/src/test.rs
+++ b/src/test.rs
@ -1,6 +1,6 @@
-use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
+use blake3_guts as guts;
+use guts::{CVBytes, CVWords, BLOCK_LEN, CHUNK_LEN};
+
 use core::usize;
 use rand::prelude::*;

@ -46,172 +46,12 @@ pub const TEST_CASES: &[usize] = &[
 pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;

 // There's a test to make sure these two are equal below.
-pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
-pub const TEST_KEY_WORDS: CVWords = [
-    1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
-];
-
-// Paint the input with a repeating byte pattern. We use a cycle length of 251,
-// because that's the largest prime number less than 256. This makes it
-// unlikely to swapping any two adjacent input blocks or chunks will give the
-// same answer.
-pub fn paint_test_input(buf: &mut [u8]) {
-    for (i, b) in buf.iter_mut().enumerate() {
-        *b = (i % 251) as u8;
-    }
-}
-
-type CompressInPlaceFn =
-    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
-
-type CompressXofFn = unsafe fn(
-    cv: &CVWords,
-    block: &[u8; BLOCK_LEN],
-    block_len: u8,
-    counter: u64,
-    flags: u8,
-) -> [u8; 64];
-
-// A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
-    let initial_state = TEST_KEY_WORDS;
-    let block_len: u8 = 61;
-    let mut block = [0; BLOCK_LEN];
-    paint_test_input(&mut block[..block_len as usize]);
-    // Use a counter with set bits in both 32-bit words.
-    let counter = (5u64 << 32) + 6;
-    let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
-
-    let portable_out =
-        crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
-
-    let mut test_state = initial_state;
-    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
-    let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
-    let test_xof =
-        unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
-
-    assert_eq!(&portable_out[..32], &test_state_bytes[..]);
-    assert_eq!(&portable_out[..], &test_xof[..]);
-}
-
-type HashManyFn<A> = unsafe fn(
-    inputs: &[&A],
-    key: &CVWords,
-    counter: u64,
-    increment_counter: IncrementCounter,
-    flags: u8,
-    flags_start: u8,
-    flags_end: u8,
-    out: &mut [u8],
-);
-
-// A shared helper function for platform-specific tests.
-pub fn test_hash_many_fn(
-    hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>,
-    hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>,
-) {
-    // Test a few different initial counter values.
-    // - 0: The base case.
-    // - u32::MAX: The low word of the counter overflows for all inputs except the first.
-    // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
-    //   when you're supposed to ANDNOT...
-    let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
-    for counter in initial_counters {
-        #[cfg(feature = "std")]
-        dbg!(counter);
-
-        // 31 (16 + 8 + 4 + 2 + 1) inputs
-        const NUM_INPUTS: usize = 31;
-        let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
-        crate::test::paint_test_input(&mut input_buf);
-
-        // First hash chunks.
-        let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new();
-        for i in 0..NUM_INPUTS {
-            chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
-        }
-        let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
-        crate::portable::hash_many(
-            &chunks,
-            &TEST_KEY_WORDS,
-            counter,
-            IncrementCounter::Yes,
-            crate::KEYED_HASH,
-            crate::CHUNK_START,
-            crate::CHUNK_END,
-            &mut portable_chunks_out,
-        );
-
-        let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
-        unsafe {
-            hash_many_chunks_fn(
-                &chunks[..],
-                &TEST_KEY_WORDS,
-                counter,
-                IncrementCounter::Yes,
-                crate::KEYED_HASH,
-                crate::CHUNK_START,
-                crate::CHUNK_END,
-                &mut test_chunks_out,
-            );
-        }
-        for n in 0..NUM_INPUTS {
-            #[cfg(feature = "std")]
-            dbg!(n);
-            assert_eq!(
-                &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
-                &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
-            );
-        }
-
-        // Then hash parents.
-        let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new();
-        for i in 0..NUM_INPUTS {
-            parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
-        }
-        let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
-        crate::portable::hash_many(
-            &parents,
-            &TEST_KEY_WORDS,
-            counter,
-            IncrementCounter::No,
-            crate::KEYED_HASH | crate::PARENT,
-            0,
-            0,
-            &mut portable_parents_out,
-        );
-
-        let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
-        unsafe {
-            hash_many_parents_fn(
-                &parents[..],
-                &TEST_KEY_WORDS,
-                counter,
-                IncrementCounter::No,
-                crate::KEYED_HASH | crate::PARENT,
-                0,
-                0,
-                &mut test_parents_out,
-            );
-        }
-        for n in 0..NUM_INPUTS {
-            #[cfg(feature = "std")]
-            dbg!(n);
-            assert_eq!(
-                &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
-                &test_parents_out[n * OUT_LEN..][..OUT_LEN]
-            );
-        }
-    }
-}
+pub const TEST_KEY: &CVBytes = b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: &CVWords = &guts::words_from_le_bytes_32(TEST_KEY);

 #[test]
 fn test_key_bytes_equal_key_words() {
-    assert_eq!(
-        TEST_KEY_WORDS,
-        crate::platform::words_from_le_bytes_32(&TEST_KEY),
-    );
+    assert_eq!(TEST_KEY, &guts::le_bytes_from_words_32(TEST_KEY_WORDS),);
 }

 #[test]
@ -224,52 +64,9 @@ fn test_reference_impl_size() {
    assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>());
 }

-#[test]
-fn test_counter_words() {
-    let counter: u64 = (1 << 32) + 2;
-    assert_eq!(crate::counter_low(counter), 2);
-    assert_eq!(crate::counter_high(counter), 1);
-}
-
-#[test]
-fn test_largest_power_of_two_leq() {
-    let input_output = &[
-        // The zero case is nonsensical, but it does work.
-        (0, 1),
-        (1, 1),
-        (2, 2),
-        (3, 2),
-        (4, 4),
-        (5, 4),
-        (6, 4),
-        (7, 4),
-        (8, 8),
-        // the largest possible usize
-        (usize::MAX, (usize::MAX >> 1) + 1),
-    ];
-    for &(input, output) in input_output {
-        assert_eq!(
-            output,
-            crate::largest_power_of_two_leq(input),
-            "wrong output for n={}",
-            input
-        );
-    }
-}
-
-#[test]
-fn test_left_len() {
-    let input_output = &[
-        (CHUNK_LEN + 1, CHUNK_LEN),
-        (2 * CHUNK_LEN - 1, CHUNK_LEN),
-        (2 * CHUNK_LEN, CHUNK_LEN),
-        (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
-    ];
-    for &(input, output) in input_output {
-        assert_eq!(crate::left_len(input), output);
+pub(crate) fn paint_test_input(buf: &mut [u8]) {
+    for (i, b) in buf.iter_mut().enumerate() {
+        *b = (i % 251) as u8;
    }
 }

@ -292,18 +89,18 @@ fn test_compare_reference_impl() {

            // all at once
            let test_out = crate::hash(input);
-            assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+            assert_eq!(test_out, expected_out[..32]);
            // incremental
            let mut hasher = crate::Hasher::new();
            hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
            assert_eq!(hasher.finalize(), test_out);
            // incremental (rayon)
            #[cfg(feature = "rayon")]
            {
                let mut hasher = crate::Hasher::new();
                hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
                assert_eq!(hasher.finalize(), test_out);
            }
            // xof
@ -314,25 +111,25 @@ fn test_compare_reference_impl() {

        // keyed
        {
-            let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
+            let mut reference_hasher = reference_impl::Hasher::new_keyed(TEST_KEY);
            reference_hasher.update(input);
            let mut expected_out = [0; OUT];
            reference_hasher.finalize(&mut expected_out);

            // all at once
-            let test_out = crate::keyed_hash(&TEST_KEY, input);
-            assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+            let test_out = crate::keyed_hash(TEST_KEY, input);
+            assert_eq!(test_out, expected_out[..32]);
            // incremental
-            let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+            let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
            hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
            assert_eq!(hasher.finalize(), test_out);
            // incremental (rayon)
            #[cfg(feature = "rayon")]
            {
-                let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+                let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
                hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
                assert_eq!(hasher.finalize(), test_out);
            }
            // xof
@ -355,15 +152,15 @@ fn test_compare_reference_impl() {
            // incremental
            let mut hasher = crate::Hasher::new_derive_key(context);
            hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
-            assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
+            assert_eq!(hasher.finalize(), test_out[..32]);
            // incremental (rayon)
            #[cfg(feature = "rayon")]
            {
                let mut hasher = crate::Hasher::new_derive_key(context);
                hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
-                assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
+                assert_eq!(hasher.finalize(), test_out[..32]);
            }
            // xof
            let mut extended = [0; OUT];
@ -423,9 +220,9 @@ fn test_fuzz_hasher() {
    let mut input_buf = [0; 3 * INPUT_MAX];
    paint_test_input(&mut input_buf);

-    // Don't do too many iterations in debug mode, to keep the tests under a
-    // second or so. CI should run tests in release mode also. Provide an
-    // environment variable for specifying a larger number of fuzz iterations.
+    // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+    // should run tests in release mode also.
+    // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
    let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 };

    // Use a fixed RNG seed for reproducibility.
@ -493,6 +290,133 @@ fn test_xof_seek() {
    }
 }

+#[test]
+fn test_xof_xor() {
+    for step in [32, 63, 64, 128, 303] {
+        #[cfg(feature = "std")]
+        dbg!(step);
+        let mut ref_hasher = reference_impl::Hasher::new();
+        ref_hasher.update(b"foo");
+        let mut ref_output = [0u8; 1000];
+        ref_hasher.finalize(&mut ref_output);
+
+        let mut hasher = crate::Hasher::new();
+        hasher.update(b"foo");
+        let mut reader = hasher.finalize_xof();
+
+        let mut test_output = [0u8; 1000];
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill(chunk);
+        }
+        assert_eq!(ref_output, test_output);
+        // Xor'ing the same output should zero the buffer.
+        reader.set_position(0);
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!([0u8; 1000], test_output);
+        // Xor'ing the same output again should reproduce the original.
+        reader.set_position(0);
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!(ref_output, test_output);
+
+        // Repeat the same test but starting at offset 500.
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill(chunk);
+        }
+        assert_eq!(ref_output[500..], test_output[..500]);
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!([0u8; 500], test_output[..500]);
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!(ref_output[500..], test_output[..500]);
+    }
+}
+
+#[test]
+#[cfg(feature = "std")]
+fn test_fuzz_xof() {
+    // Use a fixed RNG seed for reproducibility.
+    let mut rng = rand_chacha::ChaCha8Rng::from_seed([99; 32]);
+    let random_key: [u8; 32] = rng.gen();
+
+    let possible_seeks = [-64i64, -63 - 1, 0, 1, 63, 64, 127, 128, 129];
+
+    const MAX_LEN: usize = 1100;
+    let possible_lengths = [0usize, 1, 63, 64, 65, 128, 256, 512, 1024, MAX_LEN];
+    assert!(possible_lengths.into_iter().all(|x| x <= MAX_LEN));
+
+    let mut xof_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+    let mut xof_xor_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+
+    // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+    // should run tests in release mode also.
+    // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
+    let num_tests = if cfg!(debug_assertions) {
+        1_000
+    } else {
+        100_000
+    };
+
+    let mut position = 0;
+    let mut ref_output = Vec::new();
+    for test_i in 0..num_tests {
+        eprintln!("--- test {test_i} ---");
+        // Do a random relative seek maybe. Could be zero.
+        let relative_seek: i64 = *possible_seeks.choose(&mut rng).unwrap();
+        dbg!(relative_seek);
+        if relative_seek != 0 {
+            let new_position = position as i64 + relative_seek;
+            if 0 <= new_position && new_position <= MAX_LEN as i64 {
+                position = new_position as u64;
+            } else {
+                position = 0;
+            }
+            assert_eq!(xof_output.position(), xof_xor_output.position());
+            xof_output.set_position(position as u64);
+            xof_xor_output.set_position(position as u64);
+        }
+        dbg!(position);
+
+        // Generate a random number of output bytes. If the amount of output we've gotten from the
+        // reference_impl isn't enough, double it.
+        let len: usize = *possible_lengths.choose(&mut rng).unwrap();
+        dbg!(len);
+        if position as usize + len > ref_output.len() {
+            let new_len = core::cmp::max(MAX_LEN, 2 * ref_output.len());
+            ref_output = vec![0u8; new_len];
+            eprintln!("grow reference output length to {}", ref_output.len());
+            let ref_hasher = reference_impl::Hasher::new_keyed(&random_key);
+            ref_hasher.finalize(&mut ref_output);
+        }
+        let mut buf = [0u8; MAX_LEN];
+        xof_output.fill(&mut buf[..len]);
+        assert_eq!(ref_output[position as usize..][..len], buf[..len]);
+        assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+        // Xor over the output with a random byte value, and then confirm that xof_xor() recovers
+        // that value.
+        let random_byte: u8 = rng.gen();
+        dbg!(random_byte);
+        for i in 0..len {
+            buf[i] ^= random_byte;
+        }
+        xof_xor_output.fill_xor(&mut buf[..len]);
+        assert_eq!([random_byte; MAX_LEN][..len], buf[..len]);
+        assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+        position += len as u64;
+    }
+}
+
 #[test]
 fn test_msg_schedule_permutation() {
    let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
@ -506,7 +430,7 @@ fn test_msg_schedule_permutation() {
        }
    }

-    assert_eq!(generated, crate::MSG_SCHEDULE);
+    assert_eq!(generated, guts::MSG_SCHEDULE);
 }

 #[test]
@ -640,53 +564,43 @@ fn test_zeroize() {

    let mut hasher = crate::Hasher {
        chunk_state: crate::ChunkState {
-            cv: [42; 8],
+            cv: [42; 32],
            chunk_counter: 42,
            buf: [42; 64],
            buf_len: 42,
            blocks_compressed: 42,
            flags: 42,
-            platform: crate::Platform::Portable,
        },
-        key: [42; 8],
+        key: [42; 32],
        cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(),
    };
    hasher.zeroize();
-    assert_eq!(hasher.chunk_state.cv, [0; 8]);
+    assert_eq!(hasher.chunk_state.cv, [0; 32]);
    assert_eq!(hasher.chunk_state.chunk_counter, 0);
    assert_eq!(hasher.chunk_state.buf, [0; 64]);
    assert_eq!(hasher.chunk_state.buf_len, 0);
    assert_eq!(hasher.chunk_state.blocks_compressed, 0);
    assert_eq!(hasher.chunk_state.flags, 0);
-    assert!(matches!(
-        hasher.chunk_state.platform,
-        crate::Platform::Portable
-    ));
-    assert_eq!(hasher.key, [0; 8]);
+    assert_eq!(hasher.key, [0; 32]);
    assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);

    let mut output_reader = crate::OutputReader {
        inner: crate::Output {
-            input_chaining_value: [42; 8],
+            input_chaining_value: [42; 32],
            block: [42; 64],
            counter: 42,
            block_len: 42,
            flags: 42,
-            platform: crate::Platform::Portable,
        },
        position_within_block: 42,
    };

    output_reader.zeroize();
-    assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
+    assert_eq!(output_reader.inner.input_chaining_value, [0; 32]);
    assert_eq!(output_reader.inner.block, [0; 64]);
    assert_eq!(output_reader.inner.counter, 0);
    assert_eq!(output_reader.inner.block_len, 0);
    assert_eq!(output_reader.inner.flags, 0);
-    assert!(matches!(
-        output_reader.inner.platform,
-        crate::Platform::Portable
-    ));
    assert_eq!(output_reader.position_within_block, 0);
 }

@ -732,7 +646,7 @@ fn test_update_reader_interrupted() -> std::io::Result<()> {
                self.already_interrupted = true;
                return Err(io::Error::from(io::ErrorKind::Interrupted));
            }
-            let take = std::cmp::min(self.slice.len(), buf.len());
+            let take = core::cmp::min(self.slice.len(), buf.len());
            buf[..take].copy_from_slice(&self.slice[..take]);
            self.slice = &self.slice[take..];
            Ok(take)
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@ -12,6 +12,7 @@ pure = ["blake3/pure"]
 # If you ever change these path dependencies, you'll probably need to update
 # cross_test.sh, or CI will break. I'm sorry >.<
 blake3 = { path = "../" }
+blake3_guts = { path = "../rust/guts" }
 hex = "0.4.0"
 reference_impl = { path = "../reference_impl" }
 serde = { version = "1.0", features = ["derive"] }
--- a/test_vectors/cross_test.sh
+++ b/test_vectors/cross_test.sh
@ -20,6 +20,7 @@ mv blake3/reference_impl test_vectors
 mv blake3 test_vectors
 cd test_vectors
 sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
-sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
+sed -i 's|blake3_guts = { path = "../rust/guts" }|blake3_guts = { path = "./blake3/rust/guts" }|' Cargo.toml
+sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "./reference_impl" }|' Cargo.toml

 cross test "$@"
--- a/test_vectors/src/lib.rs
+++ b/test_vectors/src/lib.rs
@ -1,4 +1,4 @@
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
+use blake3_guts::{BLOCK_LEN, CHUNK_LEN};
 use serde::{Deserialize, Serialize};

 // A non-multiple of 4 is important, since one possible bug is to fail to emit