mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-21 02:46:06 +02:00

Compare commits


48 Commits

Author SHA1 Message Date
Javier Blazquez 0816badf3a fix Windows ARM64 build and detect ARM64EC as ARM64 2024-04-07 11:48:02 -04:00
Jack O'Connor 4ec3be8bfa format the state matrix better in reference_impl.rs 2024-03-20 15:44:05 -07:00
Benjamin A. Beasley d99ad871a6 Fix missing LICENSE file in b3sum crate
Add a symbolic link to the top-level license file; this is dereferenced
by cargo publish, and the LICENSE appears as a regular file in the
published crate.
2024-03-12 14:47:39 -07:00
Jack O'Connor 54930c9522 version 1.5.1
Changes since 1.5.0:
- The Rust crate is now compatible with Miri.
- ~1% performance improvement on Arm NEON contributed by @divinity76 (#384).
- Various fixes and improvements in the CMake build.
- The MSRV of b3sum is now 1.74.1. (The MSRV of the library crate is
  unchanged, 1.66.1.)
2024-03-12 00:34:53 -07:00
divinity76 58bea0bcbb optimize neon loadu_128/storeu_128 (#384)
vld1q_u8 and vst1q_u8 have no alignment requirements.

This improves performance on Oracle Cloud's VM.Standard.A1.Flex by 1.15% on a 16*1024-byte input, from approximately 13920 nanoseconds down to 13800 nanoseconds.
2024-03-12 03:21:51 -04:00
Jack O'Connor 5b9af1c347 test_miri_smoketest 2024-03-10 09:54:03 -07:00
Jack O'Connor d57818afdc avoid using NamedTempFile under Miri 2024-03-09 16:57:14 -08:00
Jack O'Connor 2435e29dbe avoid using constant_time_eq under Miri 2024-03-09 16:57:14 -08:00
Ryo Onodera e6e7f27336 Support running inside miri 2024-03-09 16:56:59 -08:00
Jack O'Connor 8fc36186b8 comment cleanup 2024-02-04 13:32:30 -08:00
divinity76 2918c51bc6 silence gcc Werror=logical-op
```
/home/travis/build/php/php-src/ext/hash/blake3/upstream_blake3/c/blake3.c: In function ‘compress_subtree_to_parent_node’:
/home/travis/build/php/php-src/ext/hash/blake3/upstream_blake3/c/blake3.c:354:22: error: logical ‘and’ of mutually exclusive tests is always false [-Werror=logical-op]
  354 |   while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
      |                      ^~
cc1: all warnings being treated as errors
make: *** [Makefile:1910: ext/hash/blake3/upstream_blake3/c/blake3.lo] Error 1
```

Fixes https://github.com/BLAKE3-team/BLAKE3/issues/379.
Closes https://github.com/BLAKE3-team/BLAKE3/pull/380.
2024-02-04 13:31:55 -08:00
JP a65fcf63ee Tekken 8 https://twitter.com/rodarmor/status/1751567502050771189 2024-01-28 23:31:19 -08:00
Jack O'Connor 5558fa4623 add a guts docs example 2024-01-21 20:10:46 -08:00
Jack O'Connor 1a6c1e2037 guts readme updates 2024-01-21 19:43:07 -08:00
Jack O'Connor 1ca383ba9b add guts testing to CI 2024-01-21 18:38:06 -08:00
Jack O'Connor 6e519ea6b7 configure no_std for guts, but not for testing 2024-01-21 18:38:06 -08:00
Jack O'Connor fc75227170 factor out just the portable parts of the guts_api branch 2024-01-21 18:38:06 -08:00
Jack O'Connor 6f3e6fc86c update memmap2 to v0.9 2024-01-21 18:37:01 -08:00
Dirk Stolle 4d32708f51 replace unmaintained actions-rs/toolchain action in CI
Basically all of the `actions-rs/*` actions are unmaintained. See
<https://github.com/actions-rs/toolchain/issues/216> for more
information. Due to their age they generate several warnings in
CI runs.

To get rid of those warnings the occurrences of
`actions-rs/toolchain` are replaced by `dtolnay/rust-toolchain`.
2023-12-30 02:28:33 -05:00
Dirk Stolle 5306464d03 update actions/checkout in GitHub Actions to v4 2023-12-28 10:46:51 +01:00
Jack O'Connor c0ea395cf9 add Bazel to the list of users in the readme 2023-12-08 10:23:30 -08:00
Henrik S. Gaßmann 7ce2aa41e9 build(CMake): Require C99 mode
Specify the language requirement as a [compile-feature] and force compiler
extensions off, ensuring portability problems are detected early on.
Note that we do not use the `C_STANDARD` property, because it doesn't
propagate to dependent targets and would prohibit users from compiling
their code base with consistent flags / language configurations if they
were to target a newer C standard. Similarly, we do not configure
`C_STANDARD_REQUIRED`, as [compile-features] do not interact with
it; they are enforced regardless.

[compile-feature]: https://cmake.org/cmake/help/latest/manual/cmake-compile-features.7.html#compile-feature-requirements
2023-12-02 11:11:10 -08:00
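The approach described above can be sketched in two lines of CMake. This is a hedged illustration, not the project's actual CMakeLists.txt; the target name `blake3` is assumed:

```cmake
# Require C99 as a compile-feature so the requirement propagates to
# dependent targets (the C_STANDARD property would not), and disable
# compiler extensions so portability problems surface early.
target_compile_features(blake3 PUBLIC c_std_99)
set_target_properties(blake3 PROPERTIES C_EXTENSIONS OFF)
```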
Jack O'Connor 92e4cd71be add the compiler name to CMake CI jobs 2023-11-05 09:18:39 -08:00
Viacheslav H 1930721c50 Fix CMake target include directories if library is used with add_subdirectory or FetchContent 2023-11-05 12:16:48 -05:00
Rui Ueyama e1f851d461 Fix Windows build with clang-cl
clang-cl is LLVM's MSVC-compatible compiler frontend for Windows ABI.
If clang-cl is in use, `CMAKE_C_COMPILER_ID` is `Clang` even though
it takes MSVC-like rather than Unix-like command-line options.

`if(MSVC)` is the correct predicate to check if we should pass MSVC-ish
command line options.
2023-11-05 09:08:13 -08:00
Henrik Gaßmann 3465fe455e style: Exclude whitespace fixups from git blame 2023-10-31 11:53:11 +01:00
Henrik Gaßmann 3e14f865d3 style: Remove trailing whitespace in CMakeLists.txt 2023-10-31 11:51:26 +01:00
Henrik Gaßmann bfd568897a build(CMake): Provide NEON cflags for ARMv8 32bit
ARMv8 CPUs are guaranteed to support NEON instructions. However, for
32-bit ARMv8 triplets, GCC needs to be explicitly configured to enable
NEON intrinsics.
2023-10-31 11:45:26 +01:00
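A minimal sketch of what such a guard might look like in CMake. The variable checks, the source-file name, and the exact `-mfpu` value are assumptions for illustration, not the commit's actual change:

```cmake
# Sketch: 32-bit ARMv8 GCC triplets need an explicit -mfpu flag before
# NEON intrinsics compile; AArch64 enables them by default.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv8" AND CMAKE_SIZEOF_VOID_P EQUAL 4)
  set_source_files_properties(blake3_neon.c PROPERTIES
    COMPILE_FLAGS "-mfpu=neon-fp-armv8")
endif()
```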
Henrik Gaßmann dd30dcb002 build(CMake): Apply PP definitions to all sources 2023-10-02 11:12:50 -07:00
Jack O'Connor 3d921ae703 allow(unreachable_code) in all the *_detected functions
Previously we only disabled these warnings for SSE2, which is assumed
enabled on x86-64, but it looks like new nightly compilers are also
assuming SSE4.1 on macOS. Disabling these warnings across all the
detection functions accounts for that, and it also gets rid of some
warnings that you'd see if you used RUSTFLAGS='-C target-cpu=native'.
2023-10-02 08:36:29 -07:00
Jack O'Connor 5aa53f07f7 version 1.5.0
Changes since 1.4.1:
- The Rust crate's Hasher type has gained new helper methods for common
  forms of IO: update_reader, update_mmap, and update_mmap_rayon. The
  latter matches the default behavior of b3sum. The mmap methods are
  gated by the new "mmap" Cargo feature.
- Most of the Rust crate's public types now implement the Zeroize trait.
  This is gated by the new "zeroize" Cargo feature.
- The Rust crate's Hash type now implements the serde Serialize and
  Deserialize traits. This is gated by the new "serde" Cargo feature.
- The C library now uses atomics to cache detected CPU features under
  most compilers other than MSVC. Previously this was a non-atomic
  write, which was probably "benign" but made TSan unhappy.
- NEON support is now disabled by default on big-endian AArch64.
  Previously this was a build error if the caller didn't explicitly
  disable it.
2023-09-20 20:12:18 -07:00
Jack O'Connor d7e9365be1 add a test for the new serde feature 2023-09-19 23:43:47 -07:00
Ralph Minderhoud 5e3eb949a7 Add serde support for Hash behind optional feature
Added a new cargo feature `serde` that when enabled will derive
`serde::Serialize` and `serde::Deserialize` for the `blake3::Hash`
struct.
2023-09-19 21:54:44 -04:00
Jack O'Connor 4e25f2e094 don't default to NEON intrinsics in build.rs for big-endian targets 2023-09-19 17:18:31 -07:00
Havard Eidnes 8bfe93fbf9 c/blake3_impl.h: don't try to do NEON on big-endian aarch64.
...because this would otherwise hit
  #error "This implementation only supports little-endian ARM."
in c/blake3_neon.c.
2023-09-19 16:57:11 -07:00
Jack O'Connor 8cdfaa41ea minor cleanup in Hasher docs 2023-09-19 12:47:27 -07:00
Jack O'Connor b754033a21 make update_reader/mmap/mmap_rayon return self
This makes them consistent with how the existing update() and
update_rayon() methods work, with the difference that they return
io::Result<&mut Self> instead of just &mut Self.
2023-09-16 19:22:36 -07:00
Jack O'Connor cb32f0bd14 replace the new file module with inherent methods on Hasher
New methods:
- update_reader
- update_mmap
- update_mmap_rayon

These are more discoverable, more convenient, and safer.

There are two problems I want to avoid by taking a `Path` instead of a
`File`. First, exposing `Mmap` objects to the caller is fundamentally
unsafe, and making `maybe_mmap_file` private avoids that issue. Second,
taking a `File` raises questions about whether memory mapped reads
should behave like regular file reads. (Should they respect the current
seek position? Should they update the seek position?) Taking a `Path`
from the caller and opening the `File` internally avoids these
questions.
2023-09-16 17:04:27 -07:00
Banyc e0bb915641 move file operations from b3sum to blake3 2023-09-16 14:20:39 -07:00
Jack O'Connor 12b368541f document the `zeroize` Cargo feature
As part of this change, I don't think we need the `zeroize_crate`
workaround anymore if we use the relatively new `dep:` syntax in
Cargo.toml.
2023-09-16 14:11:27 -07:00
Jack O'Connor f22d66b307 stop using MIPS for big-endian testing
https://twitter.com/burntsushi5/status/1695483429997945092
https://github.com/rust-lang/compiler-team/issues/648
2023-09-10 14:18:55 -07:00
Jack O'Connor cd4b3140cf update the MSRV for b3sum to 1.70.0
As usual, the driver here is the MSRV of clap. I should've checked this
when I updated the Cargo.lock file.
2023-09-10 14:14:54 -07:00
Jack O'Connor 02dec6e9a6 fix a build break in the blake3_c tests 2023-09-10 14:04:57 -07:00
Jack O'Connor d6265dafc9 update dev-dependencies 2023-09-10 13:40:12 -07:00
Javier Blazquez 12823b8760 blake3_dispatch: Fix race condition initializing g_cpu_features.
If multiple threads try to compute a hash simultaneously before the library has been used for the first time,
the logic in get_cpu_features that detects CPU features will write to g_cpu_features without synchronization,
which is a race condition flagged by ThreadSanitizer.

This change marks g_cpu_features as an atomic variable to address the race condition.
2023-07-21 19:18:40 -07:00
Elichai Turkel e302cdf36f Remove unneeded digest/std in std feature 2023-07-16 13:29:47 -04:00
Elichai Turkel f18e19092b Add tests for Zeroize 2023-07-16 13:29:47 -04:00
Elichai Turkel 8e92fc6929 Implement Zeroize on exported types 2023-07-16 13:29:47 -04:00
29 changed files with 2870 additions and 603 deletions

.git-blame-ignore-revs Normal file

@ -0,0 +1,2 @@
# CMakeLists.txt whitespace fixups
3e14f865d30271c74fc68d417af488ea91b66d48


@ -38,12 +38,10 @@ jobs:
]
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
# Print the compiler version, for debugging.
- name: print compiler version
run: cargo run --quiet
@ -52,13 +50,17 @@ jobs:
- name: print instruction set support
run: cargo run --quiet
working-directory: ./tools/instruction_set_support
# Default tests plus Rayon and RustCrypto trait implementations.
- run: cargo test --features=rayon,traits-preview
# Default tests plus Rayon and trait implementations.
- run: cargo test --features=rayon,traits-preview,serde,zeroize
# Same but with only one thread in the Rayon pool. This can find deadlocks.
- name: "again with RAYON_NUM_THREADS=1"
run: cargo test --features=rayon,traits-preview
run: cargo test --features=rayon,traits-preview,serde,zeroize
env:
RAYON_NUM_THREADS: 1
# The mmap feature by itself (update_mmap_rayon is omitted).
- run: cargo test --features=mmap
# All public features put together.
- run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize
# no_std tests.
- run: cargo test --no-default-features
@ -129,6 +131,17 @@ jobs:
run: cargo test
working-directory: ./reference_impl
# the new guts crate
- name: guts test
run: cargo test --all-features
working-directory: ./rust/guts
- name: guts no_std build
run: cargo build --no-default-features
working-directory: ./rust/guts
- name: guts no_std test # note that rust/guts/src/test.rs still uses libstd
run: cargo test --no-default-features
working-directory: ./rust/guts
b3sum_tests:
name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }}
runs-on: ${{ matrix.target.os }}
@ -148,16 +161,14 @@ jobs:
# The b3sum MSRV is sometimes higher than the blake3 crate's, because
# b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap
# update shouldn't randomly break us here.
"1.66.1",
"1.74.1",
]
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
# Test b3sum.
- name: test b3sum
run: cargo test
@ -177,14 +188,13 @@ jobs:
- i686-unknown-linux-musl
- armv7-unknown-linux-gnueabihf
- aarch64-unknown-linux-gnu
- mips-unknown-linux-gnu
# Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092.
- powerpc64-unknown-linux-gnu
- s390x-unknown-linux-gnu
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- run: cargo install cross
# Test the portable implementation on everything.
- run: cross test --target ${{ matrix.arch }}
@ -210,7 +220,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
# Test the intrinsics-based implementations.
- run: make -f Makefile.testing test
working-directory: ./c
@ -262,12 +272,10 @@ jobs:
strategy:
fail-fast: false
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
target: aarch64-apple-darwin
override: true
targets: aarch64-apple-darwin
- name: build blake3
run: cargo build --target aarch64-apple-darwin
- name: build b3sum
@ -278,7 +286,7 @@ jobs:
name: build with the Tiny C Compiler
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: install TCC
run: sudo apt-get install -y tcc
- name: compile
@ -295,7 +303,7 @@ jobs:
name: "compile and test with GCC 5.4"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: addnab/docker-run-action@v3
with:
image: gcc:5.4
@ -308,7 +316,7 @@ jobs:
# CMake build test (Library only), current macOS/Linux only.
cmake_build:
name: CMake ${{ matrix.os }}
name: CMake ${{ matrix.os }} ${{ matrix.compiler }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@ -323,8 +331,21 @@ jobs:
- os: macOS-latest
compiler: msvc
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: CMake generation
run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target
- name: CMake build / install
run: cmake --build c/build --target install
miri_smoketest:
name: Miri smoketest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
with:
components: miri
# Currently the test search "miri" only matches "test_miri_smoketest", but
# we might add more. If this accidentally picks up anything incompatible or
# slow, we can narrow it.
- run: cargo miri test miri


@ -23,18 +23,16 @@ jobs:
]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: "3.x"
- run: pip install PyGithub
- run: sudo apt-get install musl-tools
if: matrix.target.os == 'ubuntu-latest'
- uses: actions-rs/toolchain@v1
- uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
profile: minimal
- run: rustup target add ${{ matrix.target.rust-target }}
targets: ${{ matrix.target.rust-target }}
- name: build b3sum
id: build_b3sum
run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }}


@ -1,6 +1,6 @@
[package]
name = "blake3"
version = "1.4.1"
version = "1.5.1"
authors = ["Jack O'Connor <oconnor663@gmail.com>", "Samuel Neves"]
description = "the BLAKE3 hash function"
repository = "https://github.com/BLAKE3-team/BLAKE3"
@ -23,11 +23,21 @@ neon = []
# --no-default-features, the only way to use the SIMD implementations in this
# crate is to enable the corresponding instruction sets statically for the
# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
std = ["digest/std"]
std = []
# The "rayon" feature (defined below as an optional dependency) enables the
# `Hasher::update_rayon` method, for multithreaded hashing. However, even if
# this feature is enabled, all other APIs remain single-threaded.
# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the
# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon`
# methods, for multithreaded hashing. However, even if this feature is enabled,
# all other APIs remain single-threaded.
rayon = ["dep:rayon", "std"]
# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the
# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon`
# helper methods for memory-mapped IO.
mmap = ["std", "dep:memmap2"]
# Implement the zeroize::Zeroize trait for types in this crate.
zeroize = ["dep:zeroize", "arrayvec/zeroize"]
# This crate implements traits from the RustCrypto project, exposed here as the
# "traits-preview" feature. However, these traits aren't stable, and they're
@ -78,24 +88,29 @@ no_avx512 = []
no_neon = []
[package.metadata.docs.rs]
# Document Hasher::update_rayon on docs.rs.
features = ["rayon"]
# Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs.
features = ["mmap", "rayon", "serde", "zeroize"]
[dependencies]
arrayref = "0.3.5"
arrayvec = { version = "0.7.0", default-features = false }
arrayvec = { version = "0.7.4", default-features = false }
constant_time_eq = "0.3.0"
rayon = { version = "1.2.1", optional = true }
cfg-if = "1.0.0"
digest = { version = "0.10.1", features = [ "mac" ], optional = true }
memmap2 = { version = "0.9", optional = true }
rayon = { version = "1.2.1", optional = true }
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }
[dev-dependencies]
hmac = "0.12.0"
hex = "0.4.2"
page_size = "0.5.0"
page_size = "0.6.0"
rand = "0.8.0"
rand_chacha = "0.3.0"
reference_impl = { path = "./reference_impl" }
hmac = "0.12.0"
tempfile = "3.8.0"
serde_json = "1.0.107"
[build-dependencies]
cc = "1.0.4"


@ -201,6 +201,7 @@ Alternatively, it is licensed under the Apache License 2.0.
Here's a (non-exhaustive) list of protocols and software that use BLAKE3:
* [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala)
* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0)
* [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16)
* [IPFS](https://github.com/ipfs/go-verifcid/issues/13)
* [Farcaster](https://www.farcaster.xyz/)
@ -211,6 +212,7 @@ Here's a (non-exhaustive) list of protocols and software that use BLAKE3:
* [Saito](https://saito.tech/)
* [Skale](https://github.com/skalenetwork/skale-consensus/pull/284)
* [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html)
* [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8)
* [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21)

b3sum/Cargo.lock generated

@ -4,58 +4,57 @@ version = 3
[[package]]
name = "anstream"
version = "0.3.2"
version = "0.6.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is-terminal",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.1"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd"
checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc"
[[package]]
name = "anstyle-parse"
version = "0.2.1"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.0"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
dependencies = [
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "1.0.1"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
dependencies = [
"anstyle",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "anyhow"
version = "1.0.71"
version = "1.0.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247"
[[package]]
name = "arrayref"
@ -69,22 +68,15 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "b3sum"
version = "1.4.1"
version = "1.5.1"
dependencies = [
"anyhow",
"blake3",
"clap",
"duct",
"hex",
"memmap2",
"rayon",
"tempfile",
"wild",
@ -92,43 +84,28 @@ dependencies = [
[[package]]
name = "bitflags"
version = "1.3.2"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
[[package]]
name = "blake3"
version = "1.4.1"
version = "1.5.1"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"constant_time_eq",
"digest",
"memmap2",
"rayon",
]
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "cc"
version = "1.0.79"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5"
[[package]]
name = "cfg-if"
@ -138,20 +115,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.3.11"
version = "4.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d"
checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651"
dependencies = [
"clap_builder",
"clap_derive",
"once_cell",
]
[[package]]
name = "clap_builder"
version = "4.3.11"
version = "4.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b"
checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4"
dependencies = [
"anstream",
"anstyle",
@ -162,9 +138,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.3.2"
version = "4.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f"
checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47"
dependencies = [
"heck",
"proc-macro2",
@ -174,9 +150,9 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.5.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce"
[[package]]
name = "colorchoice"
@ -190,75 +166,36 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2"
[[package]]
name = "crossbeam-channel"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.3"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.15"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.16"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
dependencies = [
"cfg-if",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "duct"
version = "0.13.6"
version = "0.13.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e"
checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c"
dependencies = [
"libc",
"once_cell",
@ -268,49 +205,25 @@ dependencies = [
[[package]]
name = "either"
version = "1.8.1"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
[[package]]
name = "errno"
version = "0.3.1"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "fastrand"
version = "1.9.0"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
dependencies = [
"instant",
]
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "glob"
@ -324,134 +237,72 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "instant"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]]
name = "io-lifetimes"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
"hermit-abi",
"rustix 0.38.3",
"windows-sys",
]
[[package]]
name = "libc"
version = "0.2.147"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "linux-raw-sys"
version = "0.3.8"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "memmap2"
version = "0.7.1"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6"
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell"
version = "1.18.0"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "os_pipe"
version = "1.1.4"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177"
checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "proc-macro2"
version = "1.0.63"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb"
checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.29"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.7.0"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd"
dependencies = [
"either",
"rayon-core",
@@ -459,58 +310,27 @@ dependencies = [
[[package]]
name = "rayon-core"
version = "1.11.0"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"num_cpus",
]
[[package]]
name = "redox_syscall"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "rustix"
version = "0.37.23"
version = "0.38.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06"
checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys 0.3.8",
"windows-sys",
]
[[package]]
name = "rustix"
version = "0.38.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4"
dependencies = [
"bitflags 2.3.3",
"bitflags",
"errno",
"libc",
"linux-raw-sys 0.4.3",
"windows-sys",
"linux-raw-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "shared_child"
version = "1.0.0"
@@ -523,21 +343,15 @@ dependencies = [
[[package]]
name = "strsim"
version = "0.10.0"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "subtle"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01"
[[package]]
name = "syn"
version = "2.0.23"
version = "2.0.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737"
checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
dependencies = [
"proc-macro2",
"quote",
@@ -546,39 +360,31 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.6.0"
version = "3.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6"
checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
dependencies = [
"autocfg",
"cfg-if",
"fastrand",
"redox_syscall",
"rustix 0.37.23",
"windows-sys",
"rustix",
"windows-sys 0.52.0",
]
[[package]]
name = "terminal_size"
version = "0.2.6"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
"rustix 0.37.23",
"windows-sys",
"rustix",
"windows-sys 0.48.0",
]
[[package]]
name = "typenum"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "unicode-ident"
version = "1.0.10"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "utf8parse"
@@ -586,17 +392,11 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wild"
version = "2.1.0"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74"
checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1"
dependencies = [
"glob",
]
@@ -629,62 +429,128 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.48.1"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"


@@ -1,6 +1,6 @@
[package]
name = "b3sum"
version = "1.4.1"
version = "1.5.1"
authors = ["Jack O'Connor <oconnor663@gmail.com>"]
description = "a command line implementation of the BLAKE3 hash function"
repository = "https://github.com/BLAKE3-team/BLAKE3"
@@ -15,10 +15,9 @@ pure = ["blake3/pure"]
[dependencies]
anyhow = "1.0.25"
blake3 = { version = "1", path = "..", features = ["rayon"] }
blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
rayon = "1.2.1"
wild = "2.0.3"

b3sum/LICENSE Symbolic link

@@ -0,0 +1 @@
../LICENSE


@@ -163,125 +163,22 @@ impl Args {
}
}
enum Input {
Mmap(io::Cursor<memmap2::Mmap>),
File(File),
Stdin,
}
impl Input {
// Open an input file, using mmap if appropriate. "-" means stdin. Note
// that this convention applies both to command line arguments, and to
// filepaths that appear in a checkfile.
fn open(path: &Path, args: &Args) -> Result<Self> {
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
return Ok(Self::Stdin);
fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
let file = File::open(path)?;
if !args.no_mmap() {
if let Some(mmap) = maybe_memmap_file(&file)? {
return Ok(Self::Mmap(io::Cursor::new(mmap)));
}
}
Ok(Self::File(file))
}
fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
match self {
// The fast path: If we mmapped the file successfully, hash using
// multiple threads. This doesn't work on stdin, or on some files,
// and it can also be disabled with --no-mmap.
Self::Mmap(cursor) => {
hasher.update_rayon(cursor.get_ref());
}
// The slower paths, for stdin or files we didn't/couldn't mmap.
// This is currently all single-threaded. Doing multi-threaded
// hashing without memory mapping is tricky, since all your worker
// threads have to stop every time you refill the buffer, and that
// ends up being a lot of overhead. To solve that, we need a more
// complicated double-buffering strategy where a background thread
// fills one buffer while the worker threads are hashing the other
// one. We might implement that in the future, but since this is
// the slow path anyway, it's not high priority.
Self::File(file) => {
copy_wide(file, &mut hasher)?;
}
Self::Stdin => {
let stdin = io::stdin();
let lock = stdin.lock();
copy_wide(lock, &mut hasher)?;
}
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}
}
impl Read for Input {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
Self::Mmap(cursor) => cursor.read(buf),
Self::File(file) => file.read(buf),
Self::Stdin => io::stdin().read(buf),
}
}
}
// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
// can support at least 64 KiB, and there's some performance benefit to using
// bigger reads, so that's what we use here.
fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
let mut buffer = [0; 65536];
let mut total = 0;
loop {
match reader.read(&mut buffer) {
Ok(0) => return Ok(total),
Ok(n) => {
hasher.update(&buffer[..n]);
total += n as u64;
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e),
}
}
}
// Mmap a file, if it looks like a good idea. Return None in cases where we
// know mmap will fail, or if the file is short enough that mmapping isn't
// worth it. However, if we do try to mmap and it fails, return the error.
fn maybe_memmap_file(file: &File) -> Result<Option<memmap2::Mmap>> {
let metadata = file.metadata()?;
let file_size = metadata.len();
Ok(if !metadata.is_file() {
// Not a real file.
None
} else if file_size > isize::max_value() as u64 {
// Too long to safely map.
// https://github.com/danburkert/memmap-rs/issues/69
None
} else if file_size == 0 {
// Mapping an empty file currently fails.
// https://github.com/danburkert/memmap-rs/issues/72
None
} else if file_size < 16 * 1024 {
// Mapping small files is not worth it.
None
hasher.update_reader(io::stdin().lock())?;
} else if args.no_mmap() {
hasher.update_reader(File::open(path)?)?;
} else {
// Explicitly set the length of the memory map, so that filesystem
// changes can't race to violate the invariants we just checked.
let map = unsafe {
memmap2::MmapOptions::new()
.len(file_size as usize)
.map(file)?
};
Some(map)
})
// The fast path: Try to mmap the file and hash it with multiple threads.
hasher.update_mmap_rayon(path)?;
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}
fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
@@ -477,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
}
fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
let mut input = Input::open(path, args)?;
let output = input.hash(args)?;
let output = hash_path(args, path)?;
if args.raw() {
write_raw_output(output, args)?;
return Ok(());
@@ -522,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
} else {
file_string
};
let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
.and_then(|mut input| input.hash(args))
.map(|mut hash_output| {
let found_hash: blake3::Hash;
match hash_path(args, &file_path) {
Ok(mut output) => {
let mut found_hash_bytes = [0; blake3::OUT_LEN];
hash_output.fill(&mut found_hash_bytes);
found_hash_bytes.into()
});
let found_hash: blake3::Hash = match hash_result {
Ok(hash) => hash,
output.fill(&mut found_hash_bytes);
found_hash = found_hash_bytes.into();
}
Err(e) => {
println!("{}: FAILED ({})", file_string, e);
return false;
@@ -549,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
}
fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
let checkfile_input = Input::open(path, args)?;
let mut bufreader = io::BufReader::new(checkfile_input);
let mut file;
let stdin;
let mut stdin_lock;
let mut bufreader: io::BufReader<&mut dyn Read>;
if path == Path::new("-") {
stdin = io::stdin();
stdin_lock = stdin.lock();
bufreader = io::BufReader::new(&mut stdin_lock);
} else {
file = File::open(path)?;
bufreader = io::BufReader::new(&mut file);
}
let mut line = String::new();
loop {
line.clear();


@@ -60,6 +60,20 @@ fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
fn endianness() -> String {
let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
assert!(endianness == "little" || endianness == "big");
endianness
}
fn is_little_endian() -> bool {
endianness() == "little"
}
fn is_big_endian() -> bool {
endianness() == "big"
}
// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
// right compiler flags to use depend on the toolchain. (And we don't want to
// use flag_if_supported, because we don't want features to be silently
@@ -253,7 +267,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}
}
if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64()) {
if is_neon() && is_big_endian() {
panic!("The NEON implementation doesn't support big-endian ARM.")
}
if (is_arm() && is_neon())
|| (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian())
{
println!("cargo:rustc-cfg=blake3_neon");
build_neon_c_intrinsics();
}


@@ -1,7 +1,12 @@
cmake_minimum_required(VERSION 3.9)
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD
if (POLICY CMP0128)
cmake_policy(SET CMP0128 NEW)
endif()
project(libblake3
VERSION 1.4.1
VERSION 1.5.1
DESCRIPTION "BLAKE3 C implementation"
LANGUAGES C ASM
)
@@ -9,8 +14,12 @@ project(libblake3
include(FeatureSummary)
include(GNUInstallDirs)
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
# MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170)
set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1")
@@ -24,11 +33,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
# 32-bit ARMv8 needs NEON to be enabled explicitly
set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
endif()
endif()
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# library target
add_library(blake3
@@ -41,33 +52,47 @@ add_library(BLAKE3::blake3 ALIAS blake3)
# library configuration
set(BLAKE3_PKGCONFIG_CFLAGS)
if (BUILD_SHARED_LIBS)
target_compile_definitions(blake3
target_compile_definitions(blake3
PUBLIC BLAKE3_DLL
PRIVATE BLAKE3_DLL_EXPORTS
)
list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL)
endif()
target_include_directories(blake3 PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_include_directories(blake3 PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
set_target_properties(blake3 PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION 0
C_VISIBILITY_PRESET hidden
C_EXTENSIONS OFF
)
target_compile_features(blake3 PUBLIC c_std_99)
# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD
# which may be set by the user or toolchain file
if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD)
set_target_properties(blake3 PROPERTIES C_STANDARD 99)
endif()
# optional SIMD sources
macro(BLAKE3_DISABLE_SIMD)
set(BLAKE3_SIMD_AMD64_ASM OFF)
set(BLAKE3_SIMD_X86_INTRINSICS OFF)
set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
set_source_files_properties(blake3_dispatch.c PROPERTIES
COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
BLAKE3_NO_AVX512
)
endmacro()
if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM)
set(BLAKE3_SIMD_AMD64_ASM ON)
if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
enable_language(ASM_MASM)
target_sources(blake3 PRIVATE
blake3_avx2_x86-64_windows_msvc.asm
@@ -99,7 +124,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM)
BLAKE3_DISABLE_SIMD()
endif()
else()
else()
BLAKE3_DISABLE_SIMD()
endif()
@@ -121,17 +146,19 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ((ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8)))
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8))
set(BLAKE3_SIMD_NEON_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_neon.c
)
set_source_files_properties(blake3_dispatch.c PROPERTIES COMPILE_DEFINITIONS BLAKE3_USE_NEON=1)
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")


@@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node(
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
// set then it emits incorrect warnings here. We tried a few different
// hacks to silence these, but in the end our hacks just produced different
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
// desperation, we ifdef out this entire loop when we know it's not needed.
#if MAX_SIMD_DEGREE_OR_2 > 2
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}


@@ -30,7 +30,7 @@
extern "C" {
#endif
#define BLAKE3_VERSION_STRING "1.4.1"
#define BLAKE3_VERSION_STRING "1.5.1"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64


@@ -20,9 +20,9 @@ neon = []
[dev-dependencies]
arrayref = "0.3.5"
arrayvec = { version = "0.7.0", default-features = false }
page_size = "0.4.1"
rand = "0.7.2"
rand_chacha = "0.2.1"
page_size = "0.6.0"
rand = "0.8.5"
rand_chacha = "0.3.1"
reference_impl = { path = "../../reference_impl" }
[build-dependencies]


@@ -485,7 +485,7 @@ fn test_fuzz_hasher() {
let mut total_input = 0;
// For each test, write 3 inputs of random length.
for _ in 0..3 {
let input_len = rng.gen_range(0, INPUT_MAX + 1);
let input_len = rng.gen_range(0..INPUT_MAX + 1);
dbg!(input_len);
let input = &input_buf[total_input..][..input_len];
hasher.update(input);


@@ -4,6 +4,10 @@
#include "blake3_impl.h"
#if defined(_MSC_VER)
#include <Windows.h>
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
@@ -14,6 +18,32 @@
#endif
#endif
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */
#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
@@ -76,7 +106,7 @@ enum cpu_feature {
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
ATOMIC_INT g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
@@ -84,14 +114,16 @@ static
enum cpu_feature
get_cpu_features(void) {
if (g_cpu_features != UNDEFINED) {
return g_cpu_features;
/* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
if (features != UNDEFINED) {
return features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
enum cpu_feature features = 0;
features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
@@ -124,7 +156,7 @@ static
}
}
}
g_cpu_features = features;
ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */


@@ -28,7 +28,7 @@ enum blake3_flags {
#define INLINE static inline __attribute__((always_inline))
#endif
#if defined(__x86_64__) || defined(_M_X64)
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif
@@ -38,7 +38,7 @@ enum blake3_flags {
#define IS_X86_32
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif
@@ -51,7 +51,11 @@ enum blake3_flags {
#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
#if defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#if defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 0
#else
#define BLAKE3_USE_NEON 1
#endif
#else
#define BLAKE3_USE_NEON 0
#endif


@@ -10,14 +10,12 @@
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
uint32x4_t x;
memcpy(&x, src, 16);
return x;
return vreinterpretq_u32_u8(vld1q_u8(src));
}
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
memcpy(dest, &src, 16);
vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {


@@ -78,23 +78,14 @@ fn compress(
block_len: u32,
flags: u32,
) -> [u32; 16] {
let counter_low = counter as u32;
let counter_high = (counter >> 32) as u32;
#[rustfmt::skip]
let mut state = [
chaining_value[0],
chaining_value[1],
chaining_value[2],
chaining_value[3],
chaining_value[4],
chaining_value[5],
chaining_value[6],
chaining_value[7],
IV[0],
IV[1],
IV[2],
IV[3],
counter as u32,
(counter >> 32) as u32,
block_len,
flags,
chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3],
chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7],
IV[0], IV[1], IV[2], IV[3],
counter_low, counter_high, block_len, flags,
];
let mut block = *block_words;

rust/guts/Cargo.toml Normal file

@@ -0,0 +1,18 @@
[package]
name = "blake3_guts"
version = "0.0.0"
authors = ["Jack O'Connor <oconnor663@gmail.com>", "Samuel Neves"]
description = "low-level building blocks for the BLAKE3 hash function"
repository = "https://github.com/BLAKE3-team/BLAKE3"
license = "CC0-1.0 OR Apache-2.0"
documentation = "https://docs.rs/blake3_guts"
readme = "readme.md"
edition = "2021"
[dev-dependencies]
hex = "0.4.3"
reference_impl = { path = "../../reference_impl" }
[features]
default = ["std"]
std = []

rust/guts/readme.md Normal file

@@ -0,0 +1,80 @@
# The BLAKE3 Guts API
## Introduction
This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains
low-level, high-performance, platform-specific implementations of the BLAKE3
compression function. This API is complicated and unsafe, and this crate will
never have a stable release. Most callers should instead use the
[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend
on this one internally.
The code you see here (as of January 2024) is an early stage of a large planned
refactor. The motivation for this refactor is a couple of missing features in
both the Rust and C implementations:
- The output side
([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html)
in Rust) doesn't take advantage of the most important SIMD optimizations that
compute multiple blocks in parallel. This blocks any project that wants to
use the BLAKE3 XOF as a stream cipher
([[1]](https://github.com/oconnor663/bessie),
[[2]](https://github.com/oconnor663/blake3_aead)).
- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need
interior nodes of the tree also don't get those SIMD optimizations. They have
to use a slow, minimalistic, unstable, doc-hidden module [(also called
`guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs).
The difficulty with adding those features is that they require changes to all
of our optimized assembly and C intrinsics code. That's a couple dozen
different files that are large, platform-specific, difficult to understand, and
full of duplicated code. The higher-level Rust and C implementations of BLAKE3
both depend on these files and will need to coordinate changes.
At the same time, it won't be long before we add support for more platforms:
- RISC-V vector extensions
- ARM SVE
- WebAssembly SIMD
It's important to get this refactor done before new platforms make it even
harder to do.
## The private guts API
This is the API that each platform reimplements, so we want it to be as simple
as possible apart from the high-performance work it needs to do. It's
completely `unsafe`, and inputs and outputs are raw pointers that are allowed
to alias (this matters for `hash_parents`, see below).
- `degree`
- `compress`
- The single compression function, for short inputs and odd-length tails.
- `hash_chunks`
- `hash_parents`
- `xof`
- `xof_xor`
- As `xof` but XOR'ing the result into the output buffer.
- `universal_hash`
- This is a new construction specifically to support
[BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some
implementations might just stub it out with portable code.
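To make the chunk-hashing contract above concrete, here is a stdlib-only sketch of how one chunk is chained through a compression function, mirroring the flag handling in `hash_one_chunk` in `src/lib.rs`: `CHUNK_START` on the first block, `CHUNK_END` on the last, and the CV threaded through each call. The `compress` stand-in below is a hypothetical placeholder for illustration only, not BLAKE3's real compression function:

```rust
const BLOCK_LEN: usize = 64;
const CHUNK_START: u32 = 1 << 0;
const CHUNK_END: u32 = 1 << 1;

// Hypothetical stand-in for the real compression function. It just folds the
// block words, length, and flags into the CV so the chaining is observable.
fn compress(cv: &[u32; 8], block: &[u8; BLOCK_LEN], block_len: u32, flags: u32) -> [u32; 8] {
    let mut out = *cv;
    for (i, word) in block.chunks_exact(4).enumerate() {
        let w = u32::from_le_bytes(word.try_into().unwrap());
        out[i % 8] = out[i % 8]
            .wrapping_add(w)
            .wrapping_add(block_len)
            .wrapping_add(flags)
            .wrapping_add(i as u32);
    }
    out
}

// Chain one chunk's blocks: CHUNK_START on the first block, CHUNK_END on the
// last. A one-block chunk gets both flags on the same compression.
fn hash_one_chunk(mut input: &[u8], key: &[u32; 8], flags: u32) -> [u32; 8] {
    let mut cv = *key;
    let mut block_flags = flags | CHUNK_START;
    while input.len() > BLOCK_LEN {
        let block: &[u8; BLOCK_LEN] = input[..BLOCK_LEN].try_into().unwrap();
        cv = compress(&cv, block, BLOCK_LEN as u32, block_flags);
        input = &input[BLOCK_LEN..];
        block_flags &= !CHUNK_START;
    }
    // Zero-pad the final (possibly short) block.
    let mut last = [0u8; BLOCK_LEN];
    last[..input.len()].copy_from_slice(input);
    compress(&cv, &last, input.len() as u32, block_flags | CHUNK_END)
}

fn main() {
    let key = [0u32; 8];
    // One block vs. two chained blocks produce different CVs.
    let short = hash_one_chunk(&[0u8; 64], &key, 0);
    let long = hash_one_chunk(&[0u8; 128], &key, 0);
    assert_ne!(short, long);
    assert_ne!(short, key);
}
```

The real private API does the same chaining per chunk, but across `degree` chunks at a time in SIMD lanes.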
## The public guts API
This is the API that this crate exposes to callers, i.e. to the main `blake3`
crate. It's a thin, portable layer on top of the private API above. The Rust
version of this API is memory-safe.
- `degree`
- `compress`
- `hash_chunks`
- `hash_parents`
- This handles most levels of the tree, where we keep hashing `SIMD_DEGREE`
parents at a time.
- `reduce_parents`
- This uses the same `hash_parents` private API, but it handles the top
levels of the tree where we reduce in-place to the root parent node.
- `xof`
- `xof_xor`
- `universal_hash`
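The parent-hashing functions above operate on a transposed CV layout: word `w` of CV `i` lives at word offset `w * STRIDE + i` from the base pointer, with a stride of `2 * MAX_SIMD_DEGREE` words. A stdlib-only sketch of that indexing (the constants mirror `TRANSPOSED_STRIDE` in `src/lib.rs`, but this is an illustration, not the crate's API):

```rust
const MAX_SIMD_DEGREE: usize = 2;
// Stride between rows, in words, matching TRANSPOSED_STRIDE in src/lib.rs.
const STRIDE: usize = 2 * MAX_SIMD_DEGREE;

// One row per CV word, one column per CV.
struct TransposedVectors([[u32; STRIDE]; 8]);

impl TransposedVectors {
    fn new() -> Self {
        Self([[0; STRIDE]; 8])
    }
    // Write an 8-word CV down column `cv_index`.
    fn write_cv(&mut self, cv_index: usize, cv: &[u32; 8]) {
        for word_index in 0..8 {
            self.0[word_index][cv_index] = cv[word_index];
        }
    }
    // Read column `cv_index` back out as a contiguous CV.
    fn extract_cv(&self, cv_index: usize) -> [u32; 8] {
        let mut cv = [0u32; 8];
        for word_index in 0..8 {
            cv[word_index] = self.0[word_index][cv_index];
        }
        cv
    }
}

fn main() {
    let mut vectors = TransposedVectors::new();
    let cv0: [u32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let cv1: [u32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    vectors.write_cv(0, &cv0);
    vectors.write_cv(1, &cv1);
    // Round-trip: each column is one CV.
    assert_eq!(vectors.extract_cv(0), cv0);
    assert_eq!(vectors.extract_cv(1), cv1);
    // Flat view: word w of CV i sits at index w * STRIDE + i.
    let flat: Vec<u32> = vectors.0.iter().flatten().copied().collect();
    assert_eq!(flat[3 * STRIDE + 1], cv1[3]);
}
```

This layout is what lets the SIMD implementations load one word from each of several CVs with a single vector load.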

rust/guts/src/lib.rs Normal file
@@ -0,0 +1,1000 @@
//! # The BLAKE3 Guts API
//!
//! See `readme.md`.
//!
//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`]
//! that atomically initializes itself the first time you use it.
//!
//! # Example
//!
//! ```rust
//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT};
//!
//! // Hash an input of exactly two chunks.
//! let input = [0u8; 2048];
//! let mut outputs = TransposedVectors::new();
//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs);
//! DETECTED_IMPL.hash_chunks(
//! &input,
//! &IV_BYTES,
//! 0, // counter
//! 0, // flags
//! left_outputs,
//! );
//! let root_node = outputs.extract_parent_node(0);
//! let hash = DETECTED_IMPL.compress(
//! &root_node,
//! 64, // block_len
//! &IV_BYTES,
//! 0, // counter
//! PARENT | ROOT,
//! );
//!
//! // Compute the same hash using the reference implementation.
//! let mut reference_hasher = reference_impl::Hasher::new();
//! reference_hasher.update(&input);
//! let mut expected_hash = [0u8; 32];
//! reference_hasher.finalize(&mut expected_hash);
//!
//! assert_eq!(hash, expected_hash);
//! ```
// Tests always require libstd.
#![cfg_attr(all(not(feature = "std"), not(test)), no_std)]
use core::cmp;
use core::marker::PhantomData;
use core::mem;
use core::ptr;
use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};
pub mod portable;
#[cfg(test)]
mod test;
pub const OUT_LEN: usize = 32;
pub const BLOCK_LEN: usize = 64;
pub const CHUNK_LEN: usize = 1024;
pub const WORD_LEN: usize = 4;
pub const UNIVERSAL_HASH_LEN: usize = 16;
pub const CHUNK_START: u32 = 1 << 0;
pub const CHUNK_END: u32 = 1 << 1;
pub const PARENT: u32 = 1 << 2;
pub const ROOT: u32 = 1 << 3;
pub const KEYED_HASH: u32 = 1 << 4;
pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5;
pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6;
pub const IV: CVWords = [
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV);
pub const MSG_SCHEDULE: [[usize; 16]; 7] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
[3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
[10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
[12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
[9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
[11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];
// never less than 2
pub const MAX_SIMD_DEGREE: usize = 2;
pub type CVBytes = [u8; 32];
pub type CVWords = [u32; 8];
pub type BlockBytes = [u8; 64];
pub type BlockWords = [u32; 16];
pub static DETECTED_IMPL: Implementation = Implementation::new(
degree_init,
compress_init,
hash_chunks_init,
hash_parents_init,
xof_init,
xof_xor_init,
universal_hash_init,
);
fn detect() -> Implementation {
portable::implementation()
}
fn init_detected_impl() {
let detected = detect();
DETECTED_IMPL
.degree_ptr
.store(detected.degree_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.compress_ptr
.store(detected.compress_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.hash_chunks_ptr
.store(detected.hash_chunks_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.hash_parents_ptr
.store(detected.hash_parents_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.xof_ptr
.store(detected.xof_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.xof_xor_ptr
.store(detected.xof_xor_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.universal_hash_ptr
.store(detected.universal_hash_ptr.load(Relaxed), Relaxed);
}
pub struct Implementation {
degree_ptr: AtomicPtr<()>,
compress_ptr: AtomicPtr<()>,
hash_chunks_ptr: AtomicPtr<()>,
hash_parents_ptr: AtomicPtr<()>,
xof_ptr: AtomicPtr<()>,
xof_xor_ptr: AtomicPtr<()>,
universal_hash_ptr: AtomicPtr<()>,
}
impl Implementation {
const fn new(
degree_fn: DegreeFn,
compress_fn: CompressFn,
hash_chunks_fn: HashChunksFn,
hash_parents_fn: HashParentsFn,
xof_fn: XofFn,
xof_xor_fn: XofFn,
universal_hash_fn: UniversalHashFn,
) -> Self {
Self {
degree_ptr: AtomicPtr::new(degree_fn as *mut ()),
compress_ptr: AtomicPtr::new(compress_fn as *mut ()),
hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()),
hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()),
xof_ptr: AtomicPtr::new(xof_fn as *mut ()),
xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()),
universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()),
}
}
#[inline]
fn degree_fn(&self) -> DegreeFn {
unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) }
}
#[inline]
pub fn degree(&self) -> usize {
let degree = unsafe { self.degree_fn()() };
debug_assert!(degree >= 2);
debug_assert!(degree <= MAX_SIMD_DEGREE);
debug_assert_eq!(1, degree.count_ones(), "power of 2");
degree
}
#[inline]
pub fn split_transposed_vectors<'v>(
&self,
vectors: &'v mut TransposedVectors,
) -> (TransposedSplit<'v>, TransposedSplit<'v>) {
unsafe { vectors.split(self.degree()) }
}
#[inline]
fn compress_fn(&self) -> CompressFn {
unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) }
}
#[inline]
pub fn compress(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
counter: u64,
flags: u32,
) -> CVBytes {
let mut out = [0u8; 32];
unsafe {
self.compress_fn()(block, block_len, cv, counter, flags, &mut out);
}
out
}
// The contract for HashChunksFn doesn't require the implementation to support single-chunk
// inputs. Instead we handle that case here by calling compress in a loop.
#[inline]
fn hash_one_chunk(
&self,
mut input: &[u8],
key: &CVBytes,
counter: u64,
mut flags: u32,
output: TransposedSplit,
) {
debug_assert!(input.len() <= CHUNK_LEN);
let mut cv = *key;
flags |= CHUNK_START;
while input.len() > BLOCK_LEN {
cv = self.compress(
input[..BLOCK_LEN].try_into().unwrap(),
BLOCK_LEN as u32,
&cv,
counter,
flags,
);
input = &input[BLOCK_LEN..];
flags &= !CHUNK_START;
}
let mut final_block = [0u8; BLOCK_LEN];
final_block[..input.len()].copy_from_slice(input);
cv = self.compress(
&final_block,
input.len() as u32,
&cv,
counter,
flags | CHUNK_END,
);
unsafe {
write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr);
}
}
#[inline]
fn hash_chunks_fn(&self) -> HashChunksFn {
unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) }
}
#[inline]
pub fn hash_chunks(
&self,
input: &[u8],
key: &CVBytes,
counter: u64,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(input.len() <= self.degree() * CHUNK_LEN);
if input.len() <= CHUNK_LEN {
// The underlying hash_chunks_fn isn't required to support this case. Instead we handle
// it by calling compress_fn in a loop. But note that we still don't support root
// finalization or the empty input here.
self.hash_one_chunk(input, key, counter, flags, transposed_output);
return 1;
}
// SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently
// ignore the remainder. This makes it impossible to write out of bounds in a properly
// constructed TransposedSplit.
let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN);
unsafe {
self.hash_chunks_fn()(
input.as_ptr(),
len,
key,
counter,
flags,
transposed_output.ptr,
);
}
if input.len() % CHUNK_LEN == 0 {
input.len() / CHUNK_LEN
} else {
(input.len() / CHUNK_LEN) + 1
}
}
#[inline]
fn hash_parents_fn(&self) -> HashParentsFn {
unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) }
}
#[inline]
pub fn hash_parents(
&self,
transposed_input: &TransposedVectors,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(
transposed_input.as_ptr(),
num_parents,
key,
flags | PARENT,
transposed_output.ptr,
);
}
if num_cvs % 2 == 1 {
unsafe {
write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents));
}
num_parents + 1
} else {
num_parents
}
}
#[inline]
pub fn reduce_parents(
&self,
transposed_in_out: &mut TransposedVectors,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
) -> usize {
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let in_out_ptr = transposed_in_out.as_mut_ptr();
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr);
}
if num_cvs % 2 == 1 {
unsafe {
write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents));
}
num_parents + 1
} else {
num_parents
}
}
#[inline]
fn xof_fn(&self) -> XofFn {
unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) }
}
#[inline]
pub fn xof(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
mut counter: u64,
flags: u32,
mut out: &mut [u8],
) {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
while !out.is_empty() {
let take = cmp::min(simd_len, out.len());
unsafe {
self.xof_fn()(
block,
block_len,
cv,
counter,
flags | ROOT,
out.as_mut_ptr(),
take,
);
}
out = &mut out[take..];
counter += degree as u64;
}
}
#[inline]
fn xof_xor_fn(&self) -> XofFn {
unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) }
}
#[inline]
pub fn xof_xor(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
mut counter: u64,
flags: u32,
mut out: &mut [u8],
) {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
while !out.is_empty() {
let take = cmp::min(simd_len, out.len());
unsafe {
self.xof_xor_fn()(
block,
block_len,
cv,
counter,
flags | ROOT,
out.as_mut_ptr(),
take,
);
}
out = &mut out[take..];
counter += degree as u64;
}
}
#[inline]
fn universal_hash_fn(&self) -> UniversalHashFn {
unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) }
}
#[inline]
pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
let mut ret = [0u8; 16];
while !input.is_empty() {
let take = cmp::min(simd_len, input.len());
let mut output = [0u8; 16];
unsafe {
self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output);
}
input = &input[take..];
counter += degree as u64;
for byte_index in 0..16 {
ret[byte_index] ^= output[byte_index];
}
}
ret
}
}
impl Clone for Implementation {
fn clone(&self) -> Self {
Self {
degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)),
compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)),
hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)),
hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)),
xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)),
xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)),
universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)),
}
}
}
// never less than 2
type DegreeFn = unsafe extern "C" fn() -> usize;
unsafe extern "C" fn degree_init() -> usize {
init_detected_impl();
DETECTED_IMPL.degree_fn()()
}
type CompressFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes, // may overlap the input
);
unsafe extern "C" fn compress_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes,
) {
init_detected_impl();
DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out);
}
type CompressXofFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes, // may overlap the input
);
type HashChunksFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
);
unsafe extern "C" fn hash_chunks_init(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
init_detected_impl();
DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output);
}
type HashParentsFn = unsafe extern "C" fn(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
);
unsafe extern "C" fn hash_parents_init(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32,
) {
init_detected_impl();
DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output);
}
// This signature covers both xof() and xof_xor().
type XofFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
);
unsafe extern "C" fn xof_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
init_detected_impl();
DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len);
}
unsafe extern "C" fn xof_xor_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
init_detected_impl();
DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len);
}
type UniversalHashFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
);
unsafe extern "C" fn universal_hash_init(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
init_detected_impl();
DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out);
}
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_chunks_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const CVBytes,
mut counter: u64,
flags: u32,
mut transposed_output: *mut u32,
) {
debug_assert!(input_len > 0);
debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN);
input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN);
while input_len > 0 {
let mut chunk_len = cmp::min(input_len, CHUNK_LEN);
input_len -= chunk_len;
// Chain the CV through each block of this chunk.
let mut cv = *key;
let cv_ptr: *mut CVBytes = &mut cv;
let mut chunk_flags = flags | CHUNK_START;
while chunk_len > BLOCK_LEN {
compress(
input as *const BlockBytes,
BLOCK_LEN as u32,
cv_ptr,
counter,
chunk_flags,
cv_ptr,
);
input = input.add(BLOCK_LEN);
chunk_len -= BLOCK_LEN;
chunk_flags &= !CHUNK_START;
}
let mut last_block = [0u8; BLOCK_LEN];
ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len);
input = input.add(chunk_len);
compress(
&last_block,
chunk_len as u32,
cv_ptr,
counter,
chunk_flags | CHUNK_END,
cv_ptr,
);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv_words[word_index]);
}
transposed_output = transposed_output.add(1);
counter += 1;
}
}
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_parents_using_compress(
compress: CompressFn,
mut transposed_input: *const u32,
mut num_parents: usize,
key: *const CVBytes,
flags: u32,
mut transposed_output: *mut u32, // may overlap the input
) {
debug_assert!(num_parents > 0);
debug_assert!(num_parents <= MAX_SIMD_DEGREE);
while num_parents > 0 {
let mut block_bytes = [0u8; 64];
for word_index in 0..8 {
let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read();
block_bytes[WORD_LEN * word_index..][..WORD_LEN]
.copy_from_slice(&left_child_word.to_le_bytes());
let right_child_word = transposed_input
.add(word_index * TRANSPOSED_STRIDE + 1)
.read();
block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN]
.copy_from_slice(&right_child_word.to_le_bytes());
}
let mut cv = [0u8; 32];
compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv_words[word_index]);
}
transposed_input = transposed_input.add(2);
transposed_output = transposed_output.add(1);
num_parents -= 1;
}
}
#[inline(always)]
unsafe fn xof_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN);
while out_len > 0 {
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
ptr::copy_nonoverlapping(block_output.as_ptr(), out, take);
out = out.add(take);
out_len -= take;
counter += 1;
}
}
#[inline(always)]
unsafe fn xof_xor_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN);
while out_len > 0 {
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
for i in 0..take {
*out.add(i) ^= block_output[i];
}
out = out.add(take);
out_len -= take;
counter += 1;
}
}
#[inline(always)]
unsafe fn universal_hash_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const CVBytes,
mut counter: u64,
out: *mut [u8; 16],
) {
let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT;
let mut result = [0u8; 16];
while input_len > 0 {
let block_len = cmp::min(input_len, BLOCK_LEN);
let mut block = [0u8; BLOCK_LEN];
ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len);
let mut block_output = [0u8; 32];
compress(
&block,
block_len as u32,
key,
counter,
flags,
&mut block_output,
);
for i in 0..16 {
result[i] ^= block_output[i];
}
input = input.add(block_len);
input_len -= block_len;
counter += 1;
}
*out = result;
}
// this is in units of *words*, for pointer operations on *const/*mut u32
const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE;
#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]);
impl TransposedVectors {
pub fn new() -> Self {
Self([[0; 2 * MAX_SIMD_DEGREE]; 8])
}
pub fn extract_cv(&self, cv_index: usize) -> CVBytes {
let mut words = [0u32; 8];
for word_index in 0..8 {
words[word_index] = self.0[word_index][cv_index];
}
le_bytes_from_words_32(&words)
}
pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes {
let mut bytes = [0u8; 64];
bytes[..32].copy_from_slice(&self.extract_cv(2 * parent_index));
bytes[32..].copy_from_slice(&self.extract_cv(2 * parent_index + 1));
bytes
}
fn as_ptr(&self) -> *const u32 {
self.0[0].as_ptr()
}
fn as_mut_ptr(&mut self) -> *mut u32 {
self.0[0].as_mut_ptr()
}
// SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not
// necessarily correct) to write up to `degree` words to either side of the split, possibly
// from different threads.
unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) {
debug_assert!(degree > 0);
debug_assert!(degree <= MAX_SIMD_DEGREE);
debug_assert_eq!(degree.count_ones(), 1, "power of 2");
let ptr = self.as_mut_ptr();
let left = TransposedSplit {
ptr,
phantom_data: PhantomData,
};
let right = TransposedSplit {
ptr: ptr.wrapping_add(degree),
phantom_data: PhantomData,
};
(left, right)
}
}
pub struct TransposedSplit<'vectors> {
ptr: *mut u32,
phantom_data: PhantomData<&'vectors mut u32>,
}
unsafe impl<'vectors> Send for TransposedSplit<'vectors> {}
unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {}
unsafe fn read_transposed_cv(src: *const u32) -> CVWords {
let mut cv = [0u32; 8];
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
cv[word_index] = src.add(offset_words).read();
}
cv
}
unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) {
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
dest.add(offset_words).write(cv[word_index]);
}
}
#[inline(always)]
pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
let mut bytes = [0u8; 32];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
#[inline(always)]
pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes {
let mut bytes = [0u8; 64];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
#[inline(always)]
pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
let mut words = [0u32; 8];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[inline(always)]
pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords {
let mut words = [0u32; 16];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[test]
fn test_byte_word_round_trips() {
let cv = *b"This is 32 LE bytes/eight words.";
assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv)));
let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words.";
assert_eq!(
block,
le_bytes_from_words_64(&words_from_le_bytes_64(&block)),
);
}
// The largest power of two less than or equal to `n`, used for left_len()
// immediately below, and also directly in Hasher::update().
pub fn largest_power_of_two_leq(n: usize) -> usize {
((n / 2) + 1).next_power_of_two()
}
#[test]
fn test_largest_power_of_two_leq() {
let input_output = &[
// The zero case is nonsensical, but it does work.
(0, 1),
(1, 1),
(2, 2),
(3, 2),
(4, 4),
(5, 4),
(6, 4),
(7, 4),
(8, 8),
// the largest possible usize
(usize::MAX, (usize::MAX >> 1) + 1),
];
for &(input, output) in input_output {
assert_eq!(
output,
crate::largest_power_of_two_leq(input),
"wrong output for n={}",
input
);
}
}
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
pub fn left_len(content_len: usize) -> usize {
debug_assert!(content_len > CHUNK_LEN);
// Subtract 1 to reserve at least one byte for the right side.
let full_chunks = (content_len - 1) / CHUNK_LEN;
largest_power_of_two_leq(full_chunks) * CHUNK_LEN
}
#[test]
fn test_left_len() {
let input_output = &[
(CHUNK_LEN + 1, CHUNK_LEN),
(2 * CHUNK_LEN - 1, CHUNK_LEN),
(2 * CHUNK_LEN, CHUNK_LEN),
(2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
(4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
(4 * CHUNK_LEN, 2 * CHUNK_LEN),
(4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
];
for &(input, output) in input_output {
assert_eq!(left_len(input), output);
}
}

rust/guts/src/portable.rs Normal file
@@ -0,0 +1,262 @@
use crate::{
le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64,
BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE,
};
const DEGREE: usize = MAX_SIMD_DEGREE;
unsafe extern "C" fn degree() -> usize {
DEGREE
}
#[inline(always)]
fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
state[d] = (state[d] ^ state[a]).rotate_right(16);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(12);
state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
state[d] = (state[d] ^ state[a]).rotate_right(8);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(7);
}
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) {
// Select the message schedule based on the round.
let schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the diagonals.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
#[inline(always)]
fn compress_inner(
block_words: &BlockWords,
block_len: u32,
cv_words: &CVWords,
counter: u64,
flags: u32,
) -> [u32; 16] {
let mut state = [
cv_words[0],
cv_words[1],
cv_words[2],
cv_words[3],
cv_words[4],
cv_words[5],
cv_words[6],
cv_words[7],
IV[0],
IV[1],
IV[2],
IV[3],
counter as u32,
(counter >> 32) as u32,
block_len,
flags,
];
for round_number in 0..7 {
round(&mut state, block_words, round_number);
}
state
}
pub(crate) unsafe extern "C" fn compress(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes,
) {
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
}
*out = le_bytes_from_words_32(state[..8].try_into().unwrap());
}
pub(crate) unsafe extern "C" fn compress_xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes,
) {
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
state[word_index + 8] ^= cv_words[word_index];
}
*out = le_bytes_from_words_64(&state);
}
pub(crate) unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
crate::hash_chunks_using_compress(
compress,
input,
input_len,
key,
counter,
flags,
transposed_output,
)
}
pub(crate) unsafe extern "C" fn hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
) {
crate::hash_parents_using_compress(
compress,
transposed_input,
num_parents,
key,
flags,
transposed_output,
)
}
pub(crate) unsafe extern "C" fn xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub(crate) unsafe extern "C" fn xof_xor(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_xor_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub(crate) unsafe extern "C" fn universal_hash(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
crate::universal_hash_using_compress(compress, input, input_len, key, counter, out)
}
pub fn implementation() -> Implementation {
Implementation::new(
degree,
compress,
hash_chunks,
hash_parents,
xof,
xof_xor,
universal_hash,
)
}
#[cfg(test)]
mod test {
use super::*;
// This is circular but do it anyway.
#[test]
fn test_compress_vs_portable() {
crate::test::test_compress_vs_portable(&implementation());
}
#[test]
fn test_compress_vs_reference() {
crate::test::test_compress_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_hash_chunks_vs_portable() {
crate::test::test_hash_chunks_vs_portable(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_hash_parents_vs_portable() {
crate::test::test_hash_parents_vs_portable(&implementation());
}
#[test]
fn test_chunks_and_parents_vs_reference() {
crate::test::test_chunks_and_parents_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_xof_vs_portable() {
crate::test::test_xof_vs_portable(&implementation());
}
#[test]
fn test_xof_vs_reference() {
crate::test::test_xof_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_universal_hash_vs_portable() {
crate::test::test_universal_hash_vs_portable(&implementation());
}
#[test]
fn test_universal_hash_vs_reference() {
crate::test::test_universal_hash_vs_reference(&implementation());
}
}

rust/guts/src/test.rs Normal file
@@ -0,0 +1,523 @@
use crate::*;
pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
// Test a few different initial counter values.
// - 0: The base case.
// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when
// you're supposed to ANDNOT.
// - u32::MAX: The low word of the counter overflows for all inputs except the first.
// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word.
const INITIAL_COUNTERS: [u64; 4] = [
0,
i32::MAX as u64,
u32::MAX as u64,
(42u64 << 32) + u32::MAX as u64,
];
const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64];
pub fn paint_test_input(buf: &mut [u8]) {
for (i, b) in buf.iter_mut().enumerate() {
*b = (i % 251) as u8;
}
}
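The `% 251` in `paint_test_input` gives the test pattern a prime period: 251 is the largest prime below 256, so the pattern never lines up with 64-byte block or 1024-byte chunk boundaries. A standalone sketch (repeating the function above so the snippet compiles on its own):

```rust
// Standalone copy of paint_test_input for illustration. Because 251 is prime,
// the byte pattern's period never divides the 64-byte block length or the
// 1024-byte chunk length.
fn paint_test_input(buf: &mut [u8]) {
    for (i, b) in buf.iter_mut().enumerate() {
        *b = (i % 251) as u8;
    }
}

fn main() {
    let mut buf = [0u8; 512];
    paint_test_input(&mut buf);
    // The pattern repeats at offset 251, not at any power of two...
    assert_eq!(buf[251], buf[0]);
    // ...so bytes at the same offset in adjacent 256-byte regions differ.
    assert_ne!(buf[0], buf[256]);
}
```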
pub fn test_compress_vs_portable(test_impl: &Implementation) {
for block_len in BLOCK_LENGTHS {
dbg!(block_len);
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len]);
for counter in INITIAL_COUNTERS {
dbg!(counter);
let portable_cv = portable::implementation().compress(
&block,
block_len as u32,
&TEST_KEY,
counter,
KEYED_HASH,
);
let test_cv =
test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH);
assert_eq!(portable_cv, test_cv);
}
}
}
pub fn test_compress_vs_reference(test_impl: &Implementation) {
for block_len in BLOCK_LENGTHS {
dbg!(block_len);
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len]);
let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
ref_hasher.update(&block[..block_len]);
let mut ref_hash = [0u8; 32];
ref_hasher.finalize(&mut ref_hash);
let test_cv = test_impl.compress(
&block,
block_len as u32,
&TEST_KEY,
0,
CHUNK_START | CHUNK_END | ROOT | KEYED_HASH,
);
assert_eq!(ref_hash, test_cv);
}
}
fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) {
if output_a == output_b {
return;
}
for cv_index in 0..2 * MAX_SIMD_DEGREE {
let cv_a = output_a.extract_cv(cv_index);
let cv_b = output_b.extract_cv(cv_index);
if cv_a == [0; 32] && cv_b == [0; 32] {
println!("CV {cv_index:2} empty");
} else if cv_a == cv_b {
println!("CV {cv_index:2} matches");
} else {
println!("CV {cv_index:2} mismatch:");
println!(" {}", hex::encode(cv_a));
println!(" {}", hex::encode(cv_b));
}
}
panic!("transposed outputs are not equal");
}
pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) {
assert!(test_impl.degree() <= MAX_SIMD_DEGREE);
dbg!(test_impl.degree() * CHUNK_LEN);
// Allocate 4 extra bytes of padding so we can make aligned slices.
let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4];
let mut input_slice = &mut input_buf[..];
// Make sure the start of the input is word-aligned.
while input_slice.as_ptr() as usize % 4 != 0 {
input_slice = &mut input_slice[1..];
}
let (aligned_input, mut unaligned_input) =
input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN);
unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN];
assert_eq!(aligned_input.as_ptr() as usize % 4, 0);
assert_eq!(unaligned_input.as_ptr() as usize % 4, 1);
paint_test_input(aligned_input);
paint_test_input(unaligned_input);
// Try just below, equal to, and just above every whole number of chunks.
let mut input_2_lengths = Vec::new();
let mut next_len = 2 * CHUNK_LEN;
loop {
// 95 is one whole block plus one interesting part of another
input_2_lengths.push(next_len - 95);
input_2_lengths.push(next_len);
if next_len == test_impl.degree() * CHUNK_LEN {
break;
}
input_2_lengths.push(next_len + 95);
next_len += CHUNK_LEN;
}
for input_2_len in input_2_lengths {
dbg!(input_2_len);
let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN];
let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len];
let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN];
let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len];
for initial_counter in INITIAL_COUNTERS {
dbg!(initial_counter);
// Make two calls, to test the output_column parameter.
let mut portable_output = TransposedVectors::new();
let (portable_left, portable_right) =
test_impl.split_transposed_vectors(&mut portable_output);
portable::implementation().hash_chunks(
aligned_input1,
&IV_BYTES,
initial_counter,
0,
portable_left,
);
portable::implementation().hash_chunks(
aligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
portable_right,
);
let mut test_output = TransposedVectors::new();
let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output);
test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left);
test_impl.hash_chunks(
aligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
test_right,
);
check_transposed_eq(&portable_output, &test_output);
// Do the same thing with unaligned input.
let mut unaligned_test_output = TransposedVectors::new();
let (unaligned_left, unaligned_right) =
test_impl.split_transposed_vectors(&mut unaligned_test_output);
test_impl.hash_chunks(
unaligned_input1,
&IV_BYTES,
initial_counter,
0,
unaligned_left,
);
test_impl.hash_chunks(
unaligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
unaligned_right,
);
check_transposed_eq(&portable_output, &unaligned_test_output);
}
}
}
fn painted_transposed_input() -> TransposedVectors {
let mut vectors = TransposedVectors::new();
let mut val = 0;
for col in 0..2 * MAX_SIMD_DEGREE {
for row in 0..8 {
vectors.0[row][col] = val;
val += 1;
}
}
vectors
}
pub fn test_hash_parents_vs_portable(test_impl: &Implementation) {
assert!(test_impl.degree() <= MAX_SIMD_DEGREE);
let input = painted_transposed_input();
for num_parents in 2..=(test_impl.degree() / 2) {
dbg!(num_parents);
let mut portable_output = TransposedVectors::new();
let (portable_left, portable_right) =
test_impl.split_transposed_vectors(&mut portable_output);
portable::implementation().hash_parents(
&input,
2 * num_parents, // num_cvs
&IV_BYTES,
0,
portable_left,
);
portable::implementation().hash_parents(
&input,
2 * num_parents, // num_cvs
&TEST_KEY,
KEYED_HASH,
portable_right,
);
let mut test_output = TransposedVectors::new();
let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output);
test_impl.hash_parents(
&input,
2 * num_parents, // num_cvs
&IV_BYTES,
0,
test_left,
);
test_impl.hash_parents(
&input,
2 * num_parents, // num_cvs
&TEST_KEY,
KEYED_HASH,
test_right,
);
check_transposed_eq(&portable_output, &test_output);
}
}
fn hash_with_chunks_and_parents_recurse(
test_impl: &Implementation,
input: &[u8],
counter: u64,
output: TransposedSplit,
) -> usize {
assert!(input.len() > 0);
if input.len() <= test_impl.degree() * CHUNK_LEN {
return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output);
}
let (left_input, right_input) = input.split_at(left_len(input.len()));
let mut child_output = TransposedVectors::new();
let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output);
let mut children =
hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output);
assert_eq!(children, test_impl.degree());
children += hash_with_chunks_and_parents_recurse(
test_impl,
right_input,
counter + (left_input.len() / CHUNK_LEN) as u64,
right_output,
);
test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output)
}
// Note: This test implementation doesn't support the 1-chunk-or-less case.
fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes {
// TODO: handle the 1-chunk case?
assert!(input.len() > CHUNK_LEN);
let mut cvs = TransposedVectors::new();
// The right half of these vectors are never used.
let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs);
let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left);
while num_cvs > 2 {
num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0);
}
test_impl.compress(
&cvs.extract_parent_node(0),
BLOCK_LEN as u32,
&IV_BYTES,
0,
PARENT | ROOT,
)
}
pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) {
assert_eq!(test_impl.degree().count_ones(), 1, "power of 2");
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN;
let mut input_buf = [0u8; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try just below, equal to, and just above every whole number of chunks, except that
// root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case.
let mut test_lengths = vec![CHUNK_LEN + 1];
let mut next_len = 2 * CHUNK_LEN;
loop {
// 95 is one whole block plus one interesting part of another
test_lengths.push(next_len - 95);
test_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
test_lengths.push(next_len + 95);
next_len += CHUNK_LEN;
}
for test_len in test_lengths {
dbg!(test_len);
let input = &input_buf[..test_len];
let mut ref_hasher = reference_impl::Hasher::new();
ref_hasher.update(&input);
let mut ref_hash = [0u8; 32];
ref_hasher.finalize(&mut ref_hash);
let test_hash = root_hash_with_chunks_and_parents(test_impl, input);
assert_eq!(ref_hash, test_hash);
}
}
pub fn test_xof_vs_portable(test_impl: &Implementation) {
let flags = CHUNK_START | CHUNK_END | KEYED_HASH;
for counter in INITIAL_COUNTERS {
dbg!(counter);
for input_len in [0, 1, BLOCK_LEN] {
dbg!(input_len);
let mut input_block = [0u8; BLOCK_LEN];
for byte_index in 0..input_len {
input_block[byte_index] = byte_index as u8 + 42;
}
// Try equal to and partway through every whole number of output blocks.
const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut output_lengths = Vec::new();
let mut next_len = 0;
loop {
output_lengths.push(next_len);
if next_len == MAX_OUTPUT_LEN {
break;
}
output_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for output_len in output_lengths {
dbg!(output_len);
let mut portable_output = [0xff; MAX_OUTPUT_LEN];
portable::implementation().xof(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut portable_output[..output_len],
);
let mut test_output = [0xff; MAX_OUTPUT_LEN];
test_impl.xof(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert_eq!(portable_output, test_output);
// Double check that the implementation didn't overwrite.
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
// The first XOR cancels out the output.
test_impl.xof_xor(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert!(test_output[..output_len].iter().all(|&b| b == 0));
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
// The second XOR restores the output.
test_impl.xof_xor(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert_eq!(portable_output, test_output);
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
}
}
}
}
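The paired `xof_xor` calls above rely on XOR being an involution: XORing the same keystream into a buffer twice returns it to its original contents. A minimal standalone illustration, using an arbitrary toy keystream rather than real BLAKE3 output:

```rust
// XOR a keystream into a buffer in place, the same shape of operation that
// xof_xor performs with BLAKE3's extended output.
fn xor_in_place(buf: &mut [u8], stream: &[u8]) {
    for (b, s) in buf.iter_mut().zip(stream) {
        *b ^= s;
    }
}

fn main() {
    let stream: Vec<u8> = (0..64u8).map(|i| i.wrapping_mul(37)).collect();
    // Start the buffer equal to the keystream, like test_output after xof().
    let mut buf = stream.clone();
    xor_in_place(&mut buf, &stream);
    // The first XOR cancels the output to all zeros.
    assert!(buf.iter().all(|&b| b == 0));
    xor_in_place(&mut buf, &stream);
    // The second XOR restores it.
    assert_eq!(buf, stream);
}
```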
pub fn test_xof_vs_reference(test_impl: &Implementation) {
let input = b"hello world";
let mut input_block = [0; BLOCK_LEN];
input_block[..input.len()].copy_from_slice(input);
const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut ref_output = [0; MAX_OUTPUT_LEN];
let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
ref_hasher.update(input);
ref_hasher.finalize(&mut ref_output);
// Try equal to and partway through every whole number of output blocks.
let mut output_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
output_lengths.push(next_len);
if next_len == MAX_OUTPUT_LEN {
break;
}
output_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for output_len in output_lengths {
dbg!(output_len);
let mut test_output = [0; MAX_OUTPUT_LEN];
test_impl.xof(
&input_block,
input.len() as u32,
&TEST_KEY,
0,
KEYED_HASH | CHUNK_START | CHUNK_END,
&mut test_output[..output_len],
);
assert_eq!(ref_output[..output_len], test_output[..output_len]);
// Double check that the implementation didn't overwrite.
assert!(test_output[output_len..].iter().all(|&b| b == 0));
// Do it again starting from block 1.
if output_len >= BLOCK_LEN {
test_impl.xof(
&input_block,
input.len() as u32,
&TEST_KEY,
1,
KEYED_HASH | CHUNK_START | CHUNK_END,
&mut test_output[..output_len - BLOCK_LEN],
);
assert_eq!(
ref_output[BLOCK_LEN..output_len],
test_output[..output_len - BLOCK_LEN],
);
}
}
}
pub fn test_universal_hash_vs_portable(test_impl: &Implementation) {
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut input_buf = [0; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try equal to and partway through every whole number of input blocks.
let mut input_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
input_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
input_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for input_len in input_lengths {
dbg!(input_len);
for counter in INITIAL_COUNTERS {
dbg!(counter);
let portable_output = portable::implementation().universal_hash(
&input_buf[..input_len],
&TEST_KEY,
counter,
);
let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter);
assert_eq!(portable_output, test_output);
}
}
}
fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] {
// The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended
// output to seek to a block.
const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE;
assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS);
let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS];
let mut result = [0u8; UNIVERSAL_HASH_LEN];
let mut block_start = 0;
while block_start < input.len() {
let block_len = cmp::min(input.len() - block_start, BLOCK_LEN);
let mut ref_hasher = reference_impl::Hasher::new_keyed(key);
ref_hasher.update(&input[block_start..block_start + block_len]);
ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]);
for byte_index in 0..UNIVERSAL_HASH_LEN {
result[byte_index] ^= output_buffer[block_start + byte_index];
}
block_start += BLOCK_LEN;
}
result
}
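The reference-based universal hash above has a simple shape: hash each 64-byte block independently (at its own XOF offset), then XOR the 16-byte results together. A toy sketch of that XOR-fold structure, where `toy_block_hash` is a hypothetical stand-in mixing function, not BLAKE3:

```rust
const BLOCK_LEN: usize = 64;
const UNIVERSAL_HASH_LEN: usize = 16;

// Hypothetical stand-in for a per-block keyed hash (NOT BLAKE3); it just
// mixes bytes so the XOR-fold structure is visible and testable.
fn toy_block_hash(block: &[u8], key: u8) -> [u8; UNIVERSAL_HASH_LEN] {
    let mut out = [key; UNIVERSAL_HASH_LEN];
    for (i, &b) in block.iter().enumerate() {
        let lane = i % UNIVERSAL_HASH_LEN;
        out[lane] = out[lane].wrapping_add(b).rotate_left(3);
    }
    out
}

// Hash each block separately, then XOR the results, mirroring the loop in
// reference_impl_universal_hash above.
fn toy_universal_hash(input: &[u8], key: u8) -> [u8; UNIVERSAL_HASH_LEN] {
    let mut result = [0u8; UNIVERSAL_HASH_LEN];
    for block in input.chunks(BLOCK_LEN) {
        let h = toy_block_hash(block, key);
        for i in 0..UNIVERSAL_HASH_LEN {
            result[i] ^= h[i];
        }
    }
    result
}

fn main() {
    let input: Vec<u8> = (0..200u8).collect();
    // Deterministic for a fixed key and input.
    assert_eq!(toy_universal_hash(&input, 7), toy_universal_hash(&input, 7));
    // Flipping one input bit changes the result.
    let mut tweaked = input.clone();
    tweaked[0] ^= 1;
    assert_ne!(toy_universal_hash(&input, 7), toy_universal_hash(&tweaked, 7));
    // A single-block input reduces to one block hash.
    assert_eq!(toy_universal_hash(&input[..10], 7), toy_block_hash(&input[..10], 7));
}
```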
pub fn test_universal_hash_vs_reference(test_impl: &Implementation) {
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut input_buf = [0; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try equal to and partway through every whole number of input blocks.
let mut input_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
input_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
input_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for input_len in input_lengths {
dbg!(input_len);
let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY);
let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0);
assert_eq!(ref_output, test_output);
}
}
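The `INITIAL_COUNTERS` values at the top of this file cluster around the 32-bit boundary because the compression function consumes the 64-bit chunk counter as two 32-bit words, and incrementing past `u32::MAX` must carry into the high word. A standalone sketch of that split (the helper name `split_counter` is ours, for illustration only):

```rust
// Split a 64-bit chunk counter into the (low, high) 32-bit words that the
// compression function consumes. Counters near u32::MAX exercise the carry
// into the high word, which is where SIMD carry bugs would show up.
fn split_counter(counter: u64) -> (u32, u32) {
    (counter as u32, (counter >> 32) as u32)
}

fn main() {
    // One past u32::MAX carries into the high word.
    assert_eq!(split_counter(u32::MAX as u64 + 1), (0, 1));
    // (42 << 32) + u32::MAX keeps 42 in the high word until the next carry.
    assert_eq!(split_counter((42u64 << 32) + u32::MAX as u64), (u32::MAX, 42));
}
```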

src/io.rs (new file)
@@ -0,0 +1,79 @@
//! Helper functions for efficient IO.
#[cfg(feature = "std")]
pub(crate) fn copy_wide(
mut reader: impl std::io::Read,
hasher: &mut crate::Hasher,
) -> std::io::Result<u64> {
let mut buffer = [0; 65536];
let mut total = 0;
loop {
match reader.read(&mut buffer) {
Ok(0) => return Ok(total),
Ok(n) => {
hasher.update(&buffer[..n]);
total += n as u64;
}
// see test_update_reader_interrupted
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e),
}
}
}
// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
// fails, return the error.
//
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
// this function is crate-private, we can guarantee that all that can ever happen in the event of a race
// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
// which should risk memory corruption in a safe caller.
//
// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
// different here. Feedback needed.
#[cfg(feature = "mmap")]
pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
let metadata = file.metadata()?;
let file_size = metadata.len();
#[allow(clippy::if_same_then_else)]
if !metadata.is_file() {
// Not a real file.
Ok(None)
} else if file_size > isize::max_value() as u64 {
// Too long to safely map.
// https://github.com/danburkert/memmap-rs/issues/69
Ok(None)
} else if file_size == 0 {
// Mapping an empty file currently fails.
// https://github.com/danburkert/memmap-rs/issues/72
// See test_mmap_virtual_file.
Ok(None)
} else if file_size < 16 * 1024 {
// Mapping small files is not worth it.
Ok(None)
} else {
// Explicitly set the length of the memory map, so that filesystem
// changes can't race to violate the invariants we just checked.
let map = unsafe {
memmap2::MmapOptions::new()
.len(file_size as usize)
.map(file)?
};
Ok(Some(map))
}
}
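`copy_wide` above retries on `ErrorKind::Interrupted` rather than bubbling it up, since an interrupted `read` has consumed no data. A self-contained sketch of that retry loop, using a toy reader that fails once; `FlakyReader` and `copy_all` are illustrative names, not part of this crate:

```rust
use std::io::{self, Read};

// Toy reader that returns ErrorKind::Interrupted once before yielding data,
// to exercise the retry branch of a copy_wide-style loop.
struct FlakyReader {
    interrupted: bool,
    data: &'static [u8],
}

impl Read for FlakyReader {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if !self.interrupted {
            self.interrupted = true;
            return Err(io::Error::new(io::ErrorKind::Interrupted, "try again"));
        }
        let n = self.data.len().min(buf.len());
        buf[..n].copy_from_slice(&self.data[..n]);
        self.data = &self.data[n..];
        Ok(n)
    }
}

// Same loop shape as copy_wide, but collecting into a Vec instead of a Hasher.
fn copy_all(mut reader: impl Read, out: &mut Vec<u8>) -> io::Result<u64> {
    let mut buffer = [0u8; 4096];
    let mut total = 0;
    loop {
        match reader.read(&mut buffer) {
            Ok(0) => return Ok(total),
            Ok(n) => {
                out.extend_from_slice(&buffer[..n]);
                total += n as u64;
            }
            // Interrupted reads consumed nothing, so just try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    let reader = FlakyReader { interrupted: false, data: b"hello world" };
    let mut out = Vec::new();
    let total = copy_all(reader, &mut out).unwrap();
    assert_eq!(total, 11);
    assert_eq!(out, b"hello world");
}
```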

@@ -33,15 +33,33 @@
//! # Cargo Features
//!
//! The `std` feature (the only feature enabled by default) is required for
//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime
//! CPU feature detection on x86. If this feature is disabled, the only way to
//! use the x86 SIMD implementations is to enable the corresponding instruction
//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting
//! binary will not be portable to other machines.
//! implementations of the [`Write`] and [`Seek`] traits, the
//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
//! feature detection on x86. If this feature is disabled, the only way to use
//! the x86 SIMD implementations is to enable the corresponding instruction sets
//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary
//! will not be portable to other machines.
//!
//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds
//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However,
//! even if this feature is enabled, all other APIs remain single-threaded.
//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap`
//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for
//! multithreaded hashing. However, even if this feature is enabled, all other
//! APIs remain single-threaded.
//!
//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the
//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above)
//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for
//! memory-mapped IO.
//!
//! The `zeroize` feature (disabled by default, but enabled for [docs.rs])
//! implements
//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for
//! this crate's types.
//!
//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements
//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and
//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html)
//! for [`Hash`](struct@Hash).
//!
//! The NEON implementation is enabled by default for AArch64 but requires the
//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and
@@ -49,12 +67,12 @@
//! without NEON support.
//!
//! The `traits-preview` feature enables implementations of traits from the
//! RustCrypto [`digest`] crate, and re-exports that crate as
//! `traits::digest`. However, the traits aren't stable, and they're expected to
//! change in incompatible ways before that crate reaches 1.0. For that reason,
//! this crate makes no SemVer guarantees for this feature, and callers who use
//! it should expect breaking changes between patch versions. (The "-preview"
//! feature name follows the conventions of the RustCrypto [`signature`] crate.)
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
//! However, the traits aren't stable, and they're expected to change in
//! incompatible ways before that crate reaches 1.0. For that reason, this crate
//! makes no SemVer guarantees for this feature, and callers who use it should
//! expect breaking changes between patch versions. (The "-preview" feature name
//! follows the conventions of the RustCrypto [`signature`] crate.)
//!
//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon
//! [BLAKE3]: https://blake3.io
@@ -112,6 +130,7 @@ mod sse41;
#[cfg(feature = "traits-preview")]
pub mod traits;
mod io;
mod join;
use arrayref::{array_mut_ref, array_ref};
@@ -197,6 +216,8 @@ fn counter_high(counter: u64) -> u32 {
/// [`from_hex`]: #method.from_hex
/// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html
/// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[derive(Clone, Copy, Hash)]
pub struct Hash([u8; OUT_LEN]);
@@ -284,10 +305,28 @@ impl core::str::FromStr for Hash {
}
}
// A proper implementation of constant time equality is tricky, and we get it from the
// constant_time_eq crate instead of rolling our own. However, that crate isn't compatible with
// Miri, so we roll our own just for that.
#[cfg(miri)]
fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool {
if a.len() != b.len() {
return false;
}
let mut x = 0;
for i in 0..a.len() {
x |= a[i] ^ b[i];
}
x == 0
}
/// This implementation is constant-time.
impl PartialEq for Hash {
#[inline]
fn eq(&self, other: &Hash) -> bool {
#[cfg(miri)]
return constant_time_eq_miri(&self.0, &other.0);
#[cfg(not(miri))]
constant_time_eq::constant_time_eq_32(&self.0, &other.0)
}
}
@@ -296,6 +335,9 @@ impl PartialEq for Hash {
impl PartialEq<[u8; OUT_LEN]> for Hash {
#[inline]
fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
#[cfg(miri)]
return constant_time_eq_miri(&self.0, other);
#[cfg(not(miri))]
constant_time_eq::constant_time_eq_32(&self.0, other)
}
}
@@ -304,6 +346,9 @@ impl PartialEq<[u8; OUT_LEN]> for Hash {
impl PartialEq<[u8]> for Hash {
#[inline]
fn eq(&self, other: &[u8]) -> bool {
#[cfg(miri)]
return constant_time_eq_miri(&self.0, other);
#[cfg(not(miri))]
constant_time_eq::constant_time_eq(&self.0, other)
}
}
@@ -371,6 +416,7 @@ impl std::error::Error for HexError {}
// Each chunk or parent node can produce either a 32-byte chaining value or, by
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[derive(Clone)]
struct Output {
input_chaining_value: CVWords,
@@ -378,6 +424,7 @@ struct Output {
block_len: u8,
counter: u64,
flags: u8,
#[cfg_attr(feature = "zeroize", zeroize(skip))]
platform: Platform,
}
@@ -414,6 +461,7 @@ impl Output {
}
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
struct ChunkState {
cv: CVWords,
chunk_counter: u64,
@@ -421,6 +469,7 @@ struct ChunkState {
buf_len: u8,
blocks_compressed: u8,
flags: u8,
#[cfg_attr(feature = "zeroize", zeroize(skip))]
platform: Platform,
}
@@ -903,6 +952,9 @@ fn parent_node_output(
/// An incremental hash state that can accept any number of writes.
///
/// The `rayon` and `mmap` Cargo features enable additional methods on this
/// type related to multithreading and memory-mapped IO.
///
/// When the `traits-preview` Cargo feature is enabled, this type implements
/// several commonly used traits from the
/// [`digest`](https://crates.io/crates/digest) crate. However, those
@@ -911,15 +963,6 @@ fn parent_node_output(
/// guarantees for this feature, and callers who use it should expect breaking
/// changes between patch versions.
///
/// When the `rayon` Cargo feature is enabled, the
/// [`update_rayon`](#method.update_rayon) method is available for multithreaded
/// hashing.
///
/// **Performance note:** The [`update`](#method.update) method can't take full
/// advantage of SIMD optimizations if its input buffer is too small or oddly
/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
/// supported SIMD instruction sets.
///
/// # Examples
///
/// ```
@@ -942,6 +985,7 @@ fn parent_node_output(
/// # }
/// ```
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
pub struct Hasher {
key: CVWords,
chunk_state: ChunkState,
@@ -1069,48 +1113,17 @@ impl Hasher {
self.cv_stack.push(*new_cv);
}
/// Add input bytes to the hash state. You can call this any number of
/// times.
/// Add input bytes to the hash state. You can call this any number of times.
///
/// This method is always single-threaded. For multithreading support, see
/// [`update_rayon`](#method.update_rayon) below (enabled with the `rayon`
/// Cargo feature).
/// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature).
///
/// Note that the degree of SIMD parallelism that `update` can use is
/// limited by the size of this input buffer. The 8 KiB buffer currently
/// used by [`std::io::copy`] is enough to leverage AVX2, for example, but
/// not enough to leverage AVX-512. A 16 KiB buffer is large enough to
/// leverage all currently supported SIMD instruction sets.
///
/// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html
/// Note that the degree of SIMD parallelism that `update` can use is limited by the size of
/// this input buffer. See [`update_reader`](#method.update_reader).
pub fn update(&mut self, input: &[u8]) -> &mut Self {
self.update_with_join::<join::SerialJoin>(input)
}
/// Identical to [`update`](Hasher::update), but using Rayon-based
/// multithreading internally.
///
/// This method is gated by the `rayon` Cargo feature, which is disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// To get any performance benefit from multithreading, the input buffer
/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
/// quite a lot across different processors, and it's important to benchmark
/// your specific use case.
///
/// Memory mapping an entire input file is a simple way to take advantage of
/// multithreading without needing to carefully tune your buffer size or
/// offload IO. However, on spinning disks where random access is expensive,
/// that approach can lead to disk thrashing and terrible IO performance.
/// Note that OS page caching can mask this problem, in which case it might
/// only appear for files larger than available RAM. Again, benchmarking
/// your specific use case is important.
#[cfg(feature = "rayon")]
pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
self.update_with_join::<join::RayonJoin>(input)
}
fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
// If we have some partial chunk bytes in the internal chunk_state, we
// need to finish that chunk first.
@@ -1309,6 +1322,182 @@ impl Hasher {
pub fn count(&self) -> u64 {
self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
}
/// As [`update`](Hasher::update), but reading from a
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
///
/// [`Hasher`] implements
/// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
/// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
/// from any reader. Unfortunately, this standard approach can limit performance, because
/// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of
/// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512)
/// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly
/// more convenient.
///
/// The internal buffer size this method uses may change at any time, and it may be different
/// for different targets. The only guarantee is that it will be large enough for all of this
/// crate's SIMD implementations on the current platform.
///
/// The most common implementer of
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
/// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
/// mapping can be faster than this method for hashing large files. See
/// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
/// which require the `mmap` and (for the latter) `rayon` Cargo features.
///
/// This method requires the `std` Cargo feature, which is enabled by default.
///
/// # Example
///
/// ```no_run
/// # use std::fs::File;
/// # use std::io;
/// # fn main() -> io::Result<()> {
/// // Hash standard input.
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_reader(std::io::stdin().lock())?;
/// println!("{}", hasher.finalize());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "std")]
pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> {
io::copy_wide(reader, self)?;
Ok(self)
}
/// As [`update`](Hasher::update), but using Rayon-based multithreading
/// internally.
///
/// This method is gated by the `rayon` Cargo feature, which is disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// To get any performance benefit from multithreading, the input buffer
/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
/// quite a lot across different processors, and it's important to benchmark
/// your specific use case. See also the performance warning associated with
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
///
/// If you already have a large buffer in memory, and you want to hash it
/// with multiple threads, this method is a good option. However, reading a
/// file into memory just to call this method can be a performance mistake,
/// both because it requires lots of memory and because single-threaded
/// reads can be slow. For hashing whole files, see
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both
/// the `rayon` and `mmap` Cargo features.
#[cfg(feature = "rayon")]
pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
self.update_with_join::<join::RayonJoin>(input)
}
/// As [`update`](Hasher::update), but reading the contents of a file using memory mapping.
///
/// Not all files can be memory mapped, and memory mapping small files can be slower than
/// reading them the usual way. In those cases, this method will fall back to standard file IO.
/// The heuristic for whether to use memory mapping is currently very simple (file size >=
/// 16 KiB), and it might change at any time.
///
/// Like [`update`](Hasher::update), this method is single-threaded. In this author's
/// experience, memory mapping improves single-threaded performance by ~10% for large files
/// that are already in cache. This probably varies between platforms, and as always it's a
/// good idea to benchmark your own use case. In comparison, the multithreaded
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on
/// performance.
///
/// There's a correctness reason that this method takes
/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of
/// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped
/// file ignores the seek position of the original file handle (it neither respects the current
/// position nor updates the position). This difference in behavior would've caused
/// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and
/// have different side effects in some cases. Taking a
/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by
/// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is
/// opened internally.
///
/// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on
/// [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// let path = Path::new("file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap(path)?;
/// println!("{}", hasher.finalize());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<&mut Self> {
let file = std::fs::File::open(path.as_ref())?;
if let Some(mmap) = io::maybe_mmap_file(&file)? {
self.update(&mmap);
} else {
io::copy_wide(&file, self)?;
}
Ok(self)
}
/// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using
/// memory mapping. This is the default behavior of `b3sum`.
///
/// For large files that are likely to be in cache, this can be much faster than
/// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other
/// cryptographic hashes, this is usually what they're measuring. However...
///
/// **Performance Warning:** There are cases where multithreading hurts performance. The worst
/// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31),
/// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends
/// more time seeking around than reading data). Windows tends to be somewhat worse about this,
/// in part because it's less likely than Linux to keep very large files in cache. More
/// generally, if your CPU cores are already busy, then multithreading will add overhead
/// without improving performance. If your code runs in different environments that you don't
/// control and can't measure, then unfortunately there's no one-size-fits-all answer for
/// whether multithreading is a good idea.
///
/// The memory mapping behavior of this function is the same as
/// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
/// file IO might change at any time.
///
/// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// # #[cfg(feature = "rayon")]
/// # {
/// let path = Path::new("big_file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap_rayon(path)?;
/// println!("{}", hasher.finalize());
/// # }
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
pub fn update_mmap_rayon(
&mut self,
path: impl AsRef<std::path::Path>,
) -> std::io::Result<&mut Self> {
let file = std::fs::File::open(path.as_ref())?;
if let Some(mmap) = io::maybe_mmap_file(&file)? {
self.update_rayon(&mmap);
} else {
io::copy_wide(&file, self)?;
}
Ok(self)
}
}
// Don't derive(Debug), because the state may be secret.
@@ -1366,6 +1555,7 @@ impl std::io::Write for Hasher {

/// from an unknown position in the output stream to recover its block index. Callers with strong
/// secret keys aren't affected in practice, but secret offsets are a [design
/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[derive(Clone)]
pub struct OutputReader {
inner: Output,
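The block-index relationship mentioned in the doc comment above can be made concrete. BLAKE3's XOF emits output in 64-byte blocks, so an absolute stream position maps directly to a block counter plus an offset within the block; `position_to_block` is a hypothetical name for illustration.

```rust
// BLAKE3's XOF emits 64-byte output blocks, so any absolute position in
// the output stream decomposes into (block counter, offset within block).
// This is why an observer who learns a position also learns a block index.
const BLOCK_LEN: u64 = 64;

fn position_to_block(position: u64) -> (u64, u64) {
    (position / BLOCK_LEN, position % BLOCK_LEN)
}

fn main() {
    assert_eq!(position_to_block(0), (0, 0));
    assert_eq!(position_to_block(999_999), (15_624, 63));
}
```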


@@ -56,6 +56,11 @@ pub enum Platform {
impl Platform {
#[allow(unreachable_code)]
pub fn detect() -> Self {
#[cfg(miri)]
{
return Platform::Portable;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
#[cfg(blake3_avx512_ffi)]
@@ -327,7 +332,12 @@ impl Platform {
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx512") {
return false;
@@ -349,7 +359,12 @@ pub fn avx512_detected() -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx2") {
return false;
@@ -371,7 +386,12 @@ pub fn avx2_detected() -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse41") {
return false;
@@ -395,6 +415,10 @@ pub fn sse41_detected() -> bool {
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse2") {
return false;
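The detection functions above all share one shape: compile-time short-circuits (Miri, testing-only kill switches) run before any runtime CPU query. A standalone sketch of that pattern, assuming a hypothetical `avx2_supported` name (the real code additionally checks a testing-only `no_avx2` Cargo feature, omitted here):

```rust
// Sketch of the detection pattern: cheap compile-time short-circuits run
// before any runtime CPUID-style query.
#[allow(unreachable_code)]
fn avx2_supported() -> bool {
    // Miri can't execute vendor intrinsics, so report "not detected".
    if cfg!(miri) {
        return false;
    }
    // The runtime check is only meaningful on x86/x86_64; elsewhere the
    // answer is a compile-time "no".
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        return std::arch::is_x86_feature_detected!("avx2");
    }
    false
}

fn main() {
    // Whatever the answer, it must be stable across calls.
    assert_eq!(avx2_supported(), avx2_supported());
}
```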


@@ -628,3 +628,211 @@ const fn test_hash_const_conversions() {
let hash = crate::Hash::from_bytes(bytes);
_ = hash.as_bytes();
}
#[cfg(feature = "zeroize")]
#[test]
fn test_zeroize() {
use zeroize::Zeroize;
let mut hash = crate::Hash([42; 32]);
hash.zeroize();
assert_eq!(hash.0, [0u8; 32]);
let mut hasher = crate::Hasher {
chunk_state: crate::ChunkState {
cv: [42; 8],
chunk_counter: 42,
buf: [42; 64],
buf_len: 42,
blocks_compressed: 42,
flags: 42,
platform: crate::Platform::Portable,
},
key: [42; 8],
cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(),
};
hasher.zeroize();
assert_eq!(hasher.chunk_state.cv, [0; 8]);
assert_eq!(hasher.chunk_state.chunk_counter, 0);
assert_eq!(hasher.chunk_state.buf, [0; 64]);
assert_eq!(hasher.chunk_state.buf_len, 0);
assert_eq!(hasher.chunk_state.blocks_compressed, 0);
assert_eq!(hasher.chunk_state.flags, 0);
assert!(matches!(
hasher.chunk_state.platform,
crate::Platform::Portable
));
assert_eq!(hasher.key, [0; 8]);
assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);
let mut output_reader = crate::OutputReader {
inner: crate::Output {
input_chaining_value: [42; 8],
block: [42; 64],
counter: 42,
block_len: 42,
flags: 42,
platform: crate::Platform::Portable,
},
position_within_block: 42,
};
output_reader.zeroize();
assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
assert_eq!(output_reader.inner.block, [0; 64]);
assert_eq!(output_reader.inner.counter, 0);
assert_eq!(output_reader.inner.block_len, 0);
assert_eq!(output_reader.inner.flags, 0);
assert!(matches!(
output_reader.inner.platform,
crate::Platform::Portable
));
assert_eq!(output_reader.position_within_block, 0);
}
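For readers without the `zeroize` dependency, the core idea this test exercises can be sketched with std alone: overwrite the secret through volatile writes so the optimizer can't treat the buffer as dead and elide the wipe. This is a simplified illustration, not a substitute for the `zeroize` crate, and `wipe` is a hypothetical name.

```rust
use std::ptr;
use std::sync::atomic::{compiler_fence, Ordering};

// Overwrite a secret buffer with volatile stores; the compiler fence keeps
// the writes from being reordered or dropped as dead stores.
fn wipe(buf: &mut [u8]) {
    for byte in buf.iter_mut() {
        unsafe { ptr::write_volatile(byte, 0) };
    }
    compiler_fence(Ordering::SeqCst);
}

fn main() {
    let mut key = [42u8; 32];
    wipe(&mut key);
    assert_eq!(key, [0u8; 32]);
}
```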
#[test]
#[cfg(feature = "std")]
fn test_update_reader() -> Result<(), std::io::Error> {
// This is a brief test, since update_reader() is mostly a wrapper around update(), which already
// has substantial testing.
let mut input = vec![0; 1_000_000];
paint_test_input(&mut input);
assert_eq!(
crate::Hasher::new().update_reader(&input[..])?.finalize(),
crate::hash(&input),
);
Ok(())
}
#[test]
#[cfg(feature = "std")]
fn test_update_reader_interrupted() -> std::io::Result<()> {
use std::io;
struct InterruptingReader<'a> {
already_interrupted: bool,
slice: &'a [u8],
}
impl<'a> InterruptingReader<'a> {
fn new(slice: &'a [u8]) -> Self {
Self {
already_interrupted: false,
slice,
}
}
}
impl<'a> io::Read for InterruptingReader<'a> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
if !self.already_interrupted {
self.already_interrupted = true;
return Err(io::Error::from(io::ErrorKind::Interrupted));
}
let take = std::cmp::min(self.slice.len(), buf.len());
buf[..take].copy_from_slice(&self.slice[..take]);
self.slice = &self.slice[take..];
Ok(take)
}
}
let input = b"hello world";
let mut reader = InterruptingReader::new(input);
let mut hasher = crate::Hasher::new();
hasher.update_reader(&mut reader)?;
assert_eq!(hasher.finalize(), crate::hash(input));
Ok(())
}
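The retry behavior this test pins down can be sketched as a read loop: `ErrorKind::Interrupted` is transient and should be swallowed rather than surfaced, mirroring what `std::io::copy` does. `copy_all` is a hypothetical name for illustration.

```rust
use std::io::{self, Read};

// Read everything from `reader`, retrying on transient EINTR-style errors.
fn copy_all<R: Read>(mut reader: R, out: &mut Vec<u8>) -> io::Result<()> {
    let mut buf = [0u8; 4096];
    loop {
        match reader.read(&mut buf) {
            Ok(0) => return Ok(()), // EOF
            Ok(n) => out.extend_from_slice(&buf[..n]),
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

fn main() -> io::Result<()> {
    let mut out = Vec::new();
    copy_all(&b"hello world"[..], &mut out)?;
    assert_eq!(out, b"hello world");
    Ok(())
}
```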
#[test]
#[cfg(feature = "mmap")]
// NamedTempFile isn't Miri-compatible
#[cfg(not(miri))]
fn test_mmap() -> Result<(), std::io::Error> {
// This is a brief test, since update_mmap() is mostly a wrapper around update(), which already
// has substantial testing.
use std::io::prelude::*;
let mut input = vec![0; 1_000_000];
paint_test_input(&mut input);
let mut tempfile = tempfile::NamedTempFile::new()?;
tempfile.write_all(&input)?;
tempfile.flush()?;
assert_eq!(
crate::Hasher::new()
.update_mmap(tempfile.path())?
.finalize(),
crate::hash(&input),
);
Ok(())
}
#[test]
#[cfg(feature = "mmap")]
#[cfg(target_os = "linux")]
fn test_mmap_virtual_file() -> Result<(), std::io::Error> {
// Virtual files like /proc/version can't be mmapped, because their contents don't actually
// exist anywhere in memory. Make sure we fall back to regular file IO in these cases.
// Currently this is handled with a length check, where the assumption is that virtual files
// will always report length 0. If that assumption ever breaks, hopefully this test will catch
// it.
let virtual_filepath = "/proc/version";
let mut mmap_hasher = crate::Hasher::new();
// We'll fail right here if the fallback doesn't work.
mmap_hasher.update_mmap(virtual_filepath)?;
let mut read_hasher = crate::Hasher::new();
read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?;
assert_eq!(mmap_hasher.finalize(), read_hasher.finalize());
Ok(())
}
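The length heuristic this test relies on can be sketched directly: procfs files report a size of 0 even though reads return data, so the documented size check (>= 16 KiB, per the `update_mmap` docs) naturally routes them to the fallback path. `should_mmap` is a hypothetical name.

```rust
use std::fs;
use std::io;

// Mirror of the documented heuristic: only mmap files reporting >= 16 KiB.
// Virtual files like /proc/version report length 0, so they fall through
// to ordinary read() calls.
fn should_mmap(path: &str) -> io::Result<bool> {
    Ok(fs::metadata(path)?.len() >= 16 * 1024)
}

fn main() -> io::Result<()> {
    // Only meaningful on Linux systems with procfs mounted.
    if std::path::Path::new("/proc/version").exists() {
        assert!(!should_mmap("/proc/version")?);
    }
    Ok(())
}
```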
#[test]
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
// NamedTempFile isn't Miri-compatible
#[cfg(not(miri))]
fn test_mmap_rayon() -> Result<(), std::io::Error> {
// This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(),
// which already has substantial testing.
use std::io::prelude::*;
let mut input = vec![0; 1_000_000];
paint_test_input(&mut input);
let mut tempfile = tempfile::NamedTempFile::new()?;
tempfile.write_all(&input)?;
tempfile.flush()?;
assert_eq!(
crate::Hasher::new()
.update_mmap_rayon(tempfile.path())?
.finalize(),
crate::hash(&input),
);
Ok(())
}
#[test]
#[cfg(feature = "std")]
#[cfg(feature = "serde")]
fn test_serde() {
let hash: crate::Hash = [7; 32].into();
let json = serde_json::to_string(&hash).unwrap();
assert_eq!(
json,
"[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]",
);
let hash2: crate::Hash = serde_json::from_str(&json).unwrap();
assert_eq!(hash, hash2);
}
// `cargo +nightly miri test` currently works, but it takes forever, because some of our test
// inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri
// anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming
// they don't use incompatible features like Rayon or mmap. This test should get reasonable
// coverage of our public API without using any large inputs, so we can run it in CI and catch
// obvious breaks. (For example, constant_time_eq is not compatible with Miri.)
#[test]
fn test_miri_smoketest() {
let mut hasher = crate::Hasher::new_derive_key("Miri smoketest");
hasher.update(b"foo");
#[cfg(feature = "std")]
hasher.update_reader(&b"bar"[..]).unwrap();
assert_eq!(hasher.finalize(), hasher.finalize());
let mut reader = hasher.finalize_xof();
reader.set_position(999999);
reader.fill(&mut [0]);
}


@@ -4,7 +4,7 @@
- Bump the version in the root Cargo.toml.
- Bump the version in b3sum/Cargo.toml.
- Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar.
- Update the `--help` output in b3sum/README.md if it's changed.
- Update the `-h` output in b3sum/README.md if it's changed.
- Bump `BLAKE3_VERSION_STRING` in c/blake3.h.
- Bump `VERSION` in c/CMakeLists.txt.
- Make a version bump commit with change notes.