From f4ffbbca2f35e8341760c70c51e49ae6dd53f2be Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 22:16:02 -0800 Subject: [PATCH] factor out RISCV support from the guts_api branch TODO: figure out what environment variable should enable this --- rust/guts/Cargo.toml | 6 + rust/guts/build.rs | 59 + rust/guts/src/lib.rs | 17 +- rust/guts/src/riscv_rva23u64.S | 1773 +++++++++++++++++++++++++++++++ rust/guts/src/riscv_rva23u64.rs | 124 +++ 5 files changed, 1977 insertions(+), 2 deletions(-) create mode 100644 rust/guts/build.rs create mode 100644 rust/guts/src/riscv_rva23u64.S create mode 100644 rust/guts/src/riscv_rva23u64.rs diff --git a/rust/guts/Cargo.toml b/rust/guts/Cargo.toml index ebcf77f..3525d3e 100644 --- a/rust/guts/Cargo.toml +++ b/rust/guts/Cargo.toml @@ -9,6 +9,9 @@ documentation = "https://docs.rs/blake3_guts" readme = "readme.md" edition = "2021" +[dependencies] +cfg-if = "1.0.0" + [dev-dependencies] hex = "0.4.3" reference_impl = { path = "../../reference_impl" } @@ -16,3 +19,6 @@ reference_impl = { path = "../../reference_impl" } [features] default = ["std"] std = [] + +[build-dependencies] +cc = "1.0.79" diff --git a/rust/guts/build.rs b/rust/guts/build.rs new file mode 100644 index 0000000..f0ef0e2 --- /dev/null +++ b/rust/guts/build.rs @@ -0,0 +1,59 @@ +use std::env; + +fn defined(var: &str) -> bool { + println!("cargo:rerun-if-env-changed={}", var); + env::var_os(var).is_some() +} + +fn is_pure() -> bool { + defined("CARGO_FEATURE_PURE") +} + +fn target_components() -> Vec<String> { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_riscv64gc() -> bool { + target_components()[0] == "riscv64gc" +} + +fn new_build() -> cc::Build { + let build = cc::Build::new(); + build +} + +fn build_riscv_rva23u64_assembly() { + println!("cargo:rustc-cfg=blake3_riscv_rva23u64_ffi"); + let mut build = new_build(); + let asm_path = "src/riscv_rva23u64.S"; + build.file(asm_path); + build.flag("--target=riscv64"); + build.flag("-march=rv64gcv_zbb_zvbb1p0"); + build.flag("-menable-experimental-extensions"); + build.compile("blake3_riscv_rva23u64_assembly"); + println!("cargo:rerun-if-changed={asm_path}"); +} + +fn main() { + // TODO: This implementation assumes some bleeding-edge extensions, and it should probably be + // gated by a Cargo feature. + if is_riscv64gc() && !is_pure() { + build_riscv_rva23u64_assembly(); + } + + // The `cc` crate doesn't automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often.
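+    // (Commentary on the TODO at the top of main(): one possible shape for an opt-in
+    // gate, sketched here with a hypothetical `riscv_rva23u64_asm` Cargo feature that
+    // this patch does not define, would be to require it alongside the target check:
+    //
+    //     if is_riscv64gc() && defined("CARGO_FEATURE_RISCV_RVA23U64_ASM") && !is_pure() {
+    //         build_riscv_rva23u64_assembly();
+    //     }
+    //
+    // Cargo exposes enabled features to build scripts as CARGO_FEATURE_* environment
+    // variables, so the existing `defined` helper already covers this case.)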
+ for file in std::fs::read_dir("../../c").unwrap() { + println!( + "cargo:rerun-if-changed={}", + file.unwrap().path().to_str().expect("utf-8") + ); + } +} diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs index e9b4914..a363952 100644 --- a/rust/guts/src/lib.rs +++ b/rust/guts/src/lib.rs @@ -49,6 +49,8 @@ use core::ptr; use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; pub mod portable; +#[cfg(any(target_arch = "riscv64"))] +pub mod riscv_rva23u64; #[cfg(test)] mod test; @@ -82,8 +84,14 @@ pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], ]; -// never less than 2 -pub const MAX_SIMD_DEGREE: usize = 2; +cfg_if::cfg_if! { + if #[cfg(target_arch = "riscv64")] { + pub const MAX_SIMD_DEGREE: usize = riscv_rva23u64::MAX_SIMD_DEGREE; + } else { + // never less than 2 + pub const MAX_SIMD_DEGREE: usize = 2; + } +} pub type CVBytes = [u8; 32]; pub type CVWords = [u32; 8]; @@ -101,6 +109,11 @@ pub static DETECTED_IMPL: Implementation = Implementation::new( ); fn detect() -> Implementation { + #[cfg(target_arch = "riscv64")] + { + return riscv_rva23u64::implementation(); + } + #[allow(unreachable_code)] portable::implementation() } diff --git a/rust/guts/src/riscv_rva23u64.S b/rust/guts/src/riscv_rva23u64.S new file mode 100644 index 0000000..d672a30 --- /dev/null +++ b/rust/guts/src/riscv_rva23u64.S @@ -0,0 +1,1773 @@ +// This implementation targets the RVA23 profile, particularly V, Zvbb, and +// Zbb, that is the vector extension and the bit-manipulation extensions. As of +// December 2023, most real-world hardware does *not* support these extensions. +// This implementation also assumes that misaligned vector loads and stores are +// supported, in particular for the vlsseg8e32.v and vssseg8e32.v instructions. +// +// Compiling and testing this code requires very recent versions of Clang (v17) +// and QEMU (v8.2). + +#define IV0 0x6A09E667 +#define IV1 0xBB67AE85 +#define IV2 0x3C6EF372 +#define IV3 0xA54FF53A + +// NOTE: Keep this in sync with the same constant in Rust. +#define MAX_SIMD_DEGREE 16 + +#define TRANSPOSED_STRIDE_BYTES 2 * MAX_SIMD_DEGREE * 4 + +#define CHUNK_START (1 << 0) +#define CHUNK_END (1 << 1) +#define PARENT (1 << 2) +#define ROOT (1 << 3) +#define KEYED_HASH (1 << 4) +#define DERIVE_KEY_CONTEXT (1 << 5) +#define DERIVE_KEY_MATERIAL (1 << 6) + +.section .text + +.p2align 2 +IV_VEC: + .word IV0, IV1, IV2, IV3 +ROR1: + .word 3, 0, 1, 2 +ROR2: + .word 2, 3, 0, 1 +ROR3: + .word 1, 2, 3, 0 + +# The bottom half of the load permutation is tweaked to account for the fact that +# we hold the second row fixed during diagonalization. +MSG_LOAD: + .short 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 + +# The message permutation as given in the in the BLAKE3 spec would be the correct +# permutation to use if the load order above was 0, 1, 2, 3... However, since +# we're using a tricky load order, we need to adjust the permutation accordingly. 
+# The following Python snippet reproduces the permutation we're using here: +# +# load_order = [0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13] +# original_permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8] +# retargeted_permutation = [load_order.index(x) for x in original_permutation] +# shuffled_permutation = [retargeted_permutation[i] for i in load_order] +# print(shuffled_permutation) +MSG_PERMUTE: + .short 1, 5, 7, 2, 3, 10, 0, 15, 12, 4, 11, 13, 9, 14, 6, 8 + +// a0: block (zero-padded to 64 bytes) +// a1: block_len +// a2: cv_bytes +// a3: counter +// a4: flags +// a5: out_ptr +.global blake3_guts_riscv_rva23u64_compress +blake3_guts_riscv_rva23u64_compress: + // Load the message load and message permutation indexes. + vsetivli zero, 16, e16, m2, ta, ma + la t0, MSG_LOAD + vle16.v v8, (t0) + la t0, MSG_PERMUTE + vle16.v v10, (t0) + // Load the CV into v0-v1. + vsetivli zero, 16, e8, m1, ta, ma + vle8.v v0, (a2) + addi a2, a2, 16 + vle8.v v1, (a2) + // Set LMUL=4 and load the message block temporarily into scratch + // space. Apply the MSG_LOAD permutation, and then move the permuted + // message words into v4-v7. + // TODO: Do this with less register movement? + li t0, 64 + vsetvli zero, t0, e8, m4, ta, ma + vle8.v v20, (a0) + vsetivli zero, 16, e32, m4, ta, ma + vrgatherei16.vv v16, v20, v8 + vsetivli zero, 4, e32, m4, ta, ma + vslidedown.vi v20, v16, 4 + vslidedown.vi v24, v16, 8 + vslidedown.vi v28, v16, 12 + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.v v4, v16 + vmv.v.v v5, v20 + vmv.v.v v6, v24 + vmv.v.v v7, v28 + // Load the diagonalization gather indexes. + la t0, ROR1 + vle32.v v12, (t0) + la t0, ROR2 + vle32.v v13, (t0) + la t0, ROR3 + vle32.v v14, (t0) + // Load the IV words. + la t0, IV_VEC + vle32.v v2, (t0) + // Load the counter, block_len, and flags. + vsetivli zero, 4, e32, m1, ta, ma + vslide1down.vx v3, v3, a3 + srli a3, a3, 32 + vslide1down.vx v3, v3, a3 + vslide1down.vx v3, v3, a1 + vslide1down.vx v3, v3, a4 + li t0, 7 // round counter +blake3_guts_riscv_rva23u64_compress_round_loop: + vadd.vv v0, v0, v4 + vadd.vv v0, v0, v1 + vxor.vv v3, v3, v0 + vror.vi v3, v3, 16 + vadd.vv v2, v2, v3 + vxor.vv v1, v1, v2 + vror.vi v1, v1, 12 + vadd.vv v0, v0, v5 + vadd.vv v0, v0, v1 + vxor.vv v3, v3, v0 + vror.vi v3, v3, 8 + vadd.vv v2, v2, v3 + vxor.vv v1, v1, v2 + vror.vi v1, v1, 7 + // Gathers can't overlap a source register, so use v20/v22/v23 in place + // of v0/v2/v3 for this section. + vrgather.vv v20, v0, v12 + vrgather.vv v23, v3, v13 + vrgather.vv v22, v2, v14 + vadd.vv v20, v20, v6 + vadd.vv v20, v20, v1 + vxor.vv v23, v23, v20 + vror.vi v23, v23, 16 + vadd.vv v22, v22, v23 + vxor.vv v1, v1, v22 + vror.vi v1, v1, 12 + vadd.vv v20, v20, v7 + vadd.vv v20, v20, v1 + vxor.vv v23, v23, v20 + vror.vi v23, v23, 8 + vadd.vv v22, v22, v23 + vxor.vv v1, v1, v22 + vror.vi v1, v1, 7 + vrgather.vv v0, v20, v14 + vrgather.vv v3, v23, v13 + vrgather.vv v2, v22, v12 + addi t0, t0, -1 + beqz t0, blake3_guts_riscv_rva23u64_compress_end + // Shuffle message words. + // TODO: Find a way to do this without so much movement? 
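+ // (Commentary on the shuffle below: it widens v4-v7 into the single LMUL=4
+ // register group starting at v16, applies MSG_PERMUTE (already loaded in v10)
+ // with vrgatherei16 so that new_msg[i] = old_msg[MSG_PERMUTE[i]], and then
+ // splits the permuted words back out into v4-v7 for the next round.)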
+ vmv.v.v v16, v4 + vmv.v.v v20, v5 + vmv.v.v v24, v6 + vmv.v.v v28, v7 + vsetivli zero, 16, e32, m4, ta, ma + vslideup.vi v16, v20, 4 + vslideup.vi v16, v24, 8 + vslideup.vi v16, v28, 12 + vrgatherei16.vv v28, v16, v10 + vsetivli zero, 4, e32, m4, ta, ma + vslidedown.vi v16, v28, 4 + vslidedown.vi v20, v28, 8 + vslidedown.vi v24, v28, 12 + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.v v4, v28 + vmv.v.v v5, v16 + vmv.v.v v6, v20 + vmv.v.v v7, v24 + j blake3_guts_riscv_rva23u64_compress_round_loop +blake3_guts_riscv_rva23u64_compress_end: + vxor.vv v0, v0, v2 + vxor.vv v1, v1, v3 + vsetivli zero, 16, e8, m1, ta, ma + vse8.v v0, (a5) + addi a5, a5, 16 + vse8.v v1, (a5) + ret + + +.global blake3_guts_riscv_rva23u64_degree +blake3_guts_riscv_rva23u64_degree: + csrr t0, vlenb + srli t0, t0, 2 + li t1, MAX_SIMD_DEGREE + minu a0, t0, t1 + ret + +// clobbers: t0 +blake3_guts_riscv_rva23u64_kernel: + li t0, IV0 + vmv.v.x v8, t0 + li t0, IV1 + vmv.v.x v9, t0 + li t0, IV2 + vmv.v.x v10, t0 + li t0, IV3 + vmv.v.x v11, t0 + vadd.vv v0, v0, v16 + vadd.vv v1, v1, v18 + vadd.vv v2, v2, v20 + vadd.vv v3, v3, v22 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v17 + vadd.vv v1, v1, v19 + vadd.vv v2, v2, v21 + vadd.vv v3, v3, v23 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v24 + vadd.vv v1, v1, v26 + vadd.vv v2, v2, v28 + vadd.vv v3, v3, v30 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v25 + vadd.vv v1, v1, v27 + vadd.vv v2, v2, v29 + vadd.vv v3, v3, v31 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v18 + vadd.vv v1, v1, v19 + vadd.vv v2, v2, v23 + vadd.vv 
v3, v3, v20 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v22 + vadd.vv v1, v1, v26 + vadd.vv v2, v2, v16 + vadd.vv v3, v3, v29 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v17 + vadd.vv v1, v1, v28 + vadd.vv v2, v2, v25 + vadd.vv v3, v3, v31 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v27 + vadd.vv v1, v1, v21 + vadd.vv v2, v2, v30 + vadd.vv v3, v3, v24 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v19 + vadd.vv v1, v1, v26 + vadd.vv v2, v2, v29 + vadd.vv v3, v3, v23 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v20 + vadd.vv v1, v1, v28 + vadd.vv v2, v2, v18 + vadd.vv v3, v3, v30 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + 
vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v22 + vadd.vv v1, v1, v25 + vadd.vv v2, v2, v27 + vadd.vv v3, v3, v24 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v21 + vadd.vv v1, v1, v16 + vadd.vv v2, v2, v31 + vadd.vv v3, v3, v17 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v26 + vadd.vv v1, v1, v28 + vadd.vv v2, v2, v30 + vadd.vv v3, v3, v29 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v23 + vadd.vv v1, v1, v25 + vadd.vv v2, v2, v19 + vadd.vv v3, v3, v31 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v20 + vadd.vv v1, v1, v27 + vadd.vv v2, v2, v21 + vadd.vv v3, v3, v17 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v16 + vadd.vv v1, v1, v18 + vadd.vv v2, v2, v24 + vadd.vv v3, v3, v22 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 
+ vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v28 + vadd.vv v1, v1, v25 + vadd.vv v2, v2, v31 + vadd.vv v3, v3, v30 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v29 + vadd.vv v1, v1, v27 + vadd.vv v2, v2, v26 + vadd.vv v3, v3, v24 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v23 + vadd.vv v1, v1, v21 + vadd.vv v2, v2, v16 + vadd.vv v3, v3, v22 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v18 + vadd.vv v1, v1, v19 + vadd.vv v2, v2, v17 + vadd.vv v3, v3, v20 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v25 + vadd.vv v1, v1, v27 + vadd.vv v2, v2, v24 + vadd.vv v3, v3, v31 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v30 + vadd.vv v1, v1, v21 + vadd.vv v2, v2, v28 + vadd.vv v3, v3, v17 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + 
vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v29 + vadd.vv v1, v1, v16 + vadd.vv v2, v2, v18 + vadd.vv v3, v3, v20 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v19 + vadd.vv v1, v1, v26 + vadd.vv v2, v2, v22 + vadd.vv v3, v3, v23 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + vadd.vv v0, v0, v27 + vadd.vv v1, v1, v21 + vadd.vv v2, v2, v17 + vadd.vv v3, v3, v24 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vror.vi v15, v15, 16 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 12 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vadd.vv v0, v0, v31 + vadd.vv v1, v1, v16 + vadd.vv v2, v2, v25 + vadd.vv v3, v3, v22 + vadd.vv v0, v0, v4 + vadd.vv v1, v1, v5 + vadd.vv v2, v2, v6 + vadd.vv v3, v3, v7 + vxor.vv v12, v12, v0 + vxor.vv v13, v13, v1 + vxor.vv v14, v14, v2 + vxor.vv v15, v15, v3 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vror.vi v15, v15, 8 + vadd.vv v8, v8, v12 + vadd.vv v9, v9, v13 + vadd.vv v10, v10, v14 + vadd.vv v11, v11, v15 + vxor.vv v4, v4, v8 + vxor.vv v5, v5, v9 + vxor.vv v6, v6, v10 + vxor.vv v7, v7, v11 + vror.vi v4, v4, 7 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vadd.vv v0, v0, v30 + vadd.vv v1, v1, v18 + vadd.vv v2, v2, v19 + vadd.vv v3, v3, v23 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 16 + vror.vi v12, v12, 16 + vror.vi v13, v13, 16 + vror.vi v14, v14, 16 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 12 + vror.vi v6, v6, 12 + vror.vi v7, v7, 12 + vror.vi v4, v4, 12 + vadd.vv v0, v0, v26 + vadd.vv v1, v1, v28 + vadd.vv v2, v2, v20 + vadd.vv v3, v3, v29 + vadd.vv v0, v0, v5 + vadd.vv v1, v1, v6 + vadd.vv v2, v2, v7 + 
vadd.vv v3, v3, v4 + vxor.vv v15, v15, v0 + vxor.vv v12, v12, v1 + vxor.vv v13, v13, v2 + vxor.vv v14, v14, v3 + vror.vi v15, v15, 8 + vror.vi v12, v12, 8 + vror.vi v13, v13, 8 + vror.vi v14, v14, 8 + vadd.vv v10, v10, v15 + vadd.vv v11, v11, v12 + vadd.vv v8, v8, v13 + vadd.vv v9, v9, v14 + vxor.vv v5, v5, v10 + vxor.vv v6, v6, v11 + vxor.vv v7, v7, v8 + vxor.vv v4, v4, v9 + vror.vi v5, v5, 7 + vror.vi v6, v6, 7 + vror.vi v7, v7, 7 + vror.vi v4, v4, 7 + ret + +// arguments from hash_chunks +// a0: input [adjusted by 64] +// a1: input_len [adjusted by -64] +// a2: key [unused] +// a3: counter +// a4: flags +// a5: aligned+transposed output [unused] +// a6: total chunks [unused] +// a7: remaining_bytes_in_last_chunk +blake3_guts_riscv_rva23u64_hash_blocks: + // t0 := full_blocks := (input_len + 1024 - 64) / 1024 + addi t0, a1, 1024 - 64 + srli t0, t0, 10 + // Load and transpose full message blocks. These are "strided segment + // loads". Each vlsseg8e32 instruction transposes 8 words from multiple + // message blocks into 8 registers, so we need two vlsseg8e32 + // instructions (with the second offset by 32 bytes) to load full + // 64-byte blocks. The 1024-byte stride represents the spacing between + // two blocks in the same position in adjacent chunks. + // NOTE: If the final chunk is short, this could be 1 less than the + // total number of chunks, in which case this setup code and the kernel + // will leave a CV word undisturbed in each of v0-v7. + // NOTE: These loads could be misaligned. As far as I know, the Linux + // RISC-V ABI allows misaligned loads and stores. If we need to support + // an environment that doesn't allow them (or where they're + // unacceptably slow), we could add a fallback here. + vsetvli zero, t0, e32, m1, ta, ma + li t1, 1024 + addi t2, a0, 32 + vlsseg8e32.v v16, (a0), t1 + vlsseg8e32.v v24, (t2), t1 + // If remaining_bytes_in_last_chunk in 1..=63, there's a partial block + // at the end. Handle it out-of-line. If we take this branch, it will + // increment t0 by 1. + addi t1, a7, -1 + li t2, 63 + bltu t1, t2, handle_partial_block +partial_block_finished: + // load the counter + vsetvli zero, t0, e64, m2, ta, ma + vmv.v.x v8, a3 + vid.v v10 + vadd.vv v8, v8, v10 + // This is the mode setting that the kernel will use. If the final + // chunk is short, this iteration might have fewer blocks than an + // earlier iteration, so we need the tail undisturbed (tu). + vsetvli zero, t0, e32, m1, tu, ma + vncvt.x.x.w v12, v8 + li t1, 32 + vnsrl.wx v13, v8, t1 + // Broadcast the block length, then overwrite the last block's length + // to be ((min(64, remaining_bytes_in_last_chunk) - 1) % 64) + 1. That + // is: 64 if remaining_bytes_in_last_chunk >= 64 + // else 64 if remaining_bytes_in_last_chunk is 0 + // else remaining_bytes_in_last_chunk + li t1, 64 + vmv.v.x v14, t1 + minu t1, t1, a7 + addi t1, t1, -1 + andi t1, t1, 63 + addi t1, t1, 1 + vslide1down.vx v14, v14, t1 + // Broadcast the flags, then set CHUNK_END in the last block's flags if + // remaining_bytes_in_last_chunk is in 1..=64. 
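+ // (Equivalently, in pseudocode, with `remaining` standing in for
+ // remaining_bytes_in_last_chunk; commentary only:
+ //     last_block_len   = if remaining == 0 || remaining >= 64 { 64 } else { remaining };
+ //     last_block_flags = if (1..=64).contains(&remaining) { flags | CHUNK_END } else { flags };
+ // which is what the surrounding length/flags code computes branchlessly.)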
+ vmv.v.x v15, a4 + addi t1, a7, -1 + sltiu t1, t1, 64 + slli t1, t1, 1 // CHUNK_END = 2 + or t1, t1, a4 + vslide1down.vx v15, v15, t1 + // execute the kernel + mv t6, ra + call blake3_guts_riscv_rva23u64_kernel + mv ra, t6 + // xor the two halves of the state + vxor.vv v0, v0, v8 + vxor.vv v1, v1, v9 + vxor.vv v2, v2, v10 + vxor.vv v3, v3, v11 + vxor.vv v4, v4, v12 + vxor.vv v5, v5, v13 + vxor.vv v6, v6, v14 + vxor.vv v7, v7, v15 + // Increment the input pointer, input_len, and + // remaining_bytes_in_last_chunk (which cannot go below zero). + addi a0, a0, 64 + addi a1, a1, -64 + addi a7, a7, -64 + max a7, a7, zero + ret +handle_partial_block: + // The minimum VLEN is 128 bits, so we're guaranteed to be able to fit + // the block in v8-v11 with LMUL=4. Clear 64 zero bytes before the + // load, to make sure the partial block is zero-padded. + li t1, 64 + vsetvli zero, t1, e8, m4, ta, ma + vmv.v.i v8, 0 + add t2, a0, a1 + sub t2, t2, a7 + vsetvli zero, a7, e8, m4, ta, ma + vle8.v v8, (t2) + // If VLEN is longer than 128 bits (16 bytes), then half or all of the + // block bytes will be in v8. Make sure they're split evenly across + // v8-v11. + csrr t1, vlenb + li t2, 64 + bltu t1, t2, vlenb_less_than_64 + vsetivli zero, 8, e32, m1, ta, ma + vslidedown.vi v9, v8, 8 +vlenb_less_than_64: + li t2, 32 + bltu t1, t2, vlenb_less_than_32 + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.v v10, v9 + vslidedown.vi v11, v9, 4 + vslidedown.vi v9, v8, 4 +vlenb_less_than_32: + // Shift each of the words of the padded partial block to the end of + // the corresponding message vector. t0 was previously the number of + // full blocks. Now we increment it, so that it's the number of all + // blocks (both full and partial). + mv t1, t0 + addi t0, t0, 1 + // Set vl to at least 4, because v8-v11 each have 4 message words. + // Setting vl shorter will make vslide1down clobber those words. + li t2, 4 + maxu t2, t0, t2 + vsetvli zero, t2, e32, m1, ta, ma + vslideup.vx v16, v8, t1 + vslide1down.vx v8, v8, zero + vslideup.vx v17, v8, t1 + vslide1down.vx v8, v8, zero + vslideup.vx v18, v8, t1 + vslide1down.vx v8, v8, zero + vslideup.vx v19, v8, t1 + vslideup.vx v20, v9, t1 + vslide1down.vx v9, v9, zero + vslideup.vx v21, v9, t1 + vslide1down.vx v9, v9, zero + vslideup.vx v22, v9, t1 + vslide1down.vx v9, v9, zero + vslideup.vx v23, v9, t1 + vslideup.vx v24, v10, t1 + vslide1down.vx v10, v10, zero + vslideup.vx v25, v10, t1 + vslide1down.vx v10, v10, zero + vslideup.vx v26, v10, t1 + vslide1down.vx v10, v10, zero + vslideup.vx v27, v10, t1 + vslideup.vx v28, v11, t1 + vslide1down.vx v11, v11, zero + vslideup.vx v29, v11, t1 + vslide1down.vx v11, v11, zero + vslideup.vx v30, v11, t1 + vslide1down.vx v11, v11, zero + vslideup.vx v31, v11, t1 + j partial_block_finished + +// a0: input +// a1: input_len +// a2: key +// a3: counter +// a4: flags +// a5: aligned+transposed output +.global blake3_guts_riscv_rva23u64_hash_chunks +blake3_guts_riscv_rva23u64_hash_chunks: + // Save the original num_chunks = (input_len+1023)/1024 in a6. + addi a6, a1, 1023 + srli a6, a6, 10 + // Track the bytes remaining in the last chunk in a7. The initial value + // of this is ((input_len - 1) % 1024) + 1. (The input to this function + // is never empty.) It decrements by 64 with each call to + // blake3_guts_riscv_rva23u64_hash_chunks, but not below 0. 
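+ // (Worked example: input_len = 3200 gives a6 = num_chunks = 4 and an initial
+ // a7 = remaining_bytes_in_last_chunk = 128; an exact multiple like
+ // input_len = 4096 gives a6 = 4 and a7 = 1024, so the final block of the last
+ // chunk still gets CHUNK_END and a 64-byte block length.)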
+ addi a7, a1, -1 + andi a7, a7, 1023 + addi a7, a7, 1 + // broadcast the key to v0-7 + vsetvli zero, a6, e32, m1, ta, ma + lw t0, 0(a2) + vmv.v.x v0, t0 + lw t0, 4(a2) + vmv.v.x v1, t0 + lw t0, 8(a2) + vmv.v.x v2, t0 + lw t0, 12(a2) + vmv.v.x v3, t0 + lw t0, 16(a2) + vmv.v.x v4, t0 + lw t0, 20(a2) + vmv.v.x v5, t0 + lw t0, 24(a2) + vmv.v.x v6, t0 + lw t0, 28(a2) + vmv.v.x v7, t0 + // sixteen blocks (TODO: partial chunks) + // Note that hash_blocks increments the input pointer and decrements + // the input length. + mv t5, ra + ori a4, a4, 1 // set CHUNK_START + call blake3_guts_riscv_rva23u64_hash_blocks + andi a4, a4, -2 // unset CHUNK_START + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + call blake3_guts_riscv_rva23u64_hash_blocks + ori a4, a4, 2 // set CHUNK_END + call blake3_guts_riscv_rva23u64_hash_blocks + mv ra, t5 + // If the final chunk is short, we need to set vl back to the total + // number of chunks. + vsetvli zero, a6, e32, m1, ta, ma + // write aligned+transposed outputs with a stride of 2*MAX_SIMD_DEGREE words + vse32.v v0, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v1, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v2, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v3, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v4, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v5, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v6, (a5) + addi a5, a5, TRANSPOSED_STRIDE_BYTES + vse32.v v7, (a5) + ret + +// a0: aligned+transposed input +// a1: num_parents +// a2: key +// a3: flags +// a4: out pointer +.global blake3_guts_riscv_rva23u64_hash_parents +blake3_guts_riscv_rva23u64_hash_parents: + // load the transposed CVs and split alternating words into the low and + // high halves of the input vectors + vsetvli zero, a1, e32, m1, ta, ma + vlseg2e32.v v16, (a0) + vmv.v.v v24, v17 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v17, (a0) + vmv.v.v v25, v18 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v18, (a0) + vmv.v.v v26, v19 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v19, (a0) + vmv.v.v v27, v20 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v20, (a0) + vmv.v.v v28, v21 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v21, (a0) + vmv.v.v v29, v22 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v22, (a0) + vmv.v.v v30, v23 + addi a0, a0, TRANSPOSED_STRIDE_BYTES + vlseg2e32.v v14, (a0) // use v14-15 as scratch space to avoid overwriting v24 + vmv.v.v v23, v14 + vmv.v.v v31, v15 + // broadcast the key to v0-7 + lw t0, 0(a2) + vmv.v.x v0, t0 + lw t0, 4(a2) + vmv.v.x v1, t0 + lw t0, 8(a2) + vmv.v.x v2, t0 + lw t0, 12(a2) + vmv.v.x v3, t0 + lw t0, 16(a2) + vmv.v.x v4, t0 + lw t0, 20(a2) + vmv.v.x v5, t0 + lw t0, 24(a2) + vmv.v.x v6, t0 + lw t0, 28(a2) + vmv.v.x v7, t0 + // zero the counter + vmv.v.i v12, 0 + vmv.v.i v13, 0 + // broadcast the block length + li t0, 64 + vmv.v.x v14, t0 + // broadcast the flags + vmv.v.x v15, a3 + + // execute the kernel + mv t6, ra + 
call blake3_guts_riscv_rva23u64_kernel + mv ra, t6 + + // xor the two halves of the state + vxor.vv v0, v0, v8 + vxor.vv v1, v1, v9 + vxor.vv v2, v2, v10 + vxor.vv v3, v3, v11 + vxor.vv v4, v4, v12 + vxor.vv v5, v5, v13 + vxor.vv v6, v6, v14 + vxor.vv v7, v7, v15 + // write aligned+transposed outputs with a stride of 2*MAX_SIMD_DEGREE words + vse32.v v0, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v1, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v2, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v3, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v4, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v5, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v6, (a4) + addi a4, a4, TRANSPOSED_STRIDE_BYTES + vse32.v v7, (a4) + ret + +// a0: 64 zero-padded block bytes +// a1: block_len +// a2: cv +// a3: counter +// a4: flags +// a5: out_ptr +// a6: out_len +blake3_guts_riscv_rva23u64_xof_inner: + // t1 := total_blocks := (out_len + 63) / 64 + addi t1, a6, 63 + srli t1, t1, 6 + // t2 := full_blocks := out_len / 64 + srli t2, a6, 6 + // broadcast the CV to v0-7 + vsetvli zero, t1, e32, m1, ta, ma + lw t3, 0(a2) + vmv.v.x v0, t3 + lw t3, 4(a2) + vmv.v.x v1, t3 + lw t3, 8(a2) + vmv.v.x v2, t3 + lw t3, 12(a2) + vmv.v.x v3, t3 + lw t3, 16(a2) + vmv.v.x v4, t3 + lw t3, 20(a2) + vmv.v.x v5, t3 + lw t3, 24(a2) + vmv.v.x v6, t3 + lw t3, 28(a2) + vmv.v.x v7, t3 + // broadcast the block_words to v16-31 + lw t3, 0(a0) + vmv.v.x v16, t3 + lw t3, 4(a0) + vmv.v.x v17, t3 + lw t3, 8(a0) + vmv.v.x v18, t3 + lw t3, 12(a0) + vmv.v.x v19, t3 + lw t3, 16(a0) + vmv.v.x v20, t3 + lw t3, 20(a0) + vmv.v.x v21, t3 + lw t3, 24(a0) + vmv.v.x v22, t3 + lw t3, 28(a0) + vmv.v.x v23, t3 + lw t3, 32(a0) + vmv.v.x v24, t3 + lw t3, 36(a0) + vmv.v.x v25, t3 + lw t3, 40(a0) + vmv.v.x v26, t3 + lw t3, 44(a0) + vmv.v.x v27, t3 + lw t3, 48(a0) + vmv.v.x v28, t3 + lw t3, 52(a0) + vmv.v.x v29, t3 + lw t3, 56(a0) + vmv.v.x v30, t3 + lw t3, 60(a0) + vmv.v.x v31, t3 + // load the counter + vsetvli zero, t1, e64, m2, ta, ma + vmv.v.x v8, a3 + vid.v v10 + vadd.vv v8, v8, v10 + vsetvli zero, t1, e32, m1, ta, ma + vncvt.x.x.w v12, v8 + li t3, 32 + vnsrl.wx v13, v8, t3 + // broadcast the block length + vmv.v.x v14, a1 + // broadcast the flags + vmv.v.x v15, a4 + + // execute the kernel + mv t6, ra + call blake3_guts_riscv_rva23u64_kernel + mv ra, t6 + + // reload the CV, this time into v16-23 + lw t3, 0(a2) + vmv.v.x v16, t3 + lw t3, 4(a2) + vmv.v.x v17, t3 + lw t3, 8(a2) + vmv.v.x v18, t3 + lw t3, 12(a2) + vmv.v.x v19, t3 + lw t3, 16(a2) + vmv.v.x v20, t3 + lw t3, 20(a2) + vmv.v.x v21, t3 + lw t3, 24(a2) + vmv.v.x v22, t3 + lw t3, 28(a2) + vmv.v.x v23, t3 + // xor the two halves of the state and feed-forward the CV + vxor.vv v0, v0, v8 + vxor.vv v1, v1, v9 + vxor.vv v2, v2, v10 + vxor.vv v3, v3, v11 + vxor.vv v4, v4, v12 + vxor.vv v5, v5, v13 + vxor.vv v6, v6, v14 + vxor.vv v7, v7, v15 + vxor.vv v8, v8, v16 + vxor.vv v9, v9, v17 + vxor.vv v10, v10, v18 + vxor.vv v11, v11, v19 + vxor.vv v12, v12, v20 + vxor.vv v13, v13, v21 + vxor.vv v14, v14, v22 + vxor.vv v15, v15, v23 + ret + +// a0: 64 zero-padded block bytes +// a1: block_len +// a2: cv +// a3: counter +// a4: flags +// a5: out_ptr +// a6: out_len +.global blake3_guts_riscv_rva23u64_xof +blake3_guts_riscv_rva23u64_xof: + mv t5, ra + call blake3_guts_riscv_rva23u64_xof_inner + mv ra, t5 + + // t1 is now total_blocks, and t2 is full_blocks. Set vl to t2 and the + // tail policy to undisturbed. 
We'll handle full blocks with segmented + // stores, and then we'll use a separate branch for a partial final + // block, if any. + vsetvli zero, t2, e32, m1, tu, ma + + // Transpose and store full output blocks. These are "strided segment + // stores". Each vssseg8e32 instruction transposes 8 words from + // adjacent registers into 32 bytes of contiguous output, so we need + // two vssseg8e32 instructions to store full 64-byte blocks. We offset + // the second store by 32 bytes and use a 64-byte stride. + // NOTE: These stores might be misaligned. + li t0, 64 + addi t3, a5, 32 + vssseg8e32.v v0, (a5), t0 + vssseg8e32.v v8, (t3), t0 + + // If full_blocks != partial_blocks, we need to handle the final + // partial block. Otherwise, we're done. + bne t1, t2, blake3_guts_riscv_rva23u64_xof_partial_block + ret +blake3_guts_riscv_rva23u64_xof_partial_block: + // Collect groups of 4 words in v0, v4, v8, and v12. + vsetivli zero, 4, e32, m1, ta, ma + vslidedown.vx v0, v0, t2 + vslidedown.vx v1, v1, t2 + vslideup.vi v0, v1, 1 + vslidedown.vx v2, v2, t2 + vslideup.vi v0, v2, 2 + vslidedown.vx v3, v3, t2 + vslideup.vi v0, v3, 3 + vslidedown.vx v4, v4, t2 + vslidedown.vx v5, v5, t2 + vslideup.vi v4, v5, 1 + vslidedown.vx v6, v6, t2 + vslideup.vi v4, v6, 2 + vslidedown.vx v7, v7, t2 + vslideup.vi v4, v7, 3 + vslidedown.vx v8, v8, t2 + vslidedown.vx v9, v9, t2 + vslideup.vi v8, v9, 1 + vslidedown.vx v10, v10, t2 + vslideup.vi v8, v10, 2 + vslidedown.vx v11, v11, t2 + vslideup.vi v8, v11, 3 + vslidedown.vx v12, v12, t2 + vslidedown.vx v13, v13, t2 + vslideup.vi v12, v13, 1 + vslidedown.vx v14, v14, t2 + vslideup.vi v12, v14, 2 + vslidedown.vx v15, v15, t2 + vslideup.vi v12, v15, 3 + // Use LMUL=4 to guarantee that one vector register group can hold 16 + // words, and collect all 16 words in the v0 group. + vsetivli zero, 16, e32, m4, ta, ma + vslideup.vi v0, v4, 4 + vslideup.vi v0, v8, 8 + vslideup.vi v0, v12, 12 + // Switch to bytes and write the output. + andi t3, a6, 63 + add a5, a5, a6 + sub a5, a5, t3 + vsetvli zero, t3, e8, m4, ta, ma + vse8.v v0, (a5) + ret + +// a0: 64 zero-padded block bytes +// a1: block_len +// a2: cv +// a3: counter +// a4: flags +// a5: out_ptr +// a6: out_len +.global blake3_guts_riscv_rva23u64_xof_xor +blake3_guts_riscv_rva23u64_xof_xor: + mv t5, ra + call blake3_guts_riscv_rva23u64_xof_inner + mv ra, t5 + + // t1 is now total_blocks, and t2 is full_blocks. Set vl to t2 and the + // tail policy to undisturbed. We'll handle full blocks with segmented + // stores, and then we'll use a separate branch for a partial final + // block, if any. + vsetvli zero, t2, e32, m1, tu, ma + + // Do a transposed load of the caller's buffer, xor that with the state + // words, and do a transposed store. These are "strided segment" + // loads/stores. Each vlsseg8e32/vssseg8e32 instruction works with + // groups of 8 words or 32 bytes, so we need pairs of these + // instructions to handle full 64-byte blocks. We offset the second by + // 32 bytes and use a 64-byte stride. + // NOTE: These accesses might be misaligned. 
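+ // (In scalar terms, this routine is equivalent to the sketch
+ //     for i in 0..out_len { out[i] ^= xof_byte(i); }
+ // where `xof_byte(i)` is a name used only for this sketch, standing for the i-th
+ // XOF output byte at this counter; the vector code below just performs that XOR
+ // a whole group of transposed blocks at a time.)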
+ li t0, 64 + addi t3, a5, 32 + vlsseg8e32.v v16, (a5), t0 + vlsseg8e32.v v24, (t3), t0 + vxor.vv v0, v0, v16 + vxor.vv v1, v1, v17 + vxor.vv v2, v2, v18 + vxor.vv v3, v3, v19 + vxor.vv v4, v4, v20 + vxor.vv v5, v5, v21 + vxor.vv v6, v6, v22 + vxor.vv v7, v7, v23 + vxor.vv v8, v8, v24 + vxor.vv v9, v9, v25 + vxor.vv v10, v10, v26 + vxor.vv v11, v11, v27 + vxor.vv v12, v12, v28 + vxor.vv v13, v13, v29 + vxor.vv v14, v14, v30 + vxor.vv v15, v15, v31 + vssseg8e32.v v0, (a5), t0 + vssseg8e32.v v8, (t3), t0 + + // If full_blocks != partial_blocks, we need to handle the final + // partial block. Otherwise, we're done. + bne t1, t2, blake3_guts_riscv_rva23u64_xof_xor_partial_block + ret +blake3_guts_riscv_rva23u64_xof_xor_partial_block: + // Collect groups of 4 words in v0, v4, v8, and v12. + vsetivli zero, 4, e32, m1, ta, ma + vslidedown.vx v0, v0, t2 + vslidedown.vx v1, v1, t2 + vslideup.vi v0, v1, 1 + vslidedown.vx v2, v2, t2 + vslideup.vi v0, v2, 2 + vslidedown.vx v3, v3, t2 + vslideup.vi v0, v3, 3 + vslidedown.vx v4, v4, t2 + vslidedown.vx v5, v5, t2 + vslideup.vi v4, v5, 1 + vslidedown.vx v6, v6, t2 + vslideup.vi v4, v6, 2 + vslidedown.vx v7, v7, t2 + vslideup.vi v4, v7, 3 + vslidedown.vx v8, v8, t2 + vslidedown.vx v9, v9, t2 + vslideup.vi v8, v9, 1 + vslidedown.vx v10, v10, t2 + vslideup.vi v8, v10, 2 + vslidedown.vx v11, v11, t2 + vslideup.vi v8, v11, 3 + vslidedown.vx v12, v12, t2 + vslidedown.vx v13, v13, t2 + vslideup.vi v12, v13, 1 + vslidedown.vx v14, v14, t2 + vslideup.vi v12, v14, 2 + vslidedown.vx v15, v15, t2 + vslideup.vi v12, v15, 3 + // Use LMUL=4 to guarantee that one vector register group can hold 16 + // words, and collect all 16 words in the v0 group. + vsetivli zero, 16, e32, m4, ta, ma + vslideup.vi v0, v4, 4 + vslideup.vi v0, v8, 8 + vslideup.vi v0, v12, 12 + // Switch to bytes and read/xor/write the output. + andi t3, a6, 63 + add a5, a5, a6 + sub a5, a5, t3 + vsetvli zero, t3, e8, m4, ta, ma + vle8.v v4, (a5) + vxor.vv v0, v0, v4 + vse8.v v0, (a5) + ret + +// a0: input_ptr +// a1: input_len +// a2: key +// a3: counter +// a4: out_ptr +.global blake3_guts_riscv_rva23u64_universal_hash +blake3_guts_riscv_rva23u64_universal_hash: + // t0 := full_blocks := input_len / 64 + srli t0, a1, 6 + // Load and transpose full message blocks. These are "strided segment + // loads". Each vlsseg8e32 instruction transposes 8 words from multiple + // message blocks into 8 registers, so we need two vlsseg8e32 + // instructions (with the second offset by 32 bytes) to load full + // 64-byte blocks. The 64-byte stride equals the block size, because in + // this case (unlike hash_blocks) the blocks are adjacent. + // NOTE: These loads could be misaligned. As far as I know, the Linux + // RISC-V ABI allows misaligned loads and stores. If we need to support + // an environment that doesn't allow them (or where they're + // unacceptably slow), we could add a fallback here. + vsetvli zero, t0, e32, m1, ta, ma + li t1, 64 + addi t2, a0, 32 + vlsseg8e32.v v16, (a0), t1 + vlsseg8e32.v v24, (t2), t1 + // Broadcast the block length. + li t1, 64 + vmv.v.x v14, t1 + // If there's a partial block, handle it in an out-of-line branch. + andi t1, a1, 63 + bnez t1, universal_hash_handle_partial_block +universal_hash_partial_block_finished: + // Broadcast the key to v0-7. 
+ lw t1, 0(a2) + vmv.v.x v0, t1 + lw t1, 4(a2) + vmv.v.x v1, t1 + lw t1, 8(a2) + vmv.v.x v2, t1 + lw t1, 12(a2) + vmv.v.x v3, t1 + lw t1, 16(a2) + vmv.v.x v4, t1 + lw t1, 20(a2) + vmv.v.x v5, t1 + lw t1, 24(a2) + vmv.v.x v6, t1 + lw t1, 28(a2) + vmv.v.x v7, t1 + // Load the counter. + vsetvli zero, t0, e64, m2, ta, ma + vmv.v.x v8, a3 + vid.v v10 + vadd.vv v8, v8, v10 + vsetvli zero, t0, e32, m1, ta, ma + vncvt.x.x.w v12, v8 + li t1, 32 + vnsrl.wx v13, v8, t1 + // Broadcast the flags. + li t1, CHUNK_START | CHUNK_END | ROOT | KEYED_HASH + vmv.v.x v15, t1 + // Execute the kernel. + mv t6, ra + call blake3_guts_riscv_rva23u64_kernel + mv ra, t6 + // Finish the first four state vectors. The rest are dropped. + vxor.vv v0, v0, v8 + vxor.vv v1, v1, v9 + vxor.vv v2, v2, v10 + vxor.vv v3, v3, v11 + // XOR-reduce each vector. + vmv.v.i v4, 0 + vredxor.vs v0, v0, v4 + vredxor.vs v1, v1, v4 + vredxor.vs v2, v2, v4 + vredxor.vs v3, v3, v4 + // Write the output. + vmv.x.s t0, v0 + sw t0, 0(a4) + vmv.x.s t0, v1 + sw t0, 4(a4) + vmv.x.s t0, v2 + sw t0, 8(a4) + vmv.x.s t0, v3 + sw t0, 12(a4) + ret +universal_hash_handle_partial_block: + // Load the partial block into v8-v11. With LMUL=4, v8 is guaranteed to + // hold at least 64 bytes. Zero all 64 bytes first, for block padding. + // The block length is already in t1. + li t2, 64 + vsetvli zero, t2, e8, m4, ta, ma + vmv.v.i v8, 0 + vsetvli zero, t1, e8, m4, ta, ma + add t2, a0, a1 + sub t2, t2, t1 + vle8.v v8, (t2) + // If VLEN is longer than 128 bits (16 bytes), then half or all of the + // block bytes will be in v8. Make sure they're split evenly across + // v8-v11. + csrr t2, vlenb + li t3, 64 + bltu t2, t3, universal_hash_vlenb_less_than_64 + vsetivli zero, 8, e32, m1, ta, ma + vslidedown.vi v9, v8, 8 +universal_hash_vlenb_less_than_64: + li t3, 32 + bltu t2, t3, universal_hash_vlenb_less_than_32 + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.v v10, v9 + vslidedown.vi v11, v9, 4 + vslidedown.vi v9, v8, 4 +universal_hash_vlenb_less_than_32: + // Shift each of the words of the padded partial block to the end of + // the corresponding message vector. t0 was previously the number of + // full blocks. Now we increment it, so that it's the number of all + // blocks (both full and partial). + mv t2, t0 + addi t0, t0, 1 + // Set vl to at least 4, because v8-v11 each have 4 message words. + // Setting vl shorter will make vslide1down clobber those words. + li t3, 4 + maxu t3, t0, t3 + vsetvli zero, t3, e32, m1, ta, ma + vslideup.vx v16, v8, t2 + vslide1down.vx v8, v8, zero + vslideup.vx v17, v8, t2 + vslide1down.vx v8, v8, zero + vslideup.vx v18, v8, t2 + vslide1down.vx v8, v8, zero + vslideup.vx v19, v8, t2 + vslideup.vx v20, v9, t2 + vslide1down.vx v9, v9, zero + vslideup.vx v21, v9, t2 + vslide1down.vx v9, v9, zero + vslideup.vx v22, v9, t2 + vslide1down.vx v9, v9, zero + vslideup.vx v23, v9, t2 + vslideup.vx v24, v10, t2 + vslide1down.vx v10, v10, zero + vslideup.vx v25, v10, t2 + vslide1down.vx v10, v10, zero + vslideup.vx v26, v10, t2 + vslide1down.vx v10, v10, zero + vslideup.vx v27, v10, t2 + vslideup.vx v28, v11, t2 + vslide1down.vx v11, v11, zero + vslideup.vx v29, v11, t2 + vslide1down.vx v11, v11, zero + vslideup.vx v30, v11, t2 + vslide1down.vx v11, v11, zero + vslideup.vx v31, v11, t2 + // Set the updated VL. + vsetvli zero, t0, e32, m1, ta, ma + // Append the final block length, still in t1. 
+ vmv.v.x v8, t1 + addi t2, t0, -1 + vslideup.vx v14, v8, t2 + j universal_hash_partial_block_finished diff --git a/rust/guts/src/riscv_rva23u64.rs b/rust/guts/src/riscv_rva23u64.rs new file mode 100644 index 0000000..7f2a7ab --- /dev/null +++ b/rust/guts/src/riscv_rva23u64.rs @@ -0,0 +1,124 @@ +//! This implementation currently assumes riscv_rva23u64_zbb_zvbb. Zvbb in particular ("Vector +//! Bit-manipulation used in Cryptography") is a bleeding-edge extension that was only frozen a few +//! weeks ago at the time I'm writing this comment. Compiling and testing this code currently +//! requires quite a lot of effort, including building Clang from master and building QEMU from a +//! custom branch. Please don't expect this code to be usable on real hardware for some time. + +use crate::{BlockBytes, CVBytes, Implementation}; + +// NOTE: Keep this in sync with the same constant in assembly. +pub(crate) const MAX_SIMD_DEGREE: usize = 16; + +extern "C" { + fn blake3_guts_riscv_rva23u64_degree() -> usize; + fn blake3_guts_riscv_rva23u64_compress( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, + ); + fn blake3_guts_riscv_rva23u64_hash_chunks( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, + ); + fn blake3_guts_riscv_rva23u64_hash_parents( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, + ); + fn blake3_guts_riscv_rva23u64_xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, + ); + fn blake3_guts_riscv_rva23u64_xof_xor( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, + ); + fn blake3_guts_riscv_rva23u64_universal_hash( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], + ); +} + +pub fn implementation() -> Implementation { + Implementation::new( + blake3_guts_riscv_rva23u64_degree, + blake3_guts_riscv_rva23u64_compress, + blake3_guts_riscv_rva23u64_hash_chunks, + blake3_guts_riscv_rva23u64_hash_parents, + blake3_guts_riscv_rva23u64_xof, + blake3_guts_riscv_rva23u64_xof_xor, + blake3_guts_riscv_rva23u64_universal_hash, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress_vs_portable() { + crate::test::test_compress_vs_portable(&implementation()); + } + + #[test] + fn test_compress_vs_reference() { + crate::test::test_compress_vs_reference(&implementation()); + } + + #[test] + fn test_hash_chunks_vs_portable() { + crate::test::test_hash_chunks_vs_portable(&implementation()); + } + + #[test] + fn test_hash_parents_vs_portable() { + crate::test::test_hash_parents_vs_portable(&implementation()); + } + + #[test] + fn test_chunks_and_parents_vs_reference() { + crate::test::test_chunks_and_parents_vs_reference(&implementation()); + } + + #[test] + fn test_xof_vs_portable() { + crate::test::test_xof_vs_portable(&implementation()); + } + + #[test] + fn test_xof_vs_reference() { + crate::test::test_xof_vs_reference(&implementation()); + } + + #[test] + fn test_universal_hash_vs_portable() { + crate::test::test_universal_hash_vs_portable(&implementation()); + } + + #[test] + fn test_universal_hash_vs_reference() { + crate::test::test_universal_hash_vs_reference(&implementation()); + } +}
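+
+// A possible extra sanity check alongside the tests above (a sketch, not part of this
+// patch): RVA23 mandates V with VLEN >= 128 bits, so the degree reported by the
+// assembly should be a power of two between 4 and MAX_SIMD_DEGREE. In the test
+// module this could look like:
+//
+//     #[test]
+//     fn test_degree_range() {
+//         let degree = unsafe { blake3_guts_riscv_rva23u64_degree() };
+//         assert!(degree.is_power_of_two());
+//         assert!((4..=MAX_SIMD_DEGREE).contains(&degree));
+//     }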