mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-06-07 23:46:08 +02:00
Implement RVV backend
This commit is contained in:
parent
d60b753572
commit
9c468922bf
|
@ -18,6 +18,11 @@ default = ["std"]
|
|||
# implementation uses C intrinsics and requires a C compiler.
|
||||
neon = []
|
||||
|
||||
# The RVV implementation does not participate in dynamic feature detection,
|
||||
# which is currently x86-only. If "rvv" is on, RVV support is assumed. The
|
||||
# RVV implementation uses C intrinsics and requires a C compiler.
|
||||
rvv = []
|
||||
|
||||
# This crate uses libstd for std::io trait implementations, and also for
|
||||
# runtime CPU feature detection. This feature is enabled by default. If you use
|
||||
# --no-default-features, the only way to use the SIMD implementations in this
|
||||
|
|
|
@ -85,6 +85,14 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
#[cfg(blake3_rvv)]
// Renamed from a copy-pasted `bench_single_compression_avx512`, which
// collides with the existing AVX-512 bench of the same name.
fn bench_single_compression_rvv(b: &mut Bencher) {
    // RVV does not participate in dynamic feature detection; if the cfg is
    // set, support is assumed and `Platform::rvv()` returns `Some`.
    if let Some(platform) = Platform::rvv() {
        bench_single_compression_fn(b, platform);
    }
}
|
||||
|
||||
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
|
||||
let degree = platform.simd_degree();
|
||||
let mut inputs = Vec::new();
|
||||
|
@ -151,6 +159,14 @@ fn bench_many_chunks_neon(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
// Gate on `blake3_rvv` (emitted by build.rs) rather than `feature = "rvv"`:
// `Platform::rvv()` only exists under `cfg(blake3_rvv)`, so gating on the
// feature alone would fail to compile when the feature is enabled on a
// non-RISC-V target.
#[cfg(blake3_rvv)]
// Renamed from a copy-pasted `bench_many_chunks_neon`, which collides with
// the existing NEON bench of the same name.
fn bench_many_chunks_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_chunks_fn(b, platform);
    }
}
|
||||
|
||||
// TODO: When we get const generics we can unify this with the chunks code.
|
||||
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
|
||||
let degree = platform.simd_degree();
|
||||
|
@ -218,6 +234,14 @@ fn bench_many_parents_neon(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
// Gate on `blake3_rvv` (emitted by build.rs) rather than `feature = "rvv"`:
// `Platform::rvv()` only exists under `cfg(blake3_rvv)`, so this bench would
// not compile with the feature enabled on a non-RISC-V target.
#[cfg(blake3_rvv)]
fn bench_many_parents_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_parents_fn(b, platform);
    }
}
|
||||
|
||||
fn bench_atonce(b: &mut Bencher, len: usize) {
|
||||
let mut input = RandomInput::new(b, len);
|
||||
b.iter(|| blake3::hash(input.get()));
|
||||
|
|
46
build.rs
46
build.rs
|
@ -1,3 +1,4 @@
|
|||
use core::panic;
|
||||
use std::env;
|
||||
|
||||
fn defined(var: &str) -> bool {
|
||||
|
@ -21,6 +22,14 @@ fn is_no_neon() -> bool {
|
|||
defined("CARGO_FEATURE_NO_NEON")
|
||||
}
|
||||
|
||||
fn is_rvv() -> bool {
|
||||
cfg!(feature = "rvv")
|
||||
}
|
||||
|
||||
fn is_no_rvv() -> bool {
|
||||
cfg!(not(feature = "rvv"))
|
||||
}
|
||||
|
||||
fn is_ci() -> bool {
|
||||
defined("BLAKE3_CI")
|
||||
}
|
||||
|
@ -60,6 +69,18 @@ fn is_armv7() -> bool {
|
|||
target_components()[0] == "armv7"
|
||||
}
|
||||
|
||||
/// Whether the compilation target is 32-bit RISC-V.
fn is_riscv32() -> bool {
    match std::env::var("CARGO_CFG_TARGET_ARCH") {
        Ok(arch) => arch == "riscv32",
        Err(_) => false,
    }
}
|
||||
|
||||
/// Whether the compilation target is 64-bit RISC-V.
fn is_riscv64() -> bool {
    match std::env::var("CARGO_CFG_TARGET_ARCH") {
        Ok(arch) => arch == "riscv64",
        Err(_) => false,
    }
}
|
||||
|
||||
fn endianness() -> String {
|
||||
let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
|
||||
assert!(endianness == "little" || endianness == "big");
|
||||
|
@ -239,15 +260,33 @@ fn build_neon_c_intrinsics() {
|
|||
build.compile("blake3_neon");
|
||||
}
|
||||
|
||||
/// Compiles the C RVV intrinsics implementation (c/blake3_rvv.c).
///
/// The `-march` flags select the base ISA (`rv32gc`/`rv64gc`) plus version
/// 1.0 of the vector extension (`v1p0`); the resulting object requires
/// RVV-capable hardware at runtime (no dynamic detection is done).
fn build_rvv_c_intrinsics() {
    let mut build = new_build();
    build.file("c/blake3_rvv.c");
    if is_riscv32() {
        build.flag("-march=rv32gcv1p0");
    }
    if is_riscv64() {
        build.flag("-march=rv64gcv1p0");
    }
    build.compile("blake3_rvv");
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
if is_pure() && is_neon() {
|
||||
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
|
||||
}
|
||||
|
||||
if is_no_neon() && is_neon() {
|
||||
panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\".");
|
||||
}
|
||||
|
||||
if is_pure() && is_rvv() {
|
||||
panic!("It doesn't make sense to enable both \"pure\" and \"rvv\".");
|
||||
}
|
||||
if is_no_rvv() && is_rvv() {
|
||||
panic!("It doesn't make sense to enable both \"no_rvv\" and \"rvv\".");
|
||||
}
|
||||
|
||||
if is_x86_64() || is_x86_32() {
|
||||
let support = c_compiler_support();
|
||||
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
|
||||
|
@ -278,6 +317,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
build_neon_c_intrinsics();
|
||||
}
|
||||
|
||||
if (is_riscv32() || is_riscv64()) && is_rvv() {
|
||||
println!("cargo:rustc-cfg=blake3_rvv");
|
||||
build_rvv_c_intrinsics();
|
||||
}
|
||||
|
||||
// The `cc` crate doesn't automatically emit rerun-if directives for the
|
||||
// environment variables it supports, in particular for $CC. We expect to
|
||||
// do a lot of benchmarking across different compilers, so we explicitly
|
||||
|
|
|
@ -33,6 +33,7 @@ include(GNUInstallDirs)
|
|||
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
|
||||
set(BLAKE3_X86_NAMES i686 x86 X86)
|
||||
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
|
||||
set(BLAKE3_RISCV64_NAMES riscv64)
|
||||
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
|
||||
if(MSVC)
|
||||
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
|
||||
|
@ -48,6 +49,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
|
|||
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
|
||||
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
|
||||
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
|
||||
set(BLAKE3_CFLAGS_RVV_RISCV64 "-march=rv64gcv1p0" CACHE STRING "the compiler flags to enable RVV for riscv64")
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
|
||||
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
|
@ -95,8 +97,10 @@ macro(BLAKE3_DISABLE_SIMD)
|
|||
set(BLAKE3_SIMD_AMD64_ASM OFF)
|
||||
set(BLAKE3_SIMD_X86_INTRINSICS OFF)
|
||||
set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
|
||||
set(BLAKE3_VLA_RVV_INTRINSICS OFF)
|
||||
target_compile_definitions(blake3 PRIVATE
|
||||
BLAKE3_USE_NEON=0
|
||||
BLAKE3_USE_RVV=0
|
||||
BLAKE3_NO_SSE2
|
||||
BLAKE3_NO_SSE41
|
||||
BLAKE3_NO_AVX2
|
||||
|
@ -179,6 +183,13 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
|
|||
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
|
||||
endif()
|
||||
|
||||
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_RISCV64_NAMES
|
||||
OR BLAKE3_USE_RVV_INTRINSICS))
|
||||
set(BLAKE3_VLA_RVV_INTRINSICS ON)
|
||||
target_sources(blake3 PRIVATE
|
||||
blake3_rvv.c
|
||||
)
|
||||
set_source_files_properties(blake3_rvv.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_RVV_RISCV64}")
|
||||
else()
|
||||
BLAKE3_DISABLE_SIMD()
|
||||
endif()
|
||||
|
@ -216,4 +227,5 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
|
|||
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
|
||||
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
|
||||
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
|
||||
add_feature_info("RISC-V RVV intrinsics" BLAKE3_VLA_RVV_INTRINSICS "The library uses RISC-V RVV intrinsics.")
|
||||
feature_summary(WHAT ENABLED_FEATURES)
|
||||
|
|
|
@ -192,6 +192,15 @@ fn test_compress_avx512() {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
#[cfg(feature = "rvv")]
fn test_compress_rvv() {
    // Fixed copy-paste from the AVX-512 test above: the RVV bindings live in
    // `ffi::rvv` (as `test_hash_many_rvv` already uses), not `ffi::x86`,
    // which declares no `*_rvv` symbols.
    test_compress_fn(
        crate::ffi::rvv::blake3_compress_in_place_rvv,
        crate::ffi::rvv::blake3_compress_xof_rvv,
    );
}
|
||||
|
||||
type HashManyFn = unsafe extern "C" fn(
|
||||
inputs: *const *const u8,
|
||||
num_inputs: usize,
|
||||
|
@ -359,6 +368,12 @@ fn test_hash_many_neon() {
|
|||
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
|
||||
}
|
||||
|
||||
#[test]
#[cfg(feature = "rvv")]
// RVV support is assumed whenever the "rvv" feature is on; there is no
// runtime feature detection for RISC-V (x86-only), so this test will fault
// on hardware without RVV.
fn test_hash_many_rvv() {
    test_hash_many_fn(crate::ffi::rvv::blake3_hash_many_rvv);
}
|
||||
|
||||
#[test]
|
||||
fn test_compare_reference_impl() {
|
||||
const OUT: usize = 303; // more than 64, not a multiple of 4
|
||||
|
|
|
@ -267,6 +267,11 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|||
return;
|
||||
#endif
|
||||
|
||||
#if BLAKE3_USE_RVV == 1
  blake3_hash_many_rvv(inputs, num_inputs, blocks, key, counter,
                       increment_counter, flags, flags_start, flags_end, out);
  // Return like the other SIMD branches above do; without this, control
  // falls through and the portable implementation hashes everything again,
  // clobbering `out`.
  return;
#endif
|
||||
|
||||
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
|
||||
increment_counter, flags, flags_start, flags_end,
|
||||
out);
|
||||
|
|
|
@ -281,5 +281,22 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
|||
uint8_t flags_end, uint8_t *out);
|
||||
#endif
|
||||
|
||||
#if BLAKE3_USE_RVV == 1
|
||||
void blake3_compress_in_place_rvv(uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags);
|
||||
|
||||
void blake3_compress_xof_rvv(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[64]);
|
||||
|
||||
void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t *out);
|
||||
#endif
|
||||
|
||||
#endif /* BLAKE3_IMPL_H */
|
||||
|
|
|
@ -0,0 +1,300 @@
|
|||
#include "blake3_impl.h"
|
||||
#include <riscv_vector.h>
|
||||
|
||||
// Lane-wise rotate-right of the first `vl` 32-bit lanes of `src` by `shr`
// lane positions (rotates the *vector*, not the bits of each element — see
// rvv_vror_s for the bitwise variant). Built from two slides: the low `shr`
// lanes are slid up to the top, then the remaining lanes slid down into the
// bottom `delta` positions (tail-undisturbed keeps the slid-up part intact).
INLINE vuint32m1_t rvv_vror_v(vuint32m1_t src, size_t shr, size_t vl) {
  vuint32m1_t dst = __riscv_vundefined_u32m1();
  size_t delta = vl - shr;
  dst = __riscv_vslideup(dst, src, delta, vl); // VL = vl
  dst = __riscv_vslidedown_tu(dst, src, shr, delta); // VL = delta
  return dst;
}
|
||||
|
||||
// Bitwise rotate-right of each 32-bit element by `shr` bits, applied to the
// first `vl` lanes: (x >> shr) | (x << (32 - shr)).
INLINE vuint32m1_t rvv_vror_s(vuint32m1_t src, size_t shr, size_t vl) {
  size_t shl = sizeof(uint32_t) * 8 - shr;
  vuint32m1_t hi_part = __riscv_vsll(src, shl, vl);
  vuint32m1_t lo_part = __riscv_vsrl(src, shr, vl);
  return __riscv_vor(lo_part, hi_part, vl);
}
|
||||
|
||||
// NOTE: See the following for several approaches to transposing matrices with RVV:
|
||||
// https://fprox.substack.com/p/transposing-a-matrix-using-risc-v
|
||||
//
|
||||
// This version of transpose_nxn uses the strided store approach, which easily scales to NxN
|
||||
// matrices. For LMUL=1, the data at the above link suggests that this approach may be less
|
||||
// efficient than a scalar implementation for large N > 16. However, for LMUL=8, the data suggests
|
||||
// this vectorized approach may be more efficient than a scalar implementation at least up to N=512.
|
||||
//
|
||||
// If we assume a typical vector size of 128b (the minimum; no idea how representative this will
|
||||
// be), then LMUL=8 gives us vector groups of 1024b (x 4, since RVV has 32 registers). This should
|
||||
// let us process 32 state vectors at a time (c.f., 8 for AVX2, 16 for AVX512). Wider base vector
|
||||
// registers would increase this further, of course.
|
||||
//
|
||||
// One of the more efficient alternative approaches would be to use segmented loads/stores. However,
|
||||
// this would limit us to 8-row matrices, based on the currently supported vector tuple-sizes.
|
||||
//
|
||||
// With larger vector registers (or larger LMUL), the in-register masked slide approach may also be
|
||||
// worth exploring.
|
||||
|
||||
INLINE
|
||||
void rvv_transpose_nxn(uint32_t *dst, uint32_t *src, size_t n) {
|
||||
for (size_t row_idx = 0; row_idx < n; row_idx += 1) {
|
||||
size_t avl = n;
|
||||
uint32_t *row_src = src + row_idx * n;
|
||||
uint32_t *row_dst = dst + row_idx;
|
||||
for (
|
||||
/* clang-format off */
|
||||
size_t vl = __riscv_vsetvl_e32m8(avl);
|
||||
0 < avl;
|
||||
avl -= vl,
|
||||
row_src += vl,
|
||||
row_dst += vl * n
|
||||
/* clang-format on */
|
||||
) {
|
||||
vuint32m8_t row = __riscv_vle32_v_u32m8(row_src, vl);
|
||||
__riscv_vsse32(row_dst, sizeof(uint32_t) * n, row, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Interleaves the low lanes of two 4-lane u32 vectors (the RVV analogue of
// SSE's _mm_unpacklo_epi32): odd result lanes are taken from `op1` via the
// merge mask, after slides move both operands' low lanes into position.
// NOTE(review): the mask source is built with vmv(0xAAAA, VL=1); presumably
// only the low mask bits (0b...1010 -> odd lanes) are consumed at VL=4 —
// confirm against the mask-register layout.
INLINE
vuint32m1_t rvv_zip_lo_u32(vuint32m1_t op0, vuint32m1_t op1) {
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslideup_tu(op0, op0, 1, 4); // VL = 4
  op1 = __riscv_vslideup_tu(op1, op1, 2, 4); // VL = 4
  op1 = __riscv_vslideup_tu(op1, op1, 1, 2); // VL = 2
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
|
||||
|
||||
// Interleaves the high lanes of two 4-lane u32 vectors (the RVV analogue of
// SSE's _mm_unpackhi_epi32): slides move both operands' high lanes down into
// position, then odd result lanes are merged in from `op1`.
// NOTE(review): shares rvv_zip_lo_u32's vmv(0xAAAA, VL=1) mask construction —
// confirm the mask-register layout assumption there applies here too.
INLINE
vuint32m1_t rvv_zip_hi_u32(vuint32m1_t op0, vuint32m1_t op1) {
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 4); // VL = 4
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 2); // VL = 2
  op1 = __riscv_vslidedown_tu(op1, op1, 1, 2); // VL = 2
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
|
||||
|
||||
// Concatenates the low 64-bit halves of two vectors, viewed as u64 lanes:
// result = { op0[63:0], op1[63:0] } (analogue of _mm_unpacklo_epi64). The
// slide-up with VL=2 places op1's lane 0 into result lane 1.
INLINE
vuint32m1_t rvv_zip_lo_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  vuint64m1_t dst_u64 = __riscv_vslideup_tu(op0_u64, op1_u64, 1, 2); // VL = 2
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
|
||||
|
||||
// Concatenates the high 64-bit halves of two vectors, viewed as u64 lanes:
// result = { op0[127:64], op1[127:64] } (analogue of _mm_unpackhi_epi64).
// The tail-undisturbed slide-down with VL=1 writes op0's lane 1 into result
// lane 0 while leaving op1's lane 1 untouched in result lane 1.
INLINE
vuint32m1_t rvv_zip_hi_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  vuint64m1_t dst_u64 = __riscv_vslidedown_tu(op1_u64, op0_u64, 1, 1); // VL = 1
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
|
||||
|
||||
// Gathers lanes of both operands through the index table `tab`, taking the
// low two result lanes from the `op0` gather and the high two from the `op1`
// gather: op1 is permuted at VL=4, then op0's permutation overwrites lanes
// 0..1 with a tail-undisturbed gather at VL=2.
INLINE
vuint32m1_t rvv_shuffle_zip_lo_hi_u256(vuint32m1_t op0, vuint32m1_t op1,
                                       vuint32m1_t tab) {
  op1 = __riscv_vrgather_vv_u32m1(op1, tab, 4); // VL = 4
  op0 = __riscv_vrgather_vv_u32m1_tu(op1, op0, tab, 2); // VL = 2
  return op0;
}
|
||||
|
||||
// Arbitrary 4-lane permutation of `src` by the index table `tab`
// (analogue of _mm_shuffle_epi32 with a vector-supplied control).
INLINE
vuint32m1_t rvv_shuffle_u128(vuint32m1_t src, vuint32m1_t tab) {
  return __riscv_vrgather_vv_u32m1(src, tab, 4); // VL = 4
}
|
||||
|
||||
// Blends 16-bit lanes of two vectors by a bitmask (analogue of
// _mm_blend_epi16): mask bit i set selects op1's u16 lane i.
// NOTE(review): the merge runs at VL=4 over u16 elements, i.e. only the low
// 64 bits are blended; a full 128-bit blend would need VL=8 — confirm
// whether callers rely on the upper lanes.
INLINE
vuint32m1_t rvv_blend_u16(vuint32m1_t op0, vuint32m1_t op1, uint16_t mask) {
  vuint16m1_t op0_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op0);
  vuint16m1_t op1_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op1);
  vbool16_t mask_u16 = __riscv_vreinterpret_v_u16m1_b16(__riscv_vmv_v_x_u16m1(mask, 1));
  vuint16m1_t dst = __riscv_vmerge_vvm_u16m1(op0_u16, op1_u16, mask_u16, 4); // VL = 4
  return __riscv_vreinterpret_v_u16m1_u32m1(dst);
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* compress_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// First half of the BLAKE3 G function, applied column-wise across the four
// state rows: mix the message word into row0, then xor-rotate row3 by 16 and
// row1 by 12 (the 16/12 rotation pair).
INLINE void g1(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 16, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 12, vl);
}
|
||||
|
||||
// Second half of the BLAKE3 G function: identical structure to g1 but with
// the 8/7 rotation pair, completing one column/diagonal mixing step.
INLINE void g2(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 8, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 7, vl);
}
|
||||
|
||||
// Lane-rotates rows 0, 2 and 3 so the next G application mixes diagonals
// instead of columns (row1 stays fixed, as in the SSE implementations).
INLINE void diagonalize(vuint32m1_t *row0, vuint32m1_t *row2, vuint32m1_t *row3,
                        size_t vl) {
  *row0 = rvv_vror_v(*row0, 3, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 1, vl);
}
|
||||
|
||||
// Inverse of diagonalize: rotates rows 0, 2 and 3 by the complementary lane
// counts (1/2/3) to restore column alignment after the diagonal round.
INLINE void undiagonalize(vuint32m1_t *row0, vuint32m1_t *row2,
                          vuint32m1_t *row3, size_t vl) {
  *row0 = rvv_vror_v(*row0, 1, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 3, vl);
}
|
||||
|
||||
// Stub: will load the state/message into `rows` and run the seven rounds,
// mirroring the compress_pre of the other SIMD backends. The (void) casts
// silence unused-parameter warnings until the body lands.
INLINE void compress_pre(vuint32m1x4_t *rows, const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         size_t vl) {
  (void)rows;
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)vl;

  // NOTE(review): the index tables below are presumably working notes for
  // the message-schedule permutation / shuffle constants — confirm once the
  // round function is implemented.
  // 0, 0, 3, 3
  // 0, 1, 3, 2
  // 1, 3, 2, 0
  // 2, 0, 2, 0
  // 3, 1, 1, 2
  // 3, 1, 3, 1
  // 3, 3, 2, 2

  // 2, 1, 0, 3 (rotate)
  // 1, 3, 2, 0 (rotate)
  // 0, 3, 2, 1 (rotate)
}
|
||||
|
||||
// Stub: currently only asserts argument alignment and returns without
// writing `out`. NOTE(review): until implemented, any caller dispatching to
// this produces garbage output — keep it out of the dispatch path.
void blake3_compress_xof_rvv(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags,
                             uint8_t out[64]) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)out;
}
|
||||
|
||||
// Stub: currently only asserts block alignment and leaves `cv` unmodified.
// NOTE(review): hash_one_rvv already calls this, so chaining values are not
// actually updated until this is implemented.
void blake3_compress_in_place_rvv(uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash_vl_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// Placeholder stubs for the vector-length-agnostic hashing kernel, mirroring
// the round_fn / transpose_vecs / transpose_msg_vecs / load_counters helpers
// of the other SIMD backends. Bodies to be filled in as the implementation
// lands.
void round_fn_vl() {
  //
}

void transpose_vecs_vl() {
  //
}

void transpose_msg_vecs_vl() {
  //
}

void load_counters_vl() {
  //
}
|
||||
|
||||
// Stub for the batched kernel: hashes `vl` inputs at a time once
// implemented. Currently only asserts pointer alignment and returns without
// touching `out`. NOTE(review): blake3_hash_many_rvv already loops over this,
// so its output is not yet meaningful.
void blake3_hash_vl_rvv(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out,
                        size_t vl) {
  assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)inputs;
  (void)blocks;
  (void)key;
  (void)counter;
  (void)increment_counter;
  (void)flags;
  (void)flags_start;
  (void)flags_end;
  (void)out;
  (void)vl;
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash_many_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
INLINE void hash_one_rvv(const uint8_t *input, size_t blocks,
|
||||
const uint32_t key[8], uint64_t counter, uint8_t flags,
|
||||
uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t out[BLAKE3_OUT_LEN]) {
|
||||
assert((uintptr_t)&input[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
uint32_t cv[8];
|
||||
memcpy(cv, key, BLAKE3_KEY_LEN);
|
||||
uint8_t block_flags = flags | flags_start;
|
||||
while (blocks > 0) {
|
||||
block_flags |= blocks == 1 ? flags_end : 0;
|
||||
if (blocks == 1) {
|
||||
block_flags |= flags_end;
|
||||
}
|
||||
blake3_compress_in_place_rvv(cv, input, BLAKE3_BLOCK_LEN, counter,
|
||||
block_flags);
|
||||
input = &input[BLAKE3_BLOCK_LEN];
|
||||
blocks -= 1;
|
||||
block_flags = flags;
|
||||
}
|
||||
memcpy(out, cv, BLAKE3_OUT_LEN);
|
||||
}
|
||||
|
||||
void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t *out) {
|
||||
assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
for (
|
||||
/* clang-format off */
|
||||
size_t vl = __riscv_vsetvl_e32m1(num_inputs);
|
||||
num_inputs > 0;
|
||||
num_inputs -= vl,
|
||||
inputs += vl,
|
||||
counter += increment_counter * vl,
|
||||
out = &out[vl * BLAKE3_OUT_LEN]
|
||||
/* clang-format on */
|
||||
) {
|
||||
blake3_hash_vl_rvv(inputs, blocks, key, counter, increment_counter, flags,
|
||||
flags_start, flags_end, out, vl);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Compresses one block into `cv` in place via the C RVV implementation
/// (`blake3_compress_in_place_rvv`).
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    ffi::blake3_compress_in_place_rvv(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Compresses one block via the C RVV implementation and returns the full
/// 64 bytes of extended (XOF) output.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    // The C function writes all 64 bytes of `out`.
    let mut out = [0u8; 64];
    ffi::blake3_compress_xof_rvv(
        cv.as_ptr(),
        block.as_ptr(),
        block_len,
        counter,
        flags,
        out.as_mut_ptr(),
    );
    out
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Hashes a batch of equal-length inputs with the C RVV implementation,
/// writing `OUT_LEN` bytes of output per input into `out`.
///
/// NOTE(review): `N / BLOCK_LEN` truncates, so this assumes `N` is a
/// multiple of `BLOCK_LEN` — confirm against call sites.
pub unsafe fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // The Rust hash_many implementations do bounds checking on the `out`
    // array, but the C implementations don't. Even though this is an unsafe
    // function, assert the bounds here.
    assert!(out.len() >= inputs.len() * OUT_LEN);
    ffi::blake3_hash_many_rvv(
        inputs.as_ptr() as *const *const u8,
        inputs.len(),
        N / BLOCK_LEN,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
}
|
||||
|
||||
/// Raw FFI declarations for the C RVV backend (c/blake3_rvv.c). These must
/// stay in sync with the prototypes guarded by `BLAKE3_USE_RVV` in
/// c/blake3_impl.h.
pub mod ffi {
    extern "C" {
        /// Compresses one 64-byte block into `cv` in place.
        pub fn blake3_compress_in_place_rvv(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        /// Compresses one block and writes 64 bytes of extended output to
        /// `out`.
        pub fn blake3_compress_xof_rvv(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        /// Hashes `num_inputs` inputs of `blocks` blocks each, writing 32
        /// bytes per input to `out`.
        pub fn blake3_hash_many_rvv(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    // This entire file is gated on feature="rvv", so RVV support is
    // assumed here.

    // TODO: test transpose?

    // Checks compress_in_place/compress_xof against the shared test vectors.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Exercises hash_many through the shared batched-hash test harness.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
|
|
@ -66,6 +66,10 @@
|
|||
//! enabling this feature will produce a binary that's not portable to CPUs
|
||||
//! without NEON support.
|
||||
//!
|
||||
//! The RVV implementation requires the `rvv` feature for RISC-V targets. Not
|
||||
//! all RISC-V CPUs support RVV, and enabling this feature will produce a binary
|
||||
//! that's not portable to CPUs without RVV support.
|
||||
//!
|
||||
//! The `traits-preview` feature enables implementations of traits from the
|
||||
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
|
||||
//! However, the traits aren't stable, and they're expected to change in
|
||||
|
@ -114,6 +118,9 @@ mod avx512;
|
|||
#[path = "ffi_neon.rs"]
|
||||
mod neon;
|
||||
mod portable;
|
||||
#[cfg(blake3_rvv)]
|
||||
#[path = "ffi_rvv.rs"]
|
||||
mod rvv;
|
||||
#[cfg(blake3_sse2_rust)]
|
||||
#[path = "rust_sse2.rs"]
|
||||
mod sse2;
|
||||
|
|
|
@ -51,6 +51,8 @@ pub enum Platform {
|
|||
AVX512,
|
||||
#[cfg(blake3_neon)]
|
||||
NEON,
|
||||
#[cfg(blake3_rvv)]
|
||||
RVV,
|
||||
}
|
||||
|
||||
impl Platform {
|
||||
|
@ -80,6 +82,12 @@ impl Platform {
|
|||
{
|
||||
return Platform::NEON;
|
||||
}
|
||||
// We don't use dynamic feature detection for RVV. If the "rvv"
|
||||
// feature is on, RVV is assumed to be supported.
|
||||
#[cfg(blake3_rvv)]
|
||||
{
|
||||
return Platform::RVV;
|
||||
}
|
||||
Platform::Portable
|
||||
}
|
||||
|
||||
|
@ -97,6 +105,8 @@ impl Platform {
|
|||
Platform::AVX512 => 16,
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => 4,
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => todo!(),
|
||||
};
|
||||
debug_assert!(degree <= MAX_SIMD_DEGREE);
|
||||
degree
|
||||
|
@ -131,6 +141,10 @@ impl Platform {
|
|||
// No NEON compress_in_place() implementation yet.
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::compress_in_place(cv, block, block_len, counter, flags)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -163,6 +177,11 @@ impl Platform {
|
|||
// No NEON compress_xof() implementation yet.
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
|
||||
// Assumed to be safe if the "rvv" feature is on.
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::compress_xof(cv, block, block_len, counter, flags)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -269,6 +288,20 @@ impl Platform {
|
|||
out,
|
||||
)
|
||||
},
|
||||
// Assumed to be safe if the "rvv" feature is on.
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::hash_many(
|
||||
inputs,
|
||||
key,
|
||||
counter,
|
||||
increment_counter,
|
||||
flags,
|
||||
flags_start,
|
||||
flags_end,
|
||||
out,
|
||||
)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -320,6 +353,12 @@ impl Platform {
|
|||
// Assumed to be safe if the "neon" feature is on.
|
||||
Some(Self::NEON)
|
||||
}
|
||||
|
||||
#[cfg(blake3_rvv)]
/// Returns the RVV platform. Always `Some` when compiled in: RVV does not
/// participate in dynamic feature detection, so enabling the "rvv" feature
/// asserts that the target hardware supports it.
pub fn rvv() -> Option<Self> {
    // Assumed to be safe if the "rvv" feature is on.
    Some(Self::RVV)
}
|
||||
}
|
||||
|
||||
// Note that AVX-512 is divided into multiple featuresets, and we use two of
|
||||
|
|
Loading…
Reference in New Issue