1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-06-07 23:46:08 +02:00

Implement RVV backend

This commit is contained in:
silvanshade 2024-01-15 20:12:55 -07:00
parent d60b753572
commit 9c468922bf
No known key found for this signature in database
11 changed files with 582 additions and 1 deletions

View File

@ -18,6 +18,11 @@ default = ["std"]
# implementation uses C intrinsics and requires a C compiler.
neon = []
# The RVV implementation does not participate in dynamic feature detection,
# which is currently x86-only. If "rvv" is on, RVV support is assumed. The
# RVV implementation uses C intrinsics and requires a C compiler.
rvv = []
# This crate uses libstd for std::io trait implementations, and also for
# runtime CPU feature detection. This feature is enabled by default. If you use
# --no-default-features, the only way to use the SIMD implementations in this

View File

@ -85,6 +85,14 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
}
}
#[bench]
#[cfg(blake3_rvv)]
// Benchmarks a single compression on the RVV backend. Renamed from the
// copy-pasted `bench_single_compression_avx512`, which would collide with the
// AVX-512 bench of the same name.
fn bench_single_compression_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_single_compression_fn(b, platform);
    }
}
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
@ -151,6 +159,14 @@ fn bench_many_chunks_neon(b: &mut Bencher) {
}
}
#[bench]
// Gate on the `blake3_rvv` cfg (set by build.rs for RISC-V targets) rather
// than the raw feature: `Platform::rvv()` only exists under that cfg, so the
// feature-gated form fails to compile on non-RISC-V hosts with "rvv" enabled.
#[cfg(blake3_rvv)]
// Renamed from the copy-pasted `bench_many_chunks_neon`, which would collide
// with the NEON bench of the same name.
fn bench_many_chunks_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_chunks_fn(b, platform);
    }
}
// TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
@ -218,6 +234,14 @@ fn bench_many_parents_neon(b: &mut Bencher) {
}
}
#[bench]
// Gate on the `blake3_rvv` cfg (set by build.rs for RISC-V targets) rather
// than the raw feature: `Platform::rvv()` only exists under that cfg, so the
// feature-gated form fails to compile on non-RISC-V hosts with "rvv" enabled.
#[cfg(blake3_rvv)]
fn bench_many_parents_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_parents_fn(b, platform);
    }
}
fn bench_atonce(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| blake3::hash(input.get()));

View File

@ -1,3 +1,4 @@
use core::panic;
use std::env;
fn defined(var: &str) -> bool {
@ -21,6 +22,14 @@ fn is_no_neon() -> bool {
defined("CARGO_FEATURE_NO_NEON")
}
fn is_rvv() -> bool {
cfg!(feature = "rvv")
}
fn is_no_rvv() -> bool {
cfg!(not(feature = "rvv"))
}
fn is_ci() -> bool {
defined("BLAKE3_CI")
}
@ -60,6 +69,18 @@ fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
// True when the build target is 32-bit RISC-V (native or cross-compiled).
// Returns false when CARGO_CFG_TARGET_ARCH is unset or names another arch.
fn is_riscv32() -> bool {
    matches!(
        std::env::var("CARGO_CFG_TARGET_ARCH").as_deref(),
        Ok("riscv32")
    )
}
// True when the build target is 64-bit RISC-V (native or cross-compiled).
// Returns false when CARGO_CFG_TARGET_ARCH is unset or names another arch.
fn is_riscv64() -> bool {
    matches!(
        std::env::var("CARGO_CFG_TARGET_ARCH").as_deref(),
        Ok("riscv64")
    )
}
fn endianness() -> String {
let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
assert!(endianness == "little" || endianness == "big");
@ -239,15 +260,33 @@ fn build_neon_c_intrinsics() {
build.compile("blake3_neon");
}
// Compiles the C intrinsics implementation of the RVV backend, selecting the
// -march string that matches the target's XLEN. The riscv32/riscv64 checks
// are mutually exclusive, so an else-if chain is equivalent to two ifs.
fn build_rvv_c_intrinsics() {
    let mut build = new_build();
    build.file("c/blake3_rvv.c");
    if is_riscv32() {
        build.flag("-march=rv32gcv1p0");
    } else if is_riscv64() {
        build.flag("-march=rv64gcv1p0");
    }
    build.compile("blake3_rvv");
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
if is_pure() && is_neon() {
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
}
if is_no_neon() && is_neon() {
panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\".");
}
if is_pure() && is_rvv() {
panic!("It doesn't make sense to enable both \"pure\" and \"rvv\".");
}
if is_no_rvv() && is_rvv() {
panic!("It doesn't make sense to enable both \"no_rvv\" and \"rvv\".");
}
if is_x86_64() || is_x86_32() {
let support = c_compiler_support();
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
@ -278,6 +317,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
build_neon_c_intrinsics();
}
if (is_riscv32() || is_riscv64()) && is_rvv() {
println!("cargo:rustc-cfg=blake3_rvv");
build_rvv_c_intrinsics();
}
// The `cc` crate doesn't automatically emit rerun-if directives for the
// environment variables it supports, in particular for $CC. We expect to
// do a lot of benchmarking across different compilers, so we explicitly

View File

@ -33,6 +33,7 @@ include(GNUInstallDirs)
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
set(BLAKE3_RISCV64_NAMES riscv64)
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
@ -48,6 +49,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
set(BLAKE3_CFLAGS_RVV_RISCV64 "-march=rv64gcv1p0" CACHE STRING "the compiler flags to enable RVV for riscv64")
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
@ -95,8 +97,10 @@ macro(BLAKE3_DISABLE_SIMD)
set(BLAKE3_SIMD_AMD64_ASM OFF)
set(BLAKE3_SIMD_X86_INTRINSICS OFF)
set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
set(BLAKE3_VLA_RVV_INTRINSICS OFF)
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_USE_RVV
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
@ -179,6 +183,13 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_RISCV64_NAMES
        OR BLAKE3_USE_RVV_INTRINSICS))
  set(BLAKE3_VLA_RVV_INTRINSICS ON)
  target_sources(blake3 PRIVATE
    blake3_rvv.c
  )
  # The dispatch code and declarations are gated on `#if BLAKE3_USE_RVV == 1`;
  # without this definition the RVV implementation would compile but never be
  # selected.
  target_compile_definitions(blake3 PRIVATE BLAKE3_USE_RVV=1)
  set_source_files_properties(blake3_rvv.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_RVV_RISCV64}")
else()
BLAKE3_DISABLE_SIMD()
endif()
@ -216,4 +227,5 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
add_feature_info("RISC-V RVV intrinsics" BLAKE3_VLA_RVV_INTRINSICS "The library uses RISC-V RVV intrinsics.")
feature_summary(WHAT ENABLED_FEATURES)

View File

@ -192,6 +192,15 @@ fn test_compress_avx512() {
);
}
#[test]
#[cfg(feature = "rvv")]
// Exercises the RVV compression functions against the portable reference.
// Fixed to reference the `ffi::rvv` module: the copy-pasted `ffi::x86` path
// does not contain the *_rvv symbols (cf. test_hash_many_rvv below).
fn test_compress_rvv() {
    test_compress_fn(
        crate::ffi::rvv::blake3_compress_in_place_rvv,
        crate::ffi::rvv::blake3_compress_xof_rvv,
    );
}
type HashManyFn = unsafe extern "C" fn(
inputs: *const *const u8,
num_inputs: usize,
@ -359,6 +368,12 @@ fn test_hash_many_neon() {
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
}
#[test]
#[cfg(feature = "rvv")]
fn test_hash_many_rvv() {
    // RVV does not participate in dynamic feature detection; enabling the
    // "rvv" feature asserts the target supports it, so no runtime check here.
    test_hash_many_fn(crate::ffi::rvv::blake3_hash_many_rvv);
}
#[test]
fn test_compare_reference_impl() {
const OUT: usize = 303; // more than 64, not a multiple of 4

View File

@ -267,6 +267,11 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
return;
#endif
#if BLAKE3_USE_RVV == 1
  blake3_hash_many_rvv(inputs, num_inputs, blocks, key, counter,
                       increment_counter, flags, flags_start, flags_end, out);
  // Without this return the call would fall through to the portable
  // implementation below and hash everything twice (cf. the NEON branch
  // above, which returns after dispatching).
  return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);

View File

@ -281,5 +281,22 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
uint8_t flags_end, uint8_t *out);
#endif
#if BLAKE3_USE_RVV == 1
// RVV (RISC-V Vector) backend, implemented in c/blake3_rvv.c. These mirror
// the signatures of the portable/x86/NEON variants declared above.
void blake3_compress_in_place_rvv(uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags);

void blake3_compress_xof_rvv(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags,
                             uint8_t out[64]);

void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
                          size_t blocks, const uint32_t key[8],
                          uint64_t counter, bool increment_counter,
                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
                          uint8_t *out);
#endif
#endif /* BLAKE3_IMPL_H */

300
c/blake3_rvv.c Normal file
View File

@ -0,0 +1,300 @@
#include "blake3_impl.h"
#include <riscv_vector.h>
// Rotate the first `vl` 32-bit *lanes* of `src` right by `shr` positions
// (an element-wise rotation, not a bitwise one): dst[i] = src[(i + shr) % vl].
INLINE vuint32m1_t rvv_vror_v(vuint32m1_t src, size_t shr, size_t vl) {
  vuint32m1_t dst = __riscv_vundefined_u32m1();
  size_t delta = vl - shr;
  // Lanes [delta, vl) of dst receive src[0, shr).
  dst = __riscv_vslideup(dst, src, delta, vl); // VL = vl
  // Lanes [0, delta) of dst receive src[shr, vl); tail-undisturbed (_tu)
  // preserves the lanes written by the slideup above.
  dst = __riscv_vslidedown_tu(dst, src, shr, delta); // VL = delta
  return dst;
}
// Bitwise rotate-right of each 32-bit lane by `shr` bits.
// NOTE: assumes 0 < shr < 32 (callers pass 16/12/8/7); shr == 0 would make
// shl == 32, and vector shifts only use the low log2(SEW) shift bits.
INLINE vuint32m1_t rvv_vror_s(vuint32m1_t src, size_t shr, size_t vl) {
  size_t shl = sizeof(uint32_t) * 8 - shr;
  vuint32m1_t op0 = __riscv_vsrl(src, shr, vl);
  vuint32m1_t op1 = __riscv_vsll(src, shl, vl);
  return __riscv_vor(op0, op1, vl);
}
// NOTE: See the following for several approaches to transposing matrices with RVV:
// https://fprox.substack.com/p/transposing-a-matrix-using-risc-v
//
// This version of transpose_nxn uses the strided store approach, which easily scales to NxN
// matrices. For LMUL=1, the data at the above link suggests that this approach may be less
// efficient than a scalar implementation for large N > 16. However, for LMUL=8, the data suggests
// this vectorized approach may be more efficient than a scalar implementation at least up to N=512.
//
// If we assume a typical vector size of 128b (the minimum; no idea how representative this will
// be), then LMUL=8 gives us vector groups of 1024b (x 4, since RVV has 32 registers). This should
// let us process 32 state vectors at a time (c.f., 8 for AVX2, 16 for AVX512). Wider base vector
// registers would increase this further, of course.
//
// One of the more efficient alternative approaches would be to use segmented loads/stores. However,
// this would limit us to 8-row matrices, based on the currently supported vector tuple-sizes.
//
// With larger vector registers (or larger LMUL), the in-register masked slide approach may also be
// worth exploring.
INLINE
// Transpose an n x n matrix of u32 from `src` into `dst` using vector loads
// and strided stores (see the NOTE above for the approach trade-offs).
// `src` and `dst` must not alias.
void rvv_transpose_nxn(uint32_t *dst, uint32_t *src, size_t n) {
  for (size_t row_idx = 0; row_idx < n; row_idx += 1) {
    size_t avl = n;
    uint32_t *row_src = src + row_idx * n;
    uint32_t *row_dst = dst + row_idx;
    while (avl > 0) {
      // Recompute vl on every pass: when n is not a multiple of VLMAX the
      // final pass covers fewer elements, and reusing the first-pass vl
      // would over-read the row and underflow avl.
      size_t vl = __riscv_vsetvl_e32m8(avl);
      vuint32m8_t row = __riscv_vle32_v_u32m8(row_src, vl);
      // Scatter the row as a column of dst: stride of one full row.
      __riscv_vsse32(row_dst, sizeof(uint32_t) * n, row, vl);
      avl -= vl;
      row_src += vl;
      row_dst += vl * n;
    }
  }
}
INLINE
// Interleave the low halves of two 4-lane vectors, like SSE _mm_unpacklo_epi32:
// {a0,a1,a2,a3}, {b0,b1,b2,b3} -> {a0,b0,a1,b1}.
vuint32m1_t rvv_zip_lo_u32(vuint32m1_t op0, vuint32m1_t op1) {
  // 0xAAAA = ...1010b as a mask register: selects the odd lanes.
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslideup_tu(op0, op0, 1, 4); // VL = 4 -> {a0,a0,a1,a2}
  op1 = __riscv_vslideup_tu(op1, op1, 2, 4); // VL = 4 -> {b0,b1,b0,b1}
  op1 = __riscv_vslideup_tu(op1, op1, 1, 2); // VL = 2 -> {b0,b0,b0,b1}
  // Odd lanes from op1, even lanes from op0 -> {a0,b0,a1,b1}.
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
INLINE
// Interleave the high halves of two 4-lane vectors, like SSE _mm_unpackhi_epi32:
// {a0,a1,a2,a3}, {b0,b1,b2,b3} -> {a2,b2,a3,b3}.
vuint32m1_t rvv_zip_hi_u32(vuint32m1_t op0, vuint32m1_t op1) {
  // 0xAAAA = ...1010b as a mask register: selects the odd lanes.
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 4); // VL = 4 -> {a1,a2,a3,*}
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 2); // VL = 2 -> {a2,a3,a3,*}
  op1 = __riscv_vslidedown_tu(op1, op1, 1, 2); // VL = 2 -> {b1,b2,b2,b3}
  // Even lanes from op0 (a2,a3), odd lanes from op1 (b2,b3); the slid-in
  // don't-care lanes (*) are never selected.
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
INLINE
// Concatenate the low 64-bit halves of two 128-bit vectors, like SSE
// _mm_unpacklo_epi64: result = {op0.lo64, op1.lo64}.
vuint32m1_t rvv_zip_lo_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  // Slide op1's low element into lane 1; lane 0 keeps op0's low element.
  vuint64m1_t dst_u64 = __riscv_vslideup_tu(op0_u64, op1_u64, 1, 2); // VL = 2
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
INLINE
// Concatenate the high 64-bit halves of two 128-bit vectors, like SSE
// _mm_unpackhi_epi64: result = {op0.hi64, op1.hi64}.
vuint32m1_t rvv_zip_hi_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  // Lane 0 <- op0's high element (slide down); lane 1 keeps op1's high
  // element via tail-undisturbed.
  vuint64m1_t dst_u64 = __riscv_vslidedown_tu(op1_u64, op0_u64, 1, 1); // VL = 1
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
INLINE
// Apply the same 4-lane permutation `tab` to both inputs and combine:
// lanes 0-1 of the result come from the permuted op0, lanes 2-3 from the
// permuted op1 (i.e. {op0[tab[0]], op0[tab[1]], op1[tab[2]], op1[tab[3]]}).
vuint32m1_t rvv_shuffle_zip_lo_hi_u256(vuint32m1_t op0, vuint32m1_t op1,
                                       vuint32m1_t tab) {
  op1 = __riscv_vrgather_vv_u32m1(op1, tab, 4); // VL = 4
  // Tail-undisturbed with VL = 2 overwrites only lanes 0-1, keeping op1's
  // permuted lanes 2-3.
  op0 = __riscv_vrgather_vv_u32m1_tu(op1, op0, tab, 2); // VL = 2
  return op0;
}
INLINE
// Permute the 4 lanes of `src` by index vector `tab`, like SSE
// _mm_shuffle_epi32 with a variable control.
vuint32m1_t rvv_shuffle_u128(vuint32m1_t src, vuint32m1_t tab) {
  return __riscv_vrgather_vv_u32m1(src, tab, 4); // VL = 4
}
INLINE
// Select 16-bit lanes from op1 where the corresponding `mask` bit is set,
// otherwise from op0 — analogous to SSE4.1 _mm_blend_epi16.
// NOTE(review): VL = 4 covers only the low four 16-bit lanes (64 bits) of a
// 128-bit vector; confirm VL = 8 wasn't intended before relying on the upper
// lanes.
vuint32m1_t rvv_blend_u16(vuint32m1_t op0, vuint32m1_t op1, uint16_t mask) {
  vuint16m1_t op0_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op0);
  vuint16m1_t op1_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op1);
  vbool16_t mask_u16 = __riscv_vreinterpret_v_u16m1_b16(__riscv_vmv_v_x_u16m1(mask, 1));
  vuint16m1_t dst = __riscv_vmerge_vvm_u16m1(op0_u16, op1_u16, mask_u16, 4); // VL = 4
  return __riscv_vreinterpret_v_u16m1_u32m1(dst);
}
/*
* ----------------------------------------------------------------------------
* compress_rvv
* ----------------------------------------------------------------------------
*/
// First half of the BLAKE3 G function, applied lane-wise across the four
// state rows with message word vector `m`. The 16/12-bit rotations match the
// first two rotations of G in the BLAKE3 spec.
INLINE void g1(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 16, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 12, vl);
}
// Second half of the BLAKE3 G function, applied lane-wise across the four
// state rows with message word vector `m`. The 8/7-bit rotations match the
// last two rotations of G in the BLAKE3 spec.
INLINE void g2(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 8, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 7, vl);
}
// Rotate the lanes of rows 0, 3, 2 (by 3, 2, 1 positions respectively) so
// that the following G application operates on the state diagonals instead of
// the columns — cf. the diagonalize step in the other SIMD backends.
INLINE void diagonalize(vuint32m1_t *row0, vuint32m1_t *row2, vuint32m1_t *row3,
                        size_t vl) {
  *row0 = rvv_vror_v(*row0, 3, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 1, vl);
}
// Invert diagonalize(): each row's rotation here plus its diagonalize
// rotation sums to 4, i.e. a full lane cycle, restoring column order.
INLINE void undiagonalize(vuint32m1_t *row0, vuint32m1_t *row2,
                          vuint32m1_t *row3, size_t vl) {
  *row0 = rvv_vror_v(*row0, 1, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 3, vl);
}
// Placeholder for the state-setup + 7-round compression core (presumably
// mirroring the compress_pre helpers in the other backends — TODO confirm).
// Currently unimplemented; every parameter is voided to silence unused
// warnings, so this does nothing yet.
INLINE void compress_pre(vuint32m1x4_t *rows, const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         size_t vl) {
  (void)rows;
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)vl;
  // Scratch notes (message-permutation / shuffle schedule to implement):
  // 0, 0, 3, 3
  // 0, 1, 3, 2
  // 1, 3, 2, 0
  // 2, 0, 2, 0
  // 3, 1, 1, 2
  // 3, 1, 3, 1
  // 3, 3, 2, 2
  // 2, 1, 0, 3 (rotate)
  // 1, 3, 2, 0 (rotate)
  // 0, 3, 2, 1 (rotate)
}
// NOTE(review): unimplemented placeholder — `out` is never written, so this
// does not yet produce a compression result. All parameters are voided to
// silence unused warnings.
void blake3_compress_xof_rvv(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags,
                             uint8_t out[64]) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)out;
}
// NOTE(review): unimplemented placeholder — `cv` is left unmodified, so
// callers (e.g. hash_one_rvv below) do not yet get a real compression.
void blake3_compress_in_place_rvv(uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
}
/*
* ----------------------------------------------------------------------------
* hash_vl_rvv
* ----------------------------------------------------------------------------
*/
// Empty placeholders for the vector-length-agnostic hashing pipeline; the
// names presumably mirror the round_fn / transpose_vecs / transpose_msg_vecs
// / load_counters helpers of the other SIMD backends — TODO confirm.
// NOTE(review): these currently have external linkage; consider marking them
// static/INLINE when implemented to avoid exporting stray symbols.
void round_fn_vl() {
  //
}

void transpose_vecs_vl() {
  //
}

void transpose_msg_vecs_vl() {
  //
}

void load_counters_vl() {
  //
}
// Hash `vl` inputs in parallel (one vector lane per input).
// NOTE(review): unimplemented placeholder — `out` is never written, so
// blake3_hash_many_rvv currently produces no output. All parameters are
// voided to silence unused warnings.
void blake3_hash_vl_rvv(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out,
                        size_t vl) {
  assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)inputs;
  (void)blocks;
  (void)key;
  (void)counter;
  (void)increment_counter;
  (void)flags;
  (void)flags_start;
  (void)flags_end;
  (void)out;
  (void)vl;
}
/*
* ----------------------------------------------------------------------------
* hash_many_rvv
* ----------------------------------------------------------------------------
*/
// Hash `blocks` contiguous 64-byte blocks of `input` into one chaining value,
// writing the 32-byte result to `out`. `flags_start` is OR-ed into the first
// block's flags and `flags_end` into the last block's.
INLINE void hash_one_rvv(const uint8_t *input, size_t blocks,
                         const uint32_t key[8], uint64_t counter, uint8_t flags,
                         uint8_t flags_start, uint8_t flags_end,
                         uint8_t out[BLAKE3_OUT_LEN]) {
  assert((uintptr_t)&input[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0);   // FIXME: alignment
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    // Apply flags_end exactly once, on the final block. (Previously this was
    // OR-ed in twice — via a ternary *and* this if — which was redundant.)
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_rvv(cv, input, BLAKE3_BLOCK_LEN, counter,
                                 block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}
// Hash `num_inputs` equal-length inputs (`blocks` 64-byte blocks each),
// processing them in vector-length-sized groups and writing BLAKE3_OUT_LEN
// bytes per input to `out`. When `increment_counter` is set, each input uses
// a counter one greater than the previous input's.
void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
                          size_t blocks, const uint32_t key[8],
                          uint64_t counter, bool increment_counter,
                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
                          uint8_t *out) {
  assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0);    // FIXME: alignment
  while (num_inputs > 0) {
    // Recompute vl every pass: the final group may be smaller than the first,
    // and reusing a stale vl would over-read `inputs` and underflow
    // `num_inputs` (the old for-loop set vl only once, in its init clause).
    size_t vl = __riscv_vsetvl_e32m1(num_inputs);
    blake3_hash_vl_rvv(inputs, blocks, key, counter, increment_counter, flags,
                       flags_start, flags_end, out, vl);
    num_inputs -= vl;
    inputs += vl;
    counter += increment_counter * vl;
    out = &out[vl * BLAKE3_OUT_LEN];
  }
}

113
src/ffi_rvv.rs Normal file
View File

@ -0,0 +1,113 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting RVV.
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    // Thin wrapper over the C implementation; the chaining value is updated
    // in place through the raw pointer.
    let cv_ptr = cv.as_mut_ptr();
    let block_ptr = block.as_ptr();
    ffi::blake3_compress_in_place_rvv(cv_ptr, block_ptr, block_len, counter, flags);
}
// Unsafe because this may only be called on platforms supporting RVV.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    // The C side writes the full 64-byte XOF output into `out`.
    let mut out = [0u8; 64];
    let cv_ptr = cv.as_ptr();
    let block_ptr = block.as_ptr();
    let out_ptr = out.as_mut_ptr();
    ffi::blake3_compress_xof_rvv(cv_ptr, block_ptr, block_len, counter, flags, out_ptr);
    out
}
// Unsafe because this may only be called on platforms supporting RVV.
pub unsafe fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // The Rust hash_many implementations do bounds checking on the `out`
    // array, but the C implementations don't. Even though this is an unsafe
    // function, assert the bounds here.
    assert!(out.len() >= inputs.len() * OUT_LEN);
    let num_inputs = inputs.len();
    let blocks_per_input = N / BLOCK_LEN;
    ffi::blake3_hash_many_rvv(
        inputs.as_ptr() as *const *const u8,
        num_inputs,
        blocks_per_input,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
}
// Raw declarations for the C implementations in c/blake3_rvv.c; see the safe
// wrappers above for the bounds-checked entry points.
pub mod ffi {
    extern "C" {
        pub fn blake3_compress_in_place_rvv(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        pub fn blake3_compress_xof_rvv(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        pub fn blake3_hash_many_rvv(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
#[cfg(test)]
mod test {
    use super::*;

    // This entire file is gated on feature="rvv", so RVV support is
    // assumed here.

    // TODO: test transpose?

    // Compares the RVV compress functions against the portable reference.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Compares the RVV hash_many against the portable reference.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}

View File

@ -66,6 +66,10 @@
//! enabling this feature will produce a binary that's not portable to CPUs
//! without NEON support.
//!
//! The RVV implementation requires the `rvv` feature for RISC-V targets. Not
//! all RISC-V CPUs support RVV, and enabling this feature will produce a binary
//! that's not portable to CPUs without RVV support.
//!
//! The `traits-preview` feature enables implementations of traits from the
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
//! However, the traits aren't stable, and they're expected to change in
@ -114,6 +118,9 @@ mod avx512;
#[path = "ffi_neon.rs"]
mod neon;
mod portable;
#[cfg(blake3_rvv)]
#[path = "ffi_rvv.rs"]
mod rvv;
#[cfg(blake3_sse2_rust)]
#[path = "rust_sse2.rs"]
mod sse2;

View File

@ -51,6 +51,8 @@ pub enum Platform {
AVX512,
#[cfg(blake3_neon)]
NEON,
#[cfg(blake3_rvv)]
RVV,
}
impl Platform {
@ -80,6 +82,12 @@ impl Platform {
{
return Platform::NEON;
}
// We don't use dynamic feature detection for RVV. If the "rvv"
// feature is on, RVV is assumed to be supported.
#[cfg(blake3_rvv)]
{
return Platform::RVV;
}
Platform::Portable
}
@ -97,6 +105,8 @@ impl Platform {
Platform::AVX512 => 16,
#[cfg(blake3_neon)]
Platform::NEON => 4,
#[cfg(blake3_rvv)]
Platform::RVV => todo!(),
};
debug_assert!(degree <= MAX_SIMD_DEGREE);
degree
@ -131,6 +141,10 @@ impl Platform {
// No NEON compress_in_place() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
#[cfg(blake3_rvv)]
Platform::RVV => unsafe {
crate::rvv::compress_in_place(cv, block, block_len, counter, flags)
},
}
}
@ -163,6 +177,11 @@ impl Platform {
// No NEON compress_xof() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
// Assumed to be safe if the "rvv" feature is on.
#[cfg(blake3_rvv)]
Platform::RVV => unsafe {
crate::rvv::compress_xof(cv, block, block_len, counter, flags)
},
}
}
@ -269,6 +288,20 @@ impl Platform {
out,
)
},
// Assumed to be safe if the "rvv" feature is on.
#[cfg(blake3_rvv)]
Platform::RVV => unsafe {
crate::rvv::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
}
}
@ -320,6 +353,12 @@ impl Platform {
// Assumed to be safe if the "neon" feature is on.
Some(Self::NEON)
}
#[cfg(blake3_rvv)]
pub fn rvv() -> Option<Self> {
// Assumed to be safe if the "rvv" feature is on.
Some(Self::RVV)
}
}
// Note that AVX-512 is divided into multiple featuresets, and we use two of