mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-06-07 23:46:08 +02:00
Implement RVV backend
This commit is contained in:
parent
d60b753572
commit
9c468922bf
|
@ -18,6 +18,11 @@ default = ["std"]
|
|||
# implementation uses C intrinsics and requires a C compiler.
|
||||
neon = []
|
||||
|
||||
# The RVV implementation does not participate in dynamic feature detection,
|
||||
# which is currently x86-only. If "rvv" is on, RVV support is assumed. The
|
||||
# RVV implementation uses C intrinsics and requires a C compiler.
|
||||
rvv = []
|
||||
|
||||
# This crate uses libstd for std::io trait implementations, and also for
|
||||
# runtime CPU feature detection. This feature is enabled by default. If you use
|
||||
# --no-default-features, the only way to use the SIMD implementations in this
|
||||
|
|
|
@ -85,6 +85,14 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
#[cfg(blake3_rvv)]
// Renamed from a copy-pasted `bench_single_compression_avx512`, which
// collides with the existing AVX-512 bench of the same name.
fn bench_single_compression_rvv(b: &mut Bencher) {
    // RVV does not participate in dynamic feature detection; if the cfg is
    // set, support is assumed and `Platform::rvv()` returns `Some`.
    if let Some(platform) = Platform::rvv() {
        bench_single_compression_fn(b, platform);
    }
}
|
||||
|
||||
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
|
||||
let degree = platform.simd_degree();
|
||||
let mut inputs = Vec::new();
|
||||
|
@ -151,6 +159,14 @@ fn bench_many_chunks_neon(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
// Gate on `blake3_rvv` (emitted by build.rs) rather than `feature = "rvv"`:
// `Platform::rvv()` only exists under `cfg(blake3_rvv)`, so gating on the
// feature alone would fail to compile when the feature is enabled on a
// non-RISC-V target.
#[cfg(blake3_rvv)]
// Renamed from a copy-pasted `bench_many_chunks_neon`, which collides with
// the existing NEON bench of the same name.
fn bench_many_chunks_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_chunks_fn(b, platform);
    }
}
|
||||
|
||||
// TODO: When we get const generics we can unify this with the chunks code.
|
||||
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
|
||||
let degree = platform.simd_degree();
|
||||
|
@ -218,6 +234,14 @@ fn bench_many_parents_neon(b: &mut Bencher) {
|
|||
}
|
||||
}
|
||||
|
||||
#[bench]
// Gate on `blake3_rvv` (emitted by build.rs) rather than `feature = "rvv"`:
// `Platform::rvv()` only exists under `cfg(blake3_rvv)`, so this bench would
// not compile with the feature enabled on a non-RISC-V target.
#[cfg(blake3_rvv)]
fn bench_many_parents_rvv(b: &mut Bencher) {
    if let Some(platform) = Platform::rvv() {
        bench_many_parents_fn(b, platform);
    }
}
|
||||
|
||||
fn bench_atonce(b: &mut Bencher, len: usize) {
|
||||
let mut input = RandomInput::new(b, len);
|
||||
b.iter(|| blake3::hash(input.get()));
|
||||
|
|
46
build.rs
46
build.rs
|
@ -1,3 +1,4 @@
|
|||
use core::panic;
|
||||
use std::env;
|
||||
|
||||
fn defined(var: &str) -> bool {
|
||||
|
@ -21,6 +22,14 @@ fn is_no_neon() -> bool {
|
|||
defined("CARGO_FEATURE_NO_NEON")
|
||||
}
|
||||
|
||||
fn is_rvv() -> bool {
|
||||
cfg!(feature = "rvv")
|
||||
}
|
||||
|
||||
fn is_no_rvv() -> bool {
|
||||
cfg!(not(feature = "rvv"))
|
||||
}
|
||||
|
||||
fn is_ci() -> bool {
|
||||
defined("BLAKE3_CI")
|
||||
}
|
||||
|
@ -60,6 +69,18 @@ fn is_armv7() -> bool {
|
|||
target_components()[0] == "armv7"
|
||||
}
|
||||
|
||||
/// Whether the compilation target is 32-bit RISC-V.
fn is_riscv32() -> bool {
    match std::env::var("CARGO_CFG_TARGET_ARCH") {
        Ok(arch) => arch == "riscv32",
        Err(_) => false,
    }
}
|
||||
|
||||
/// Whether the compilation target is 64-bit RISC-V.
fn is_riscv64() -> bool {
    match std::env::var("CARGO_CFG_TARGET_ARCH") {
        Ok(arch) => arch == "riscv64",
        Err(_) => false,
    }
}
|
||||
|
||||
fn endianness() -> String {
|
||||
let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
|
||||
assert!(endianness == "little" || endianness == "big");
|
||||
|
@ -239,15 +260,33 @@ fn build_neon_c_intrinsics() {
|
|||
build.compile("blake3_neon");
|
||||
}
|
||||
|
||||
/// Compiles the C RVV intrinsics implementation (c/blake3_rvv.c).
///
/// The `-march` flags select the base ISA (`rv32gc`/`rv64gc`) plus version
/// 1.0 of the vector extension (`v1p0`); the resulting object requires
/// RVV-capable hardware at runtime (no dynamic detection is done).
fn build_rvv_c_intrinsics() {
    let mut build = new_build();
    build.file("c/blake3_rvv.c");
    if is_riscv32() {
        build.flag("-march=rv32gcv1p0");
    }
    if is_riscv64() {
        build.flag("-march=rv64gcv1p0");
    }
    build.compile("blake3_rvv");
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
if is_pure() && is_neon() {
|
||||
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
|
||||
}
|
||||
|
||||
if is_no_neon() && is_neon() {
|
||||
panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\".");
|
||||
}
|
||||
|
||||
if is_pure() && is_rvv() {
|
||||
panic!("It doesn't make sense to enable both \"pure\" and \"rvv\".");
|
||||
}
|
||||
if is_no_rvv() && is_rvv() {
|
||||
panic!("It doesn't make sense to enable both \"no_rvv\" and \"rvv\".");
|
||||
}
|
||||
|
||||
if is_x86_64() || is_x86_32() {
|
||||
let support = c_compiler_support();
|
||||
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
|
||||
|
@ -278,6 +317,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
build_neon_c_intrinsics();
|
||||
}
|
||||
|
||||
if (is_riscv32() || is_riscv64()) && is_rvv() {
|
||||
println!("cargo:rustc-cfg=blake3_rvv");
|
||||
build_rvv_c_intrinsics();
|
||||
}
|
||||
|
||||
// The `cc` crate doesn't automatically emit rerun-if directives for the
|
||||
// environment variables it supports, in particular for $CC. We expect to
|
||||
// do a lot of benchmarking across different compilers, so we explicitly
|
||||
|
|
|
@ -33,6 +33,7 @@ include(GNUInstallDirs)
|
|||
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
|
||||
set(BLAKE3_X86_NAMES i686 x86 X86)
|
||||
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
|
||||
set(BLAKE3_RISCV64_NAMES riscv64)
|
||||
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
|
||||
if(MSVC)
|
||||
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
|
||||
|
@ -48,6 +49,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
|
|||
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
|
||||
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
|
||||
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
|
||||
set(BLAKE3_CFLAGS_RVV_RISCV64 "-march=rv64gcv1p0" CACHE STRING "the compiler flags to enable RVV for riscv64")
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
|
||||
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
|
@ -95,8 +97,10 @@ macro(BLAKE3_DISABLE_SIMD)
|
|||
set(BLAKE3_SIMD_AMD64_ASM OFF)
|
||||
set(BLAKE3_SIMD_X86_INTRINSICS OFF)
|
||||
set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
|
||||
set(BLAKE3_VLA_RVV_INTRINSICS OFF)
|
||||
target_compile_definitions(blake3 PRIVATE
|
||||
BLAKE3_USE_NEON=0
|
||||
BLAKE3_USE_RVV=0
|
||||
BLAKE3_NO_SSE2
|
||||
BLAKE3_NO_SSE41
|
||||
BLAKE3_NO_AVX2
|
||||
|
@ -179,6 +183,13 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
|
|||
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
|
||||
endif()
|
||||
|
||||
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_RISCV64_NAMES
|
||||
OR BLAKE3_USE_RVV_INTRINSICS))
|
||||
set(BLAKE3_VLA_RVV_INTRINSICS ON)
|
||||
target_sources(blake3 PRIVATE
|
||||
blake3_rvv.c
|
||||
)
|
||||
set_source_files_properties(blake3_rvv.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_RVV_RISCV64}")
|
||||
else()
|
||||
BLAKE3_DISABLE_SIMD()
|
||||
endif()
|
||||
|
@ -216,4 +227,5 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
|
|||
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
|
||||
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
|
||||
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
|
||||
add_feature_info("RISC-V RVV intrinsics" BLAKE3_VLA_RVV_INTRINSICS "The library uses RISC-V RVV intrinsics.")
|
||||
feature_summary(WHAT ENABLED_FEATURES)
|
||||
|
|
|
@ -192,6 +192,15 @@ fn test_compress_avx512() {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
#[cfg(feature = "rvv")]
fn test_compress_rvv() {
    // Fixed copy-paste from the AVX-512 test above: the RVV bindings live in
    // `ffi::rvv` (as `test_hash_many_rvv` already uses), not `ffi::x86`,
    // which declares no `*_rvv` symbols.
    test_compress_fn(
        crate::ffi::rvv::blake3_compress_in_place_rvv,
        crate::ffi::rvv::blake3_compress_xof_rvv,
    );
}
|
||||
|
||||
type HashManyFn = unsafe extern "C" fn(
|
||||
inputs: *const *const u8,
|
||||
num_inputs: usize,
|
||||
|
@ -359,6 +368,12 @@ fn test_hash_many_neon() {
|
|||
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
|
||||
}
|
||||
|
||||
#[test]
#[cfg(feature = "rvv")]
// RVV support is assumed whenever the "rvv" feature is on; there is no
// runtime feature detection for RISC-V (x86-only), so this test will fault
// on hardware without RVV.
fn test_hash_many_rvv() {
    test_hash_many_fn(crate::ffi::rvv::blake3_hash_many_rvv);
}
|
||||
|
||||
#[test]
|
||||
fn test_compare_reference_impl() {
|
||||
const OUT: usize = 303; // more than 64, not a multiple of 4
|
||||
|
|
|
@ -267,6 +267,11 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|||
return;
|
||||
#endif
|
||||
|
||||
#if BLAKE3_USE_RVV == 1
  blake3_hash_many_rvv(inputs, num_inputs, blocks, key, counter,
                       increment_counter, flags, flags_start, flags_end, out);
  // Return like the other SIMD branches above do; without this, control
  // falls through and the portable implementation hashes everything again,
  // clobbering `out`.
  return;
#endif
|
||||
|
||||
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
|
||||
increment_counter, flags, flags_start, flags_end,
|
||||
out);
|
||||
|
|
|
@ -281,5 +281,22 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
|||
uint8_t flags_end, uint8_t *out);
|
||||
#endif
|
||||
|
||||
#if BLAKE3_USE_RVV == 1
|
||||
void blake3_compress_in_place_rvv(uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags);
|
||||
|
||||
void blake3_compress_xof_rvv(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[64]);
|
||||
|
||||
void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t *out);
|
||||
#endif
|
||||
|
||||
#endif /* BLAKE3_IMPL_H */
|
||||
|
|
|
@ -0,0 +1,300 @@
|
|||
#include "blake3_impl.h"
|
||||
#include <riscv_vector.h>
|
||||
|
||||
// Lane-wise rotate-right of the first `vl` 32-bit lanes of `src` by `shr`
// lane positions (rotates the *vector*, not the bits of each element — see
// rvv_vror_s for the bitwise variant). Built from two slides: the low `shr`
// lanes are slid up to the top, then the remaining lanes slid down into the
// bottom `delta` positions (tail-undisturbed keeps the slid-up part intact).
INLINE vuint32m1_t rvv_vror_v(vuint32m1_t src, size_t shr, size_t vl) {
  vuint32m1_t dst = __riscv_vundefined_u32m1();
  size_t delta = vl - shr;
  dst = __riscv_vslideup(dst, src, delta, vl); // VL = vl
  dst = __riscv_vslidedown_tu(dst, src, shr, delta); // VL = delta
  return dst;
}
|
||||
|
||||
// Bitwise rotate-right of each 32-bit element by `shr` bits, applied to the
// first `vl` lanes: (x >> shr) | (x << (32 - shr)).
INLINE vuint32m1_t rvv_vror_s(vuint32m1_t src, size_t shr, size_t vl) {
  size_t shl = sizeof(uint32_t) * 8 - shr;
  vuint32m1_t hi_part = __riscv_vsll(src, shl, vl);
  vuint32m1_t lo_part = __riscv_vsrl(src, shr, vl);
  return __riscv_vor(lo_part, hi_part, vl);
}
|
||||
|
||||
// NOTE: See the following for several approaches to transposing matrices with RVV:
|
||||
// https://fprox.substack.com/p/transposing-a-matrix-using-risc-v
|
||||
//
|
||||
// This version of transpose_nxn uses the strided store approach, which easily scales to NxN
|
||||
// matrices. For LMUL=1, the data at the above link suggests that this approach may be less
|
||||
// efficient than a scalar implementation for large N > 16. However, for LMUL=8, the data suggests
|
||||
// this vectorized approach may be more efficient than a scalar implementation at least up to N=512.
|
||||
//
|
||||
// If we assume a typical vector size of 128b (the minimum; no idea how representative this will
|
||||
// be), then LMUL=8 gives us vector groups of 1024b (x 4, since RVV has 32 registers). This should
|
||||
// let us process 32 state vectors at a time (c.f., 8 for AVX2, 16 for AVX512). Wider base vector
|
||||
// registers would increase this further, of course.
|
||||
//
|
||||
// One of the more efficient alternative approaches would be to use segmented loads/stores. However,
|
||||
// this would limit us to 8-row matrices, based on the currently supported vector tuple-sizes.
|
||||
//
|
||||
// With larger vector registers (or larger LMUL), the in-register masked slide approach may also be
|
||||
// worth exploring.
|
||||
|
||||
INLINE
|
||||
void rvv_transpose_nxn(uint32_t *dst, uint32_t *src, size_t n) {
|
||||
for (size_t row_idx = 0; row_idx < n; row_idx += 1) {
|
||||
size_t avl = n;
|
||||
uint32_t *row_src = src + row_idx * n;
|
||||
uint32_t *row_dst = dst + row_idx;
|
||||
for (
|
||||
/* clang-format off */
|
||||
size_t vl = __riscv_vsetvl_e32m8(avl);
|
||||
0 < avl;
|
||||
avl -= vl,
|
||||
row_src += vl,
|
||||
row_dst += vl * n
|
||||
/* clang-format on */
|
||||
) {
|
||||
vuint32m8_t row = __riscv_vle32_v_u32m8(row_src, vl);
|
||||
__riscv_vsse32(row_dst, sizeof(uint32_t) * n, row, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Interleaves the low lanes of two 4-lane u32 vectors (the RVV analogue of
// SSE's _mm_unpacklo_epi32): odd result lanes are taken from `op1` via the
// merge mask, after slides move both operands' low lanes into position.
// NOTE(review): the mask source is built with vmv(0xAAAA, VL=1); presumably
// only the low mask bits (0b...1010 -> odd lanes) are consumed at VL=4 —
// confirm against the mask-register layout.
INLINE
vuint32m1_t rvv_zip_lo_u32(vuint32m1_t op0, vuint32m1_t op1) {
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslideup_tu(op0, op0, 1, 4); // VL = 4
  op1 = __riscv_vslideup_tu(op1, op1, 2, 4); // VL = 4
  op1 = __riscv_vslideup_tu(op1, op1, 1, 2); // VL = 2
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
|
||||
|
||||
// Interleaves the high lanes of two 4-lane u32 vectors (the RVV analogue of
// SSE's _mm_unpackhi_epi32): slides move both operands' high lanes down into
// position, then odd result lanes are merged in from `op1`.
// NOTE(review): shares rvv_zip_lo_u32's vmv(0xAAAA, VL=1) mask construction —
// confirm the mask-register layout assumption there applies here too.
INLINE
vuint32m1_t rvv_zip_hi_u32(vuint32m1_t op0, vuint32m1_t op1) {
  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 4); // VL = 4
  op0 = __riscv_vslidedown_tu(op0, op0, 1, 2); // VL = 2
  op1 = __riscv_vslidedown_tu(op1, op1, 1, 2); // VL = 2
  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4); // VL = 4
}
|
||||
|
||||
// Concatenates the low 64-bit halves of two vectors, viewed as u64 lanes:
// result = { op0[63:0], op1[63:0] } (analogue of _mm_unpacklo_epi64). The
// slide-up with VL=2 places op1's lane 0 into result lane 1.
INLINE
vuint32m1_t rvv_zip_lo_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  vuint64m1_t dst_u64 = __riscv_vslideup_tu(op0_u64, op1_u64, 1, 2); // VL = 2
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
|
||||
|
||||
// Concatenates the high 64-bit halves of two vectors, viewed as u64 lanes:
// result = { op0[127:64], op1[127:64] } (analogue of _mm_unpackhi_epi64).
// The tail-undisturbed slide-down with VL=1 writes op0's lane 1 into result
// lane 0 while leaving op1's lane 1 untouched in result lane 1.
INLINE
vuint32m1_t rvv_zip_hi_u64(vuint32m1_t op0, vuint32m1_t op1) {
  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
  vuint64m1_t dst_u64 = __riscv_vslidedown_tu(op1_u64, op0_u64, 1, 1); // VL = 1
  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
}
|
||||
|
||||
// Gathers lanes of both operands through the index table `tab`, taking the
// low two result lanes from the `op0` gather and the high two from the `op1`
// gather: op1 is permuted at VL=4, then op0's permutation overwrites lanes
// 0..1 with a tail-undisturbed gather at VL=2.
INLINE
vuint32m1_t rvv_shuffle_zip_lo_hi_u256(vuint32m1_t op0, vuint32m1_t op1,
                                       vuint32m1_t tab) {
  op1 = __riscv_vrgather_vv_u32m1(op1, tab, 4); // VL = 4
  op0 = __riscv_vrgather_vv_u32m1_tu(op1, op0, tab, 2); // VL = 2
  return op0;
}
|
||||
|
||||
// Arbitrary 4-lane permutation of `src` by the index table `tab`
// (analogue of _mm_shuffle_epi32 with a vector-supplied control).
INLINE
vuint32m1_t rvv_shuffle_u128(vuint32m1_t src, vuint32m1_t tab) {
  return __riscv_vrgather_vv_u32m1(src, tab, 4); // VL = 4
}
|
||||
|
||||
// Blends 16-bit lanes of two vectors by a bitmask (analogue of
// _mm_blend_epi16): mask bit i set selects op1's u16 lane i.
// NOTE(review): the merge runs at VL=4 over u16 elements, i.e. only the low
// 64 bits are blended; a full 128-bit blend would need VL=8 — confirm
// whether callers rely on the upper lanes.
INLINE
vuint32m1_t rvv_blend_u16(vuint32m1_t op0, vuint32m1_t op1, uint16_t mask) {
  vuint16m1_t op0_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op0);
  vuint16m1_t op1_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op1);
  vbool16_t mask_u16 = __riscv_vreinterpret_v_u16m1_b16(__riscv_vmv_v_x_u16m1(mask, 1));
  vuint16m1_t dst = __riscv_vmerge_vvm_u16m1(op0_u16, op1_u16, mask_u16, 4); // VL = 4
  return __riscv_vreinterpret_v_u16m1_u32m1(dst);
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* compress_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// First half of the BLAKE3 G function, applied column-wise across the four
// state rows: mix the message word into row0, then xor-rotate row3 by 16 and
// row1 by 12 (the 16/12 rotation pair).
INLINE void g1(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 16, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 12, vl);
}
|
||||
|
||||
// Second half of the BLAKE3 G function: identical structure to g1 but with
// the 8/7 rotation pair, completing one column/diagonal mixing step.
INLINE void g2(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
               vuint32m1_t *row3, vuint32m1_t m, size_t vl) {
  *row0 = __riscv_vadd(*row0, m, vl);
  *row0 = __riscv_vadd(*row0, *row1, vl);
  *row3 = __riscv_vxor(*row3, *row0, vl);
  *row3 = rvv_vror_s(*row3, 8, vl);
  *row2 = __riscv_vadd(*row2, *row3, vl);
  *row1 = __riscv_vxor(*row1, *row2, vl);
  *row1 = rvv_vror_s(*row1, 7, vl);
}
|
||||
|
||||
// Lane-rotates rows 0, 2 and 3 so the next G application mixes diagonals
// instead of columns (row1 stays fixed, as in the SSE implementations).
INLINE void diagonalize(vuint32m1_t *row0, vuint32m1_t *row2, vuint32m1_t *row3,
                        size_t vl) {
  *row0 = rvv_vror_v(*row0, 3, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 1, vl);
}
|
||||
|
||||
// Inverse of diagonalize: rotates rows 0, 2 and 3 by the complementary lane
// counts (1/2/3) to restore column alignment after the diagonal round.
INLINE void undiagonalize(vuint32m1_t *row0, vuint32m1_t *row2,
                          vuint32m1_t *row3, size_t vl) {
  *row0 = rvv_vror_v(*row0, 1, vl);
  *row3 = rvv_vror_v(*row3, 2, vl);
  *row2 = rvv_vror_v(*row2, 3, vl);
}
|
||||
|
||||
// Stub: will load the state/message into `rows` and run the seven rounds,
// mirroring the compress_pre of the other SIMD backends. The (void) casts
// silence unused-parameter warnings until the body lands.
INLINE void compress_pre(vuint32m1x4_t *rows, const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         size_t vl) {
  (void)rows;
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)vl;

  // NOTE(review): the index tables below are presumably working notes for
  // the message-schedule permutation / shuffle constants — confirm once the
  // round function is implemented.
  // 0, 0, 3, 3
  // 0, 1, 3, 2
  // 1, 3, 2, 0
  // 2, 0, 2, 0
  // 3, 1, 1, 2
  // 3, 1, 3, 1
  // 3, 3, 2, 2

  // 2, 1, 0, 3 (rotate)
  // 1, 3, 2, 0 (rotate)
  // 0, 3, 2, 1 (rotate)
}
|
||||
|
||||
// Stub: currently only asserts argument alignment and returns without
// writing `out`. NOTE(review): until implemented, any caller dispatching to
// this produces garbage output — keep it out of the dispatch path.
void blake3_compress_xof_rvv(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags,
                             uint8_t out[64]) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
  (void)out;
}
|
||||
|
||||
// Stub: currently only asserts block alignment and leaves `cv` unmodified.
// NOTE(review): hash_one_rvv already calls this, so chaining values are not
// actually updated until this is implemented.
void blake3_compress_in_place_rvv(uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags) {
  assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)cv;
  (void)block;
  (void)block_len;
  (void)counter;
  (void)flags;
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash_vl_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// Placeholder stubs for the vector-length-agnostic hashing kernel, mirroring
// the round_fn / transpose_vecs / transpose_msg_vecs / load_counters helpers
// of the other SIMD backends. Bodies to be filled in as the implementation
// lands.
void round_fn_vl() {
  //
}

void transpose_vecs_vl() {
  //
}

void transpose_msg_vecs_vl() {
  //
}

void load_counters_vl() {
  //
}
|
||||
|
||||
// Stub for the batched kernel: hashes `vl` inputs at a time once
// implemented. Currently only asserts pointer alignment and returns without
// touching `out`. NOTE(review): blake3_hash_many_rvv already loops over this,
// so its output is not yet meaningful.
void blake3_hash_vl_rvv(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out,
                        size_t vl) {
  assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
  (void)inputs;
  (void)blocks;
  (void)key;
  (void)counter;
  (void)increment_counter;
  (void)flags;
  (void)flags_start;
  (void)flags_end;
  (void)out;
  (void)vl;
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash_many_rvv
|
||||
* ----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
INLINE void hash_one_rvv(const uint8_t *input, size_t blocks,
|
||||
const uint32_t key[8], uint64_t counter, uint8_t flags,
|
||||
uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t out[BLAKE3_OUT_LEN]) {
|
||||
assert((uintptr_t)&input[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
uint32_t cv[8];
|
||||
memcpy(cv, key, BLAKE3_KEY_LEN);
|
||||
uint8_t block_flags = flags | flags_start;
|
||||
while (blocks > 0) {
|
||||
block_flags |= blocks == 1 ? flags_end : 0;
|
||||
if (blocks == 1) {
|
||||
block_flags |= flags_end;
|
||||
}
|
||||
blake3_compress_in_place_rvv(cv, input, BLAKE3_BLOCK_LEN, counter,
|
||||
block_flags);
|
||||
input = &input[BLAKE3_BLOCK_LEN];
|
||||
blocks -= 1;
|
||||
block_flags = flags;
|
||||
}
|
||||
memcpy(out, cv, BLAKE3_OUT_LEN);
|
||||
}
|
||||
|
||||
void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
|
||||
uint8_t *out) {
|
||||
assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment
|
||||
for (
|
||||
/* clang-format off */
|
||||
size_t vl = __riscv_vsetvl_e32m1(num_inputs);
|
||||
num_inputs > 0;
|
||||
num_inputs -= vl,
|
||||
inputs += vl,
|
||||
counter += increment_counter * vl,
|
||||
out = &out[vl * BLAKE3_OUT_LEN]
|
||||
/* clang-format on */
|
||||
) {
|
||||
blake3_hash_vl_rvv(inputs, blocks, key, counter, increment_counter, flags,
|
||||
flags_start, flags_end, out, vl);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Compresses one block into `cv` in place via the C RVV implementation
/// (`blake3_compress_in_place_rvv`).
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    ffi::blake3_compress_in_place_rvv(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Compresses one block via the C RVV implementation and returns the full
/// 64 bytes of extended (XOF) output.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    // The C function writes all 64 bytes of `out`.
    let mut out = [0u8; 64];
    ffi::blake3_compress_xof_rvv(
        cv.as_ptr(),
        block.as_ptr(),
        block_len,
        counter,
        flags,
        out.as_mut_ptr(),
    );
    out
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting RVV.
/// Hashes a batch of equal-length inputs with the C RVV implementation,
/// writing `OUT_LEN` bytes of output per input into `out`.
///
/// NOTE(review): `N / BLOCK_LEN` truncates, so this assumes `N` is a
/// multiple of `BLOCK_LEN` — confirm against call sites.
pub unsafe fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // The Rust hash_many implementations do bounds checking on the `out`
    // array, but the C implementations don't. Even though this is an unsafe
    // function, assert the bounds here.
    assert!(out.len() >= inputs.len() * OUT_LEN);
    ffi::blake3_hash_many_rvv(
        inputs.as_ptr() as *const *const u8,
        inputs.len(),
        N / BLOCK_LEN,
        key.as_ptr(),
        counter,
        increment_counter.yes(),
        flags,
        flags_start,
        flags_end,
        out.as_mut_ptr(),
    )
}
|
||||
|
||||
/// Raw FFI declarations for the C RVV backend (c/blake3_rvv.c). These must
/// stay in sync with the prototypes guarded by `BLAKE3_USE_RVV` in
/// c/blake3_impl.h.
pub mod ffi {
    extern "C" {
        /// Compresses one 64-byte block into `cv` in place.
        pub fn blake3_compress_in_place_rvv(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        /// Compresses one block and writes 64 bytes of extended output to
        /// `out`.
        pub fn blake3_compress_xof_rvv(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        /// Hashes `num_inputs` inputs of `blocks` blocks each, writing 32
        /// bytes per input to `out`.
        pub fn blake3_hash_many_rvv(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    // This entire file is gated on feature="rvv", so RVV support is
    // assumed here.

    // TODO: test transpose?

    // Checks compress_in_place/compress_xof against the shared test vectors.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Exercises hash_many through the shared batched-hash test harness.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
|
|
@ -66,6 +66,10 @@
|
|||
//! enabling this feature will produce a binary that's not portable to CPUs
|
||||
//! without NEON support.
|
||||
//!
|
||||
//! The RVV implementation requires the `rvv` feature for RISC-V targets. Not
|
||||
//! all RISC-V CPUs support RVV, and enabling this feature will produce a binary
|
||||
//! that's not portable to CPUs without RVV support.
|
||||
//!
|
||||
//! The `traits-preview` feature enables implementations of traits from the
|
||||
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
|
||||
//! However, the traits aren't stable, and they're expected to change in
|
||||
|
@ -114,6 +118,9 @@ mod avx512;
|
|||
#[path = "ffi_neon.rs"]
|
||||
mod neon;
|
||||
mod portable;
|
||||
#[cfg(blake3_rvv)]
|
||||
#[path = "ffi_rvv.rs"]
|
||||
mod rvv;
|
||||
#[cfg(blake3_sse2_rust)]
|
||||
#[path = "rust_sse2.rs"]
|
||||
mod sse2;
|
||||
|
|
|
@ -51,6 +51,8 @@ pub enum Platform {
|
|||
AVX512,
|
||||
#[cfg(blake3_neon)]
|
||||
NEON,
|
||||
#[cfg(blake3_rvv)]
|
||||
RVV,
|
||||
}
|
||||
|
||||
impl Platform {
|
||||
|
@ -80,6 +82,12 @@ impl Platform {
|
|||
{
|
||||
return Platform::NEON;
|
||||
}
|
||||
// We don't use dynamic feature detection for RVV. If the "rvv"
|
||||
// feature is on, RVV is assumed to be supported.
|
||||
#[cfg(blake3_rvv)]
|
||||
{
|
||||
return Platform::RVV;
|
||||
}
|
||||
Platform::Portable
|
||||
}
|
||||
|
||||
|
@ -97,6 +105,8 @@ impl Platform {
|
|||
Platform::AVX512 => 16,
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => 4,
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => todo!(),
|
||||
};
|
||||
debug_assert!(degree <= MAX_SIMD_DEGREE);
|
||||
degree
|
||||
|
@ -131,6 +141,10 @@ impl Platform {
|
|||
// No NEON compress_in_place() implementation yet.
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::compress_in_place(cv, block, block_len, counter, flags)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -163,6 +177,11 @@ impl Platform {
|
|||
// No NEON compress_xof() implementation yet.
|
||||
#[cfg(blake3_neon)]
|
||||
Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
|
||||
// Assumed to be safe if the "rvv" feature is on.
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::compress_xof(cv, block, block_len, counter, flags)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -269,6 +288,20 @@ impl Platform {
|
|||
out,
|
||||
)
|
||||
},
|
||||
// Assumed to be safe if the "rvv" feature is on.
|
||||
#[cfg(blake3_rvv)]
|
||||
Platform::RVV => unsafe {
|
||||
crate::rvv::hash_many(
|
||||
inputs,
|
||||
key,
|
||||
counter,
|
||||
increment_counter,
|
||||
flags,
|
||||
flags_start,
|
||||
flags_end,
|
||||
out,
|
||||
)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -320,6 +353,12 @@ impl Platform {
|
|||
// Assumed to be safe if the "neon" feature is on.
|
||||
Some(Self::NEON)
|
||||
}
|
||||
|
||||
#[cfg(blake3_rvv)]
/// Returns the RVV platform. Always `Some` when compiled in: RVV does not
/// participate in dynamic feature detection, so enabling the "rvv" feature
/// asserts that the target hardware supports it.
pub fn rvv() -> Option<Self> {
    // Assumed to be safe if the "rvv" feature is on.
    Some(Self::RVV)
}
|
||||
}
|
||||
|
||||
// Note that AVX-512 is divided into multiple featuresets, and we use two of
|
||||
|
|
Loading…
Reference in New Issue