
blake3_guts_riscv64gcv_hash_parents

The test command is currently:

    CC="$HOME/llvm-project/build/bin/clang" \
    CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \
    CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUSTFLAGS="-L /usr/lib/gcc/riscv64-linux-gnu/12.2.0" \
    CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="$HOME/qemu/build/qemu-riscv64 -L /usr/riscv64-linux-gnu -cpu rv64,v=true,vext_spec=v1.0,vlen=512,zbb=true,x-zvbb=true" \
    cargo test --target riscv64gc-unknown-linux-gnu
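
Roughly: CC points the cc build-script crate at a locally built Clang (needed for the experimental Zvbb support), the RUSTFLAGS -L entry lets the GNU cross-linker find libgcc, and the RUNNER wraps the test binaries in qemu-riscv64 with the vector extension enabled (VLEN=512) plus Zbb and the experimental ("x-") Zvbb.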
Jack O'Connor 2023-07-30 15:57:29 +08:00
parent 9ade720b60
commit 13f6c6fb7a
9 changed files with 1088 additions and 29 deletions

View File

@ -178,6 +178,8 @@ jobs:
- armv7-unknown-linux-gnueabihf
- aarch64-unknown-linux-gnu
- mips-unknown-linux-gnu
- riscv32-unknown-linux-gnu
- riscv64-unknown-linux-gnu
steps:
- uses: actions/checkout@v3

View File

@ -4572,7 +4572,7 @@ blake3_guts_avx512_hash_chunks_16_exact:
_blake3_guts_avx512_hash_parents_16_exact:
blake3_guts_avx512_hash_parents_16_exact:
// load the transposed CVs and split alternating words into the low and
// high halfs of the input vectors
// high halves of the input vectors
vmovdqa32 zmm0, ZMMWORD PTR [EVEN+rip]
vmovdqa32 zmm1, ZMMWORD PTR [ODD+rip]
vmovdqa32 zmm16, ZMMWORD PTR [rdi+0x0*0x40]

View File

@ -60,6 +60,10 @@ fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
fn is_riscv64gc() -> bool {
target_components()[0] == "riscv64gc"
}
// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
// right compiler flags to use depend on the toolchain. (And we don't want to
// use flag_if_supported, because we don't want features to be silently
@ -225,6 +229,18 @@ fn build_neon_c_intrinsics() {
build.compile("blake3_neon");
}
fn build_riscv64gcv_assembly() {
println!("cargo:rustc-cfg=blake3_riscv64gcv_ffi");
let mut build = new_build();
let asm_path = "src/riscv64gcv.S";
build.file(asm_path);
build.flag("--target=riscv64");
build.flag("-march=rv64gcv_zbb_zvbb1p0");
build.flag("-menable-experimental-extensions");
build.compile("blake3_riscv64gcv_assembly");
println!("cargo:rerun-if-changed={asm_path}");
}
fn main() {
if is_pure() && is_neon() {
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
@ -258,6 +274,12 @@ fn main() {
build_neon_c_intrinsics();
}
// TODO: This implementation assumes some bleeding-edge extensions, and it should probably be
// gated by a Cargo feature.
if is_riscv64gc() && !is_pure() {
build_riscv64gcv_assembly();
}
// The `cc` crate doesn't automatically emit rerun-if directives for the
// environment variables it supports, in particular for $CC. We expect to
// do a lot of benchmarking across different compilers, so we explicitly

View File

@ -1,6 +1,10 @@
use crate::{BlockBytes, CVBytes, Implementation, BLOCK_LEN, CHUNK_LEN};
const DEGREE: usize = 16;
pub(crate) const DEGREE: usize = 16;
unsafe extern "C" fn degree() -> usize {
DEGREE
}
extern "C" {
fn blake3_guts_avx512_compress(
@ -232,7 +236,7 @@ fn supported() -> bool {
pub fn implementation() -> Option<Implementation> {
if supported() {
Some(Implementation::new(
|| DEGREE,
degree,
blake3_guts_avx512_compress,
hash_chunks,
hash_parents,

View File

@ -7,6 +7,8 @@ use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod avx512;
pub mod portable;
#[cfg(any(target_arch = "riscv64"))]
pub mod riscv64gcv;
#[cfg(test)]
mod test;
@ -42,7 +44,9 @@ pub const MSG_SCHEDULE: [[usize; 16]; 7] = [
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
pub const MAX_SIMD_DEGREE: usize = 16;
pub const MAX_SIMD_DEGREE: usize = avx512::DEGREE;
} else if #[cfg(target_arch = "riscv64")] {
pub const MAX_SIMD_DEGREE: usize = riscv64gcv::MAX_SIMD_DEGREE;
} else if #[cfg(blake3_neon)] {
pub const MAX_SIMD_DEGREE: usize = 4;
} else {
@ -73,6 +77,11 @@ fn detect() -> Implementation {
return avx512_impl;
}
}
#[cfg(target_arch = "riscv64")]
{
return riscv64gcv::implementation();
}
#[allow(unreachable_code)]
portable::implementation()
}
@ -140,7 +149,7 @@ impl Implementation {
#[inline]
pub fn degree(&self) -> usize {
let degree = self.degree_fn()();
let degree = unsafe { self.degree_fn()() };
debug_assert!(degree >= 2);
debug_assert!(degree <= MAX_SIMD_DEGREE);
debug_assert_eq!(1, degree.count_ones(), "power of 2");
@ -222,11 +231,20 @@ impl Implementation {
pub fn hash_parents(
&self,
transposed_input: &TransposedVectors,
num_cvs: usize,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(
@ -239,10 +257,7 @@ impl Implementation {
}
if num_cvs % 2 == 1 {
unsafe {
copy_one_transposed_cv(
transposed_input.as_ptr().add(num_cvs - 1),
transposed_output.ptr.add(num_parents),
);
write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents));
}
num_parents + 1
} else {
@ -254,18 +269,27 @@ impl Implementation {
pub fn reduce_parents(
&self,
transposed_in_out: &mut TransposedVectors,
num_cvs: usize,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
) -> usize {
let num_parents = num_cvs / 2;
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let in_out_ptr = transposed_in_out.as_mut_ptr();
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr);
}
if num_cvs % 2 == 1 {
unsafe {
copy_one_transposed_cv(in_out_ptr.add(num_cvs - 1), in_out_ptr.add(num_parents));
write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents));
}
num_parents + 1
} else {
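
Both hash_parents and reduce_parents now read the trailing odd CV into a local *before* running the kernel. This matters for reduce_parents, which hashes in place: copying the odd CV after the kernel would rely on the kernel having left that input column untouched, an assumption the up-front snapshot removes. A minimal sketch of the pattern on a flat array rather than the transposed layout (all names hypothetical):

    // XOR stands in for parent hashing; writes index i from indices 2i and 2i+1.
    fn transform_pairs_in_place(buf: &mut [u32], pairs: usize) {
        for i in 0..pairs {
            buf[i] = buf[2 * i] ^ buf[2 * i + 1];
        }
    }

    fn reduce_in_place(buf: &mut [u32], n: usize) -> usize {
        let last = buf[n - 1]; // snapshot before the in-place transform
        let pairs = n / 2;
        transform_pairs_in_place(buf, pairs);
        if n % 2 == 1 {
            buf[pairs] = last; // restore the saved odd CV afterwards
            pairs + 1
        } else {
            pairs
        }
    }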
@ -384,9 +408,9 @@ impl Clone for Implementation {
}
// never less than 2
type DegreeFn = fn() -> usize;
type DegreeFn = unsafe extern "C" fn() -> usize;
fn degree_init() -> usize {
unsafe extern "C" fn degree_init() -> usize {
init_detected_impl();
DETECTED_IMPL.degree_fn()()
}
@ -756,11 +780,19 @@ pub struct TransposedSplit<'vectors> {
unsafe impl<'vectors> Send for TransposedSplit<'vectors> {}
unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {}
unsafe fn copy_one_transposed_cv(transposed_src: *const u32, transposed_dest: *mut u32) {
unsafe fn read_transposed_cv(src: *const u32) -> CVWords {
let mut cv = [0u32; 8];
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
let word = transposed_src.add(offset_words).read();
transposed_dest.add(offset_words).write(word);
cv[word_index] = src.add(offset_words).read();
}
cv
}
unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) {
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
dest.add(offset_words).write(cv[word_index]);
}
}
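
The read_transposed_cv/write_transposed_cv helpers above walk the transposed layout, in which word w of the CV in column c lives at index w * TRANSPOSED_STRIDE + c rather than in 8 contiguous words. A small illustration of that indexing, assuming TRANSPOSED_STRIDE = 2 * MAX_SIMD_DEGREE as elsewhere in this crate (the helper name is invented):

    // Index of word `word` of the CV stored in column `column`.
    const TRANSPOSED_STRIDE: usize = 2 * 16; // 2 * MAX_SIMD_DEGREE
    fn transposed_index(word: usize, column: usize) -> usize {
        word * TRANSPOSED_STRIDE + column
    }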

View File

@ -5,6 +5,10 @@ use crate::{
const DEGREE: usize = MAX_SIMD_DEGREE;
unsafe extern "C" fn degree() -> usize {
DEGREE
}
#[inline(always)]
fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
@ -67,7 +71,7 @@ fn compress_inner(
state
}
unsafe extern "C" fn compress(
pub(crate) unsafe extern "C" fn compress(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
@ -84,7 +88,7 @@ unsafe extern "C" fn compress(
*out = le_bytes_from_words_32(state[..8].try_into().unwrap());
}
unsafe extern "C" fn compress_xof(
pub(crate) unsafe extern "C" fn compress_xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
@ -102,7 +106,7 @@ unsafe extern "C" fn compress_xof(
*out = le_bytes_from_words_64(&state);
}
unsafe extern "C" fn hash_chunks(
pub(crate) unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const CVBytes,
@ -121,7 +125,7 @@ unsafe extern "C" fn hash_chunks(
)
}
unsafe extern "C" fn hash_parents(
pub(crate) unsafe extern "C" fn hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
@ -138,7 +142,7 @@ unsafe extern "C" fn hash_parents(
)
}
unsafe extern "C" fn xof(
pub(crate) unsafe extern "C" fn xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
@ -159,7 +163,7 @@ unsafe extern "C" fn xof(
)
}
unsafe extern "C" fn xof_xor(
pub(crate) unsafe extern "C" fn xof_xor(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
@ -180,7 +184,7 @@ unsafe extern "C" fn xof_xor(
)
}
unsafe extern "C" fn universal_hash(
pub(crate) unsafe extern "C" fn universal_hash(
input: *const u8,
input_len: usize,
key: *const CVBytes,
@ -192,7 +196,7 @@ unsafe extern "C" fn universal_hash(
pub fn implementation() -> Implementation {
Implementation::new(
|| DEGREE,
degree,
compress,
hash_chunks,
hash_parents,

rust/guts/src/riscv64gcv.S (new file, 913 lines)
View File

@ -0,0 +1,913 @@
// This implementation currently assumes riscv64gcv_zbb_zvbb. Zvbb in particular ("Vector
// Bit-manipulation used in Cryptography") is a bleeding-edge extension that was only frozen a few
// weeks ago at the time I'm writing this comment. Compiling and testing this code currently
// requires quite a lot of effort, including building Clang from master and building QEMU from a
// custom branch. Please don't expect this code to be usable on real hardware for some time.
#define IV0 0x6A09E667
#define IV1 0xBB67AE85
#define IV2 0x3C6EF372
#define IV3 0xA54FF53A
.section .text
.global blake3_guts_riscv64gcv_degree
blake3_guts_riscv64gcv_degree:
// vlenb is VLEN in bytes, so shifting right by 2 yields VLEN/32, the number
// of 32-bit lanes per vector register.
csrr t0, vlenb
srli a0, t0, 2
ret
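// The shared compression kernel (a local label, called below from
// blake3_guts_riscv64gcv_hash_parents). Register convention, as set up by the
// caller: CV/key words in v0-v7, counter halves in v12-v13, block length in
// v14, flags in v15, and transposed message words in v16-v31. The IV
// constants are loaded into v8-v11 here.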
blake3_guts_riscv64gcv_kernel:
li t0, IV0
vmv.v.x v8, t0
li t0, IV1
vmv.v.x v9, t0
li t0, IV2
vmv.v.x v10, t0
li t0, IV3
vmv.v.x v11, t0
vadd.vv v0, v0, v16
vadd.vv v1, v1, v18
vadd.vv v2, v2, v20
vadd.vv v3, v3, v22
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v17
vadd.vv v1, v1, v19
vadd.vv v2, v2, v21
vadd.vv v3, v3, v23
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v24
vadd.vv v1, v1, v26
vadd.vv v2, v2, v28
vadd.vv v3, v3, v30
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v25
vadd.vv v1, v1, v27
vadd.vv v2, v2, v29
vadd.vv v3, v3, v31
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v18
vadd.vv v1, v1, v19
vadd.vv v2, v2, v23
vadd.vv v3, v3, v20
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v22
vadd.vv v1, v1, v26
vadd.vv v2, v2, v16
vadd.vv v3, v3, v29
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v17
vadd.vv v1, v1, v28
vadd.vv v2, v2, v25
vadd.vv v3, v3, v31
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v27
vadd.vv v1, v1, v21
vadd.vv v2, v2, v30
vadd.vv v3, v3, v24
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v19
vadd.vv v1, v1, v26
vadd.vv v2, v2, v29
vadd.vv v3, v3, v23
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v20
vadd.vv v1, v1, v28
vadd.vv v2, v2, v18
vadd.vv v3, v3, v30
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v22
vadd.vv v1, v1, v25
vadd.vv v2, v2, v27
vadd.vv v3, v3, v24
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v21
vadd.vv v1, v1, v16
vadd.vv v2, v2, v31
vadd.vv v3, v3, v17
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v26
vadd.vv v1, v1, v28
vadd.vv v2, v2, v30
vadd.vv v3, v3, v29
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v23
vadd.vv v1, v1, v25
vadd.vv v2, v2, v19
vadd.vv v3, v3, v31
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v20
vadd.vv v1, v1, v27
vadd.vv v2, v2, v21
vadd.vv v3, v3, v17
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v16
vadd.vv v1, v1, v18
vadd.vv v2, v2, v24
vadd.vv v3, v3, v22
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v28
vadd.vv v1, v1, v25
vadd.vv v2, v2, v31
vadd.vv v3, v3, v30
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v29
vadd.vv v1, v1, v27
vadd.vv v2, v2, v26
vadd.vv v3, v3, v24
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v23
vadd.vv v1, v1, v21
vadd.vv v2, v2, v16
vadd.vv v3, v3, v22
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v18
vadd.vv v1, v1, v19
vadd.vv v2, v2, v17
vadd.vv v3, v3, v20
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v25
vadd.vv v1, v1, v27
vadd.vv v2, v2, v24
vadd.vv v3, v3, v31
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v30
vadd.vv v1, v1, v21
vadd.vv v2, v2, v28
vadd.vv v3, v3, v17
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v29
vadd.vv v1, v1, v16
vadd.vv v2, v2, v18
vadd.vv v3, v3, v20
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v19
vadd.vv v1, v1, v26
vadd.vv v2, v2, v22
vadd.vv v3, v3, v23
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
vadd.vv v0, v0, v27
vadd.vv v1, v1, v21
vadd.vv v2, v2, v17
vadd.vv v3, v3, v24
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vror.vi v15, v15, 16
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 12
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vadd.vv v0, v0, v31
vadd.vv v1, v1, v16
vadd.vv v2, v2, v25
vadd.vv v3, v3, v22
vadd.vv v0, v0, v4
vadd.vv v1, v1, v5
vadd.vv v2, v2, v6
vadd.vv v3, v3, v7
vxor.vv v12, v12, v0
vxor.vv v13, v13, v1
vxor.vv v14, v14, v2
vxor.vv v15, v15, v3
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vror.vi v15, v15, 8
vadd.vv v8, v8, v12
vadd.vv v9, v9, v13
vadd.vv v10, v10, v14
vadd.vv v11, v11, v15
vxor.vv v4, v4, v8
vxor.vv v5, v5, v9
vxor.vv v6, v6, v10
vxor.vv v7, v7, v11
vror.vi v4, v4, 7
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vadd.vv v0, v0, v30
vadd.vv v1, v1, v18
vadd.vv v2, v2, v19
vadd.vv v3, v3, v23
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 16
vror.vi v12, v12, 16
vror.vi v13, v13, 16
vror.vi v14, v14, 16
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 12
vror.vi v6, v6, 12
vror.vi v7, v7, 12
vror.vi v4, v4, 12
vadd.vv v0, v0, v26
vadd.vv v1, v1, v28
vadd.vv v2, v2, v20
vadd.vv v3, v3, v29
vadd.vv v0, v0, v5
vadd.vv v1, v1, v6
vadd.vv v2, v2, v7
vadd.vv v3, v3, v4
vxor.vv v15, v15, v0
vxor.vv v12, v12, v1
vxor.vv v13, v13, v2
vxor.vv v14, v14, v3
vror.vi v15, v15, 8
vror.vi v12, v12, 8
vror.vi v13, v13, 8
vror.vi v14, v14, 8
vadd.vv v10, v10, v15
vadd.vv v11, v11, v12
vadd.vv v8, v8, v13
vadd.vv v9, v9, v14
vxor.vv v5, v5, v10
vxor.vv v6, v6, v11
vxor.vv v7, v7, v8
vxor.vv v4, v4, v9
vror.vi v5, v5, 7
vror.vi v6, v6, 7
vror.vi v7, v7, 7
vror.vi v4, v4, 7
ret
// a0: aligned+transposed input
// a1: num_parents
// a2: key
// a3: flags
// a4: out pointer
.global blake3_guts_riscv64gcv_hash_parents
blake3_guts_riscv64gcv_hash_parents:
// the transposed stride in bytes: 2 * MAX_SIMD_DEGREE * 4 = 128
li t1, 128
// load the transposed CVs and split alternating words into the low and
// high halves of the input vectors
vsetvli zero, a1, e32, m1, ta, ma
vlseg2e32.v v16, (a0)
vmv.v.v v24, v17
add a0, a0, t1
vlseg2e32.v v17, (a0)
vmv.v.v v25, v18
add a0, a0, t1
vlseg2e32.v v18, (a0)
vmv.v.v v26, v19
add a0, a0, t1
vlseg2e32.v v19, (a0)
vmv.v.v v27, v20
add a0, a0, t1
vlseg2e32.v v20, (a0)
vmv.v.v v28, v21
add a0, a0, t1
vlseg2e32.v v21, (a0)
vmv.v.v v29, v22
add a0, a0, t1
vlseg2e32.v v22, (a0)
vmv.v.v v30, v23
add a0, a0, t1
vlseg2e32.v v14, (a0) // use v14-15 as scratch space to avoid overwriting v24
vmv.v.v v23, v14
vmv.v.v v31, v15
// broadcast the key
lwu t0, (a2)
vmv.v.x v0, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v1, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v2, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v3, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v4, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v5, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v6, t0
addi a2, a2, 4
lwu t0, (a2)
vmv.v.x v7, t0
// zero the counter
vmv.v.i v12, 0
vmv.v.i v13, 0
// broadcast the block length
li t0, 64
vmv.v.x v14, t0
// broadcast the flags
vmv.v.x v15, a3
// execute the kernel
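// `call` clobbers ra and no stack frame has been set up, so stash the
// return address in t6, which the kernel doesn't touch.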
mv t6, ra
call blake3_guts_riscv64gcv_kernel
mv ra, t6
// xor the two halves of the state
vxor.vv v0, v0, v8
vxor.vv v1, v1, v9
vxor.vv v2, v2, v10
vxor.vv v3, v3, v11
vxor.vv v4, v4, v12
vxor.vv v5, v5, v13
vxor.vv v6, v6, v14
vxor.vv v7, v7, v15
// write aligned+transposed outputs with a stride of 2*MAX_SIMD_DEGREE words
vse32.v v0, (a4)
add a4, a4, t1
vse32.v v1, (a4)
add a4, a4, t1
vse32.v v2, (a4)
add a4, a4, t1
vse32.v v3, (a4)
add a4, a4, t1
vse32.v v4, (a4)
add a4, a4, t1
vse32.v v5, (a4)
add a4, a4, t1
vse32.v v6, (a4)
add a4, a4, t1
vse32.v v7, (a4)
ret

View File

@ -0,0 +1,82 @@
//! This implementation currently assumes riscv64gcv_zbb_zvbb. Zvbb in particular ("Vector
//! Bit-manipulation used in Cryptography") is a bleeding-edge extension that was only frozen a few
//! weeks ago at the time I'm writing this comment. Compiling and testing this code currently
//! requires quite a lot of effort, including building Clang from master and building QEMU from a
//! custom branch. Please don't expect this code to be usable on real hardware for some time.
use crate::{CVBytes, Implementation};
pub(crate) const MAX_SIMD_DEGREE: usize = 16;
extern "C" {
fn blake3_guts_riscv64gcv_degree() -> usize;
fn blake3_guts_riscv64gcv_hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32,
);
}
pub fn implementation() -> Implementation {
Implementation::new(
blake3_guts_riscv64gcv_degree,
crate::portable::compress,
crate::portable::hash_chunks,
blake3_guts_riscv64gcv_hash_parents,
crate::portable::xof,
crate::portable::xof_xor,
crate::portable::universal_hash,
)
}
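Note that only degree and hash_parents are backed by the new assembly so far; compress, hash_chunks, xof, xof_xor, and universal_hash still fall back to the portable implementation.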
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_compress_vs_portable() {
crate::test::test_compress_vs_portable(&implementation());
}
#[test]
fn test_compress_vs_reference() {
crate::test::test_compress_vs_reference(&implementation());
}
#[test]
fn test_hash_chunks_vs_portable() {
crate::test::test_hash_chunks_vs_portable(&implementation());
}
#[test]
fn test_hash_parents_vs_portable() {
crate::test::test_hash_parents_vs_portable(&implementation());
}
#[test]
fn test_chunks_and_parents_vs_reference() {
crate::test::test_chunks_and_parents_vs_reference(&implementation());
}
#[test]
fn test_xof_vs_portable() {
crate::test::test_xof_vs_portable(&implementation());
}
#[test]
fn test_xof_vs_reference() {
crate::test::test_xof_vs_reference(&implementation());
}
#[test]
fn test_universal_hash_vs_portable() {
crate::test::test_universal_hash_vs_portable(&implementation());
}
#[test]
fn test_universal_hash_vs_reference() {
crate::test::test_universal_hash_vs_reference(&implementation());
}
}

View File

@ -101,7 +101,7 @@ pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) {
// 95 is one whole block plus one interesting part of another
input_2_lengths.push(next_len - 95);
input_2_lengths.push(next_len);
if next_len == MAX_SIMD_DEGREE * CHUNK_LEN {
if next_len == test_impl.degree() * CHUNK_LEN {
break;
}
input_2_lengths.push(next_len + 95);
@ -165,7 +165,7 @@ pub fn test_hash_parents_vs_portable(test_impl: &Implementation) {
let input = painted_transposed_input();
for num_parents in 2..=(test_impl.degree() / 2) {
dbg!(num_parents);
let mut portable_output = TransposedVectors(input.0);
let mut portable_output = TransposedVectors::new();
let (portable_left, portable_right) =
test_impl.split_transposed_vectors(&mut portable_output);
portable::implementation().hash_parents(
@ -183,7 +183,7 @@ pub fn test_hash_parents_vs_portable(test_impl: &Implementation) {
portable_right,
);
let mut test_output = TransposedVectors(input.0);
let mut test_output = TransposedVectors::new();
let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output);
test_impl.hash_parents(
&input,