mirror of https://github.com/BLAKE3-team/BLAKE3

WIP: riscv universal_hash working for whole blocks

Jack O'Connor 2023-09-17 17:37:34 -07:00
parent 3fa197260c
commit fd91b59473


@@ -14,6 +14,14 @@
#define TRANSPOSED_STRIDE_BYTES 2 * MAX_SIMD_DEGREE * 4
#define CHUNK_START (1 << 0)
#define CHUNK_END (1 << 1)
#define PARENT (1 << 2)
#define ROOT (1 << 3)
#define KEYED_HASH (1 << 4)
#define DERIVE_KEY_CONTEXT (1 << 5)
#define DERIVE_KEY_MATERIAL (1 << 6)
.section .text
.p2align 2
@@ -1609,45 +1617,78 @@ blake3_guts_riscv64gcv_xof_xor_partial_block:
// a4: out_ptr
.global blake3_guts_riscv64gcv_universal_hash
blake3_guts_riscv64gcv_universal_hash:
// Prepare the padding null bytes in v8-v11, length = -input_len & 63.
// LMUL=4 is guaranteed to be sufficient to hold 64 bytes. Retain this
// length in t3.
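// (-input_len & 63) is the distance from input_len up to the next multiple
// of 64: for example, input_len = 100 needs 28 padding bytes, and an input
// that is already a multiple of 64 bytes needs none.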
neg t3, a1
andi t3, t3, 63
vsetvli zero, t3, e8, m4, ta, ma
vmv.v.i v8, 0
// t0 := full_blocks := input_len / 64
// TODO: handle the partial block at the end
srli t0, a1, 6
// Load the counter.
vsetvli zero, t0, e64, m2, ta, ma
vmv.v.x v8, a3
vid.v v10
vadd.vv v8, v8, v10
vsetvli zero, t0, e32, m1, ta, ma
vncvt.x.x.w v12, v8
li t1, 32
vnsrl.wx v13, v8, t1
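// v12/v13 now hold the low/high 32-bit halves of (counter + block index) for
// each lane, which become the two counter words of that block's compression
// state.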
// Load and transpose full message blocks. These are "strided segment
// loads". Each vlsseg8e32 instruction transposes 8 words from multiple
// message blocks into 8 registers, so we need two vlsseg8e32
// instructions (with the second offset by 32 bytes) to load full
// 64-byte blocks. The 64-byte stride equals the block size, because in
// this case (unlike hash_blocks) the blocks are adjacent.
// NOTE: These loads could be misaligned. As far as I know, the Linux
// RISC-V ABI allows misaligned loads and stores. If we need to support
// an environment that doesn't allow them (or where they're
// unacceptably slow), we could add a fallback here.
li t1, 64
addi t2, a0, 32
vlsseg8e32.v v16, (a0), t1
vlsseg8e32.v v24, (t2), t1
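// After both loads, lane i of v16-v23 holds words 0-7 of block i and lane i
// of v24-v31 holds words 8-15, i.e. the message blocks are now in transposed
// (word-major) layout.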
// Broadcast the key to v0-7.
lw t0, 0(a2)
vmv.v.x v0, t0
lw t0, 4(a2)
vmv.v.x v1, t0
lw t0, 8(a2)
vmv.v.x v2, t0
lw t0, 12(a2)
vmv.v.x v3, t0
lw t0, 16(a2)
vmv.v.x v4, t0
lw t0, 20(a2)
vmv.v.x v5, t0
lw t0, 24(a2)
vmv.v.x v6, t0
lw t0, 28(a2)
vmv.v.x v7, t0
// Broadcast the block length.
li t1, 64
vmv.v.x v14, t1
// Broadcast the flags.
li t1, CHUNK_START | CHUNK_END | ROOT | KEYED_HASH
vmv.v.x v15, t1
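// Each lane's compression inputs are now in place: the key words in v0-v7
// serve as the chaining value, v12/v13 hold the per-block counter, v14 the
// block length (64), and v15 the flags
// (CHUNK_START | CHUNK_END | ROOT | KEYED_HASH = 0x1B), so every 64-byte
// block is hashed as an independent single-block keyed root chunk. The
// remaining state words (the IV constants) are presumably supplied by the
// kernel.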
// Execute the kernel.
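// (call clobbers ra, so it is stashed in t6 across the kernel call instead
// of being spilled to the stack.)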
mv t6, ra
call blake3_guts_riscv64gcv_kernel
mv ra, t6
// XOR the first four words. The rest are dropped.
vxor.vv v0, v0, v8
vxor.vv v1, v1, v9
vxor.vv v2, v2, v10
vxor.vv v3, v3, v11
// XOR-reduce each vector.
vmv.v.i v4, 0
vredxor.vs v0, v0, v4
vredxor.vs v1, v1, v4
vredxor.vs v2, v2, v4
vredxor.vs v3, v3, v4
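// Element 0 of v0-v3 now holds, for output words 0-3 respectively, the XOR
// of that word across all blocks.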
// Write the output.
vmv.x.s t0, v0
sw t0, 0(a4)
vmv.x.s t0, v1
sw t0, 4(a4)
vmv.x.s t0, v2
sw t0, 8(a4)
vmv.x.s t0, v3
sw t0, 12(a4)
// Load the input into v16-v31 and slide the padding bytes into place.
// Rather than checking which register group needs to be padded, just pad
// them both. The vslideup will be a no-op if the offset is >= vl, and
// anything after the last block of input will ultimately be ignored.
vsetvli t0, a1, e8, m8, ta, ma
vle8.v v16, (a0)
add t1, a1, t3
vsetvli zero, t1, e8, m8, ta, ma
vslideup.vx v16, v8, a1
add a0, a0, t0
sub a1, a1, t0
vsetvli t0, a1, e8, m8, ta, ma
vle8.v v24, (a0)
add t1, a1, t3
vsetvli zero, t1, e8, m8, ta, ma
vslideup.vx v24, v8, a1
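// In each group the slideup offset is the number of input bytes that
// remained before the load, so if the input ends inside that group the zero
// bytes from v8 land immediately after the last input byte; if the input
// extends past the group, the offset is >= vl and the slideup does nothing.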
// Compute the vrgather indexes for the first of two rounds of vrgathers.
// We'll gather four vectors at a time, which lets us leave v0-v8
// untouched. That isn't a requirement in this case, but we can use the
// same vrgathers in hash_blocks where it is a requirement. Let W be the
// max number of 32-bit words per physical vector register (vlenb/4), then
// the gather indexes are:
// 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, ...
// f(i) = (i/4)*16 + (i%4)
vsetvli t0, zero, e16, m2, ta, ma // VLMAX
vid.v v8
vid.v v10
vsrl.vi v8, v8, 2 // /4
vand.vi v10, v10, 3 // %4
vsll.vi v8, v8, 4 // *16
vadd.vv v8, v8, v10
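// For example, the first eight indexes are 0, 1, 2, 3, 16, 17, 18, 19: each
// group of four destination words comes from the start of the next 16-word
// (64-byte) block.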
ret
ret
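
For orientation, the whole-block path above corresponds to the scalar C sketch below. The compress() helper, its signature, and the name universal_hash_whole_blocks are illustrative stand-ins (compress() represents the BLAKE3 compression function returning all 16 output state words); the key-as-chaining-value, the per-block counter increment, the flag set, and the XOR of the first four output words are taken from the assembly above.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CHUNK_START (1 << 0)
#define CHUNK_END (1 << 1)
#define ROOT (1 << 3)
#define KEYED_HASH (1 << 4)

// Hypothetical stand-in for the BLAKE3 compression function, producing the
// full 16-word output state.
void compress(const uint32_t cv[8], const uint32_t block_words[16],
              uint64_t counter, uint32_t block_len, uint32_t flags,
              uint32_t out_words[16]);

// Scalar model of the whole-block path: every 64-byte block is compressed as
// its own single-block keyed root chunk, the counter increments per block,
// and the first four output words of all compressions are XORed together.
void universal_hash_whole_blocks(const uint8_t *input, size_t input_len,
                                 const uint32_t key[8], uint64_t counter,
                                 uint8_t out[16]) {
  uint32_t acc[4] = {0, 0, 0, 0};
  for (size_t off = 0; off + 64 <= input_len; off += 64, counter++) {
    uint32_t block_words[16];
    memcpy(block_words, input + off, 64); // RISC-V is little-endian
    uint32_t out_words[16];
    compress(key, block_words, counter, 64,
             CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, out_words);
    for (int i = 0; i < 4; i++) {
      acc[i] ^= out_words[i];
    }
  }
  memcpy(out, acc, 16);
}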