mirror of https://github.com/BLAKE3-team/BLAKE3 (synced 2024-09-26 13:00:43 +02:00)

add support for partial chunks

parent 207b94c34e, commit 459af9529e
@@ -185,6 +185,45 @@ impl Implementation {
        out
    }

    // The contract for HashChunksFn doesn't require the implementation to support single-chunk
    // inputs. Instead we handle that case here by calling compress in a loop.
    #[inline]
    fn hash_one_chunk(
        &self,
        mut input: &[u8],
        key: &CVBytes,
        counter: u64,
        mut flags: u32,
        output: TransposedSplit,
    ) {
        debug_assert!(input.len() <= CHUNK_LEN);
        let mut cv = *key;
        flags |= CHUNK_START;
        while input.len() > BLOCK_LEN {
            cv = self.compress(
                input[..BLOCK_LEN].try_into().unwrap(),
                BLOCK_LEN as u32,
                &cv,
                counter,
                flags,
            );
            input = &input[BLOCK_LEN..];
            flags &= !CHUNK_START;
        }
        let mut final_block = [0u8; BLOCK_LEN];
        final_block[..input.len()].copy_from_slice(input);
        cv = self.compress(
            &final_block,
            input.len() as u32,
            &cv,
            counter,
            flags | CHUNK_END,
        );
        unsafe {
            write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr);
        }
    }

    #[inline]
    fn hash_chunks_fn(&self) -> HashChunksFn {
        unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) }
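Aside (an illustration, not part of the commit): the per-block schedule that hash_one_chunk implements is easy to check in plain Rust. CHUNK_START is set only on the chunk's first block, CHUNK_END only on its last, and the final block is zero-padded while its true length is passed to the compression function. The helper below is a hypothetical sketch; the flag values match BLAKE3's (the assembly later in this commit notes CHUNK_END = 2).

    const BLOCK_LEN: usize = 64;
    const CHUNK_LEN: usize = 1024;
    const CHUNK_START: u32 = 1 << 0;
    const CHUNK_END: u32 = 1 << 1;

    // (block_len, flags) for each compress call hash_one_chunk would make.
    fn block_schedule(len: usize) -> Vec<(u32, u32)> {
        assert!(len > 0 && len <= CHUNK_LEN);
        let mut out = Vec::new();
        let mut remaining = len;
        let mut flags = CHUNK_START;
        while remaining > BLOCK_LEN {
            out.push((BLOCK_LEN as u32, flags));
            remaining -= BLOCK_LEN;
            flags &= !CHUNK_START;
        }
        // The final block is zero-padded; its real length goes to compress.
        out.push((remaining as u32, flags | CHUNK_END));
        out
    }

    fn main() {
        assert_eq!(block_schedule(1), vec![(1, CHUNK_START | CHUNK_END)]);
        assert_eq!(block_schedule(65), vec![(64, CHUNK_START), (1, CHUNK_END)]);
        assert_eq!(block_schedule(128), vec![(64, CHUNK_START), (64, CHUNK_END)]);
    }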
@@ -199,8 +238,14 @@ impl Implementation {
        flags: u32,
        transposed_output: TransposedSplit,
    ) -> usize {
        debug_assert!(input.len() > 0);
        debug_assert!(input.len() <= self.degree() * CHUNK_LEN);
        if input.len() <= CHUNK_LEN {
            // The underlying hash_chunks_fn isn't required to support this case. Instead we handle
            // it by calling compress_fn in a loop. But note that we still don't support root
            // finalization or the empty input here.
            self.hash_one_chunk(input, key, counter, flags, transposed_output);
            return 1;
        }
        // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently
        // ignore the remainder. This makes it impossible to write out of bounds in a properly
        // constructed TransposedSplit.
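Aside (not part of the commit): with this change the caller-visible contract is that hash_chunks writes one transposed CV per chunk, including a trailing partial chunk, and returns the count. A standalone sketch of that arithmetic, under the same bounds the debug_asserts enforce:

    const CHUNK_LEN: usize = 1024;

    // Number of CVs hash_chunks writes for a given input length (sketch).
    fn chunks_written(input_len: usize, degree: usize) -> usize {
        assert!(input_len > 0 && input_len <= degree * CHUNK_LEN);
        if input_len <= CHUNK_LEN {
            1 // the hash_one_chunk path above
        } else {
            (input_len + CHUNK_LEN - 1) / CHUNK_LEN // ceiling division
        }
    }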
@@ -827,24 +827,32 @@ blake3_guts_riscv64gcv_kernel:
// a3: counter
// a4: flags
// a5: aligned+transposed output [unused]
// a6: total chunks [unused]
// a7: remaining_bytes_in_last_chunk
blake3_guts_riscv64gcv_hash_blocks:
        // t0 = full_blocks = (input_len + 1024 - 64) / 1024
        // t0 := full_blocks := (input_len + 1024 - 64) / 1024
        addi t0, a1, 1024 - 64
        srli t0, t0, 10
        // Load and transpose full message blocks.
        // NOTE: If the final chunk is short, this could be 1 less than the
        // total number of chunks, in which case this setup code and the kernel
        // will leave a CV word undisturbed in each of v0-v7.
        addi t0, a1, 1024 - 64
        srli t0, t0, 10
        vsetvli zero, t0, e32, m1, ta, ma
        li t1, 1024
        // NOTE: These loads could be misaligned. As far as I know, the Linux
        // RISC-V ABI allows misaligned loads and stores. If we need to support
        // an environment that doesn't allow them (or where they're
        // unacceptably slow), we could add a fallback here.
        vsetvli zero, t0, e32, m1, ta, ma
        li t1, 1024
        addi t2, a0, 32
        vlsseg8e32.v v16, (a0), t1
        addi a0, a0, 32
        vlsseg8e32.v v24, (a0), t1
        addi a0, a0, 32
        addi a1, a1, -64
        vlsseg8e32.v v24, (t2), t1
        // If remaining_bytes_in_last_chunk in 1..=63, there's a partial block
        // at the end. Handle it out-of-line. If we take this branch, it will
        // increment t0 by 1.
        addi t1, a7, -1
        li t2, 63
        bltu t1, t2, handle_partial_block
partial_block_finished:
        // load the counter
        vsetvli zero, t0, e64, m2, ta, ma
        vmv.v.x v8, a3
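Aside (not part of the commit): each call to hash_blocks consumes one 64-byte block per chunk, so the t0 computed above counts the chunks that still have a full block left at this round. A trailing chunk whose next block is partial is excluded, which is what the NOTE about leaving a CV word undisturbed refers to. In Rust:

    // t0 = full_blocks = (input_len + 1024 - 64) / 1024 (sketch)
    fn full_blocks(input_len: usize) -> usize {
        (input_len + 1024 - 64) / 1024
    }

    fn main() {
        assert_eq!(full_blocks(4 * 1024), 4); // all four chunks have full blocks
        assert_eq!(full_blocks(3 * 1024 + 64), 4); // exactly one full block left in the last chunk
        assert_eq!(full_blocks(3 * 1024 + 10), 3); // the last chunk's next block is partial
    }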
@@ -857,12 +865,26 @@ blake3_guts_riscv64gcv_hash_blocks:
        vncvt.x.x.w v12, v8
        li t1, 32
        vnsrl.wx v13, v8, t1
        // broadcast the block length
        // TODO: handle partial blocks
        // Broadcast the block length, then overwrite the last block's length
        // to be ((min(64, remaining_bytes_in_last_chunk) - 1) % 64) + 1. That
        // is: 64 if remaining_bytes_in_last_chunk >= 64
        //     else 64 if remaining_bytes_in_last_chunk is 0
        //     else remaining_bytes_in_last_chunk
        li t1, 64
        vmv.v.x v14, t1
        // broadcast the flags
        minu t1, t1, a7
        addi t1, t1, -1
        andi t1, t1, 63
        addi t1, t1, 1
        vslide1down.vx v14, v14, t1
        // Broadcast the flags, then set CHUNK_END in the last block's flags if
        // remaining_bytes_in_last_chunk is in 1..=64.
        vmv.v.x v15, a4
        addi t1, a7, -1
        sltiu t1, t1, 64
        slli t1, t1, 1 // CHUNK_END = 2
        or t1, t1, a4
        vslide1down.vx v15, v15, t1
        // execute the kernel
        mv t6, ra
        call blake3_guts_riscv64gcv_kernel
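Aside (not part of the commit): the two branchless computations above can be sanity-checked in Rust, with wrapping arithmetic standing in for the minu/addi/andi and addi/sltiu/slli sequences:

    const CHUNK_END: u32 = 2;

    // Last block's length: ((min(64, remaining) - 1) % 64) + 1.
    fn last_block_len(remaining: u64) -> u64 {
        (remaining.min(64).wrapping_sub(1) & 63) + 1
    }

    // CHUNK_END is set iff remaining is in 1..=64, i.e. (remaining - 1) <u 64.
    fn last_block_flags(remaining: u64, flags: u32) -> u32 {
        flags | if remaining.wrapping_sub(1) < 64 { CHUNK_END } else { 0 }
    }

    fn main() {
        assert_eq!(last_block_len(0), 64); // no partial chunk in this round
        assert_eq!(last_block_len(23), 23);
        assert_eq!(last_block_len(100), 64); // the last chunk's block is still full
        assert_eq!(last_block_flags(64, 0), CHUNK_END);
        assert_eq!(last_block_flags(65, 0), 0);
        assert_eq!(last_block_flags(0, 0), 0);
    }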
@@ -876,7 +898,80 @@ blake3_guts_riscv64gcv_hash_blocks:
        vxor.vv v5, v5, v13
        vxor.vv v6, v6, v14
        vxor.vv v7, v7, v15
        // Advance the input pointer, and decrement input_len and
        // remaining_bytes_in_last_chunk (which cannot go below zero).
        addi a0, a0, 64
        addi a1, a1, -64
        addi a7, a7, -64
        max a7, a7, zero
        ret
handle_partial_block:
        // The minimum VLEN is 128 bits, so we're guaranteed to be able to fit
        // the block in v8-v11 with LMUL=4. Zero those 64 bytes before the
        // load, to make sure the partial block is zero-padded.
        li t1, 64
        vsetvli zero, t1, e8, m4, ta, ma
        vmv.v.i v8, 0
        add t2, a0, a1
        sub t2, t2, a7
        vsetvli zero, a7, e8, m4, ta, ma
        vle8.v v8, (t2)
        // If VLEN is longer than 128 bits (16 bytes), then half or all of the
        // block bytes will be in v8. Make sure they're split evenly across
        // v8-v11.
        csrr t1, vlenb
        li t2, 64
        bltu t1, t2, vlenb_less_than_64
        vsetivli zero, 8, e32, m1, ta, ma
        vslidedown.vi v9, v8, 8
vlenb_less_than_64:
        li t2, 32
        bltu t1, t2, vlenb_less_than_32
        vsetivli zero, 4, e32, m1, ta, ma
        vmv.v.v v10, v9
        vslidedown.vi v11, v9, 4
        vslidedown.vi v9, v8, 4
vlenb_less_than_32:
        // Shift each of the words of the padded partial block to the end of
        // the corresponding message vector. t0 was previously the number of
        // full blocks. Now we increment it, so that it's the number of all
        // blocks (both full and partial).
        mv t1, t0
        addi t0, t0, 1
        // Set vl to at least 4, because v8-v11 each hold 4 message words.
        // Setting vl shorter would make vslide1down clobber those words.
        li t2, 4
        maxu t2, t0, t2
        vsetvli zero, t2, e32, m1, ta, ma
        vslideup.vx v16, v8, t1
        vslide1down.vx v8, v8, zero
        vslideup.vx v17, v8, t1
        vslide1down.vx v8, v8, zero
        vslideup.vx v18, v8, t1
        vslide1down.vx v8, v8, zero
        vslideup.vx v19, v8, t1
        vslideup.vx v20, v9, t1
        vslide1down.vx v9, v9, zero
        vslideup.vx v21, v9, t1
        vslide1down.vx v9, v9, zero
        vslideup.vx v22, v9, t1
        vslide1down.vx v9, v9, zero
        vslideup.vx v23, v9, t1
        vslideup.vx v24, v10, t1
        vslide1down.vx v10, v10, zero
        vslideup.vx v25, v10, t1
        vslide1down.vx v10, v10, zero
        vslideup.vx v26, v10, t1
        vslide1down.vx v10, v10, zero
        vslideup.vx v27, v10, t1
        vslideup.vx v28, v11, t1
        vslide1down.vx v11, v11, zero
        vslideup.vx v29, v11, t1
        vslide1down.vx v11, v11, zero
        vslideup.vx v30, v11, t1
        vslide1down.vx v11, v11, zero
        vslideup.vx v31, v11, t1
        j partial_block_finished

// a0: input
// a1: input_len
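Aside (not part of the commit): in scalar terms, handle_partial_block zero-pads the trailing partial block and deposits its sixteen little-endian 32-bit words into the lane of v16-v31 that belongs to the last chunk. A rough Rust model, with transposed[w][lane] standing in for word w of vector register v16+w:

    // Rough scalar model of handle_partial_block (illustrative only).
    fn place_partial_block(
        partial: &[u8],                   // the 1..=63 trailing bytes
        full_blocks: usize,               // the old t0: the partial block's lane
        transposed: &mut [[u32; 16]; 16], // [message word][lane], like v16-v31
    ) {
        assert!(!partial.is_empty() && partial.len() < 64);
        let mut padded = [0u8; 64]; // zero padding, as vmv.v.i v8, 0 does
        padded[..partial.len()].copy_from_slice(partial);
        for w in 0..16 {
            let bytes = padded[4 * w..4 * w + 4].try_into().unwrap();
            transposed[w][full_blocks] = u32::from_le_bytes(bytes);
        }
    }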
@@ -886,9 +981,16 @@ blake3_guts_riscv64gcv_hash_blocks:
// a5: aligned+transposed output
.global blake3_guts_riscv64gcv_hash_chunks
blake3_guts_riscv64gcv_hash_chunks:
        // keep the original num_chunks = (input_len+1023)/1024 in a6
        // Save the original num_chunks = (input_len+1023)/1024 in a6.
        addi a6, a1, 1023
        srli a6, a6, 10
        // Track the bytes remaining in the last chunk in a7. The initial value
        // of this is ((input_len - 1) % 1024) + 1. (The input to this function
        // is never empty.) It decrements by 64 with each call to
        // blake3_guts_riscv64gcv_hash_blocks, but not below 0.
        addi a7, a1, -1
        andi a7, a7, 1023
        addi a7, a7, 1
        // broadcast the key to v0-7
        vsetvli zero, a6, e32, m1, ta, ma
        lw t0, 0(a2)
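Aside (not part of the commit): the two initializations above, in Rust. Note that remaining_bytes_in_last_chunk starts at 1024 (not 0) when the last chunk is full:

    // num_chunks and remaining_bytes_in_last_chunk at entry (sketch).
    fn init(input_len: usize) -> (usize, usize) {
        assert!(input_len > 0); // the input is never empty
        let num_chunks = (input_len + 1023) / 1024;
        let remaining = ((input_len - 1) % 1024) + 1;
        (num_chunks, remaining)
    }

    fn main() {
        assert_eq!(init(1024), (1, 1024));
        assert_eq!(init(1025), (2, 1));
        assert_eq!(init(2 * 1024 + 100), (3, 100));
    }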
@@ -4,7 +4,7 @@
//! requires quite a lot of effort, including building Clang from master and building QEMU from a
//! custom branch. Please don't expect this code to be usable on real hardware for some time.

use crate::{CVBytes, Implementation, CHUNK_LEN};
use crate::{CVBytes, Implementation};

// NOTE: Keep this in sync with the same constant in assembly.
pub(crate) const MAX_SIMD_DEGREE: usize = 16;
@@ -28,33 +28,11 @@ extern "C" {
    );
}

unsafe extern "C" fn hash_chunks(
    input: *const u8,
    input_len: usize,
    key: *const CVBytes,
    counter: u64,
    flags: u32,
    transposed_output: *mut u32,
) {
    if input_len % CHUNK_LEN == 0 {
        blake3_guts_riscv64gcv_hash_chunks(
            input,
            input_len,
            key,
            counter,
            flags,
            transposed_output,
        );
    } else {
        crate::portable::hash_chunks(input, input_len, key, counter, flags, transposed_output);
    }
}

pub fn implementation() -> Implementation {
    Implementation::new(
        blake3_guts_riscv64gcv_degree,
        crate::portable::compress,
        hash_chunks,
        blake3_guts_riscv64gcv_hash_chunks,
        blake3_guts_riscv64gcv_hash_parents,
        crate::portable::xof,
        crate::portable::xof_xor,
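Aside (not part of the commit): the predicate the now-removed wrapper used to choose a path. Before this change, any input that wasn't a whole number of chunks fell back to the portable implementation; afterwards the assembly handles partial chunks itself and is passed to Implementation::new directly.

    const CHUNK_LEN: usize = 1024;

    // Dispatch predicate of the old wrapper (illustrative).
    fn took_vector_path(input_len: usize) -> bool {
        input_len % CHUNK_LEN == 0
    }

    fn main() {
        assert!(took_vector_path(16 * 1024));
        assert!(!took_vector_path(16 * 1024 - 1)); // previously portable-only
    }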
@@ -107,8 +107,8 @@ pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) {
    paint_test_input(aligned_input);
    paint_test_input(unaligned_input);
    // Try just below, equal to, and just above every whole number of chunks.
    let mut input_2_lengths = vec![1];
    let mut next_len = CHUNK_LEN;
    let mut input_2_lengths = Vec::new();
    let mut next_len = 2 * CHUNK_LEN;
    loop {
        // 95 is one whole block plus one interesting part of another
        input_2_lengths.push(next_len - 95);
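The rest of this loop is truncated in this view. As a purely hypothetical reconstruction of the "just below, equal to, and just above" pattern (the real test body may differ):

    const CHUNK_LEN: usize = 1024;
    const MAX_SIMD_DEGREE: usize = 16;

    fn input_2_lengths() -> Vec<usize> {
        let mut lengths = Vec::new();
        let mut next_len = 2 * CHUNK_LEN;
        loop {
            // 95 is one whole block plus one interesting part of another
            lengths.push(next_len - 95);
            lengths.push(next_len);
            if next_len == MAX_SIMD_DEGREE * CHUNK_LEN {
                break;
            }
            lengths.push(next_len + 95);
            next_len += CHUNK_LEN;
        }
        lengths
    }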