
add support for partial chunks

Jack O'Connor 2023-08-14 17:31:19 +08:00
parent 207b94c34e
commit 459af9529e
4 changed files with 165 additions and 40 deletions


@@ -185,6 +185,45 @@ impl Implementation {
out
}
// The contract for HashChunksFn doesn't require the implementation to support single-chunk
// inputs. Instead we handle that case here by calling compress in a loop.
#[inline]
fn hash_one_chunk(
&self,
mut input: &[u8],
key: &CVBytes,
counter: u64,
mut flags: u32,
output: TransposedSplit,
) {
debug_assert!(input.len() <= CHUNK_LEN);
let mut cv = *key;
flags |= CHUNK_START;
while input.len() > BLOCK_LEN {
cv = self.compress(
input[..BLOCK_LEN].try_into().unwrap(),
BLOCK_LEN as u32,
&cv,
counter,
flags,
);
input = &input[BLOCK_LEN..];
flags &= !CHUNK_START;
}
let mut final_block = [0u8; BLOCK_LEN];
final_block[..input.len()].copy_from_slice(input);
cv = self.compress(
&final_block,
input.len() as u32,
&cv,
counter,
flags | CHUNK_END,
);
unsafe {
write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr);
}
}
#[inline]
fn hash_chunks_fn(&self) -> HashChunksFn {
unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) }
@@ -199,8 +238,14 @@ impl Implementation {
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(input.len() > 0);
debug_assert!(input.len() <= self.degree() * CHUNK_LEN);
if input.len() <= CHUNK_LEN {
// The underlying hash_chunks_fn isn't required to support this case. Instead we handle
// it by calling compress_fn in a loop. But note that we still don't support root
// finalization or the empty input here.
self.hash_one_chunk(input, key, counter, flags, transposed_output);
return 1;
}
// SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently
// ignore the remainder. This makes it impossible to write out of bounds in a properly
// constructed TransposedSplit.
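
For orientation, here is a minimal Rust sketch (separate from the diff) of the per-block lengths and flags that hash_one_chunk above feeds to compress for a single, possibly partial chunk. It assumes the standard BLAKE3 flag values CHUNK_START = 1 and CHUNK_END = 2 (the latter matches the assembly comment further down), with BLOCK_LEN = 64 and CHUNK_LEN = 1024.

const BLOCK_LEN: usize = 64;
const CHUNK_LEN: usize = 1024;
const CHUNK_START: u32 = 1 << 0; // assumed standard BLAKE3 flag value
const CHUNK_END: u32 = 1 << 1; // assumed standard BLAKE3 flag value

// Returns the (block_len, flags) pairs a single chunk is compressed with,
// mirroring the loop in hash_one_chunk above.
fn block_schedule(chunk_len: usize) -> Vec<(usize, u32)> {
    assert!(chunk_len <= CHUNK_LEN);
    let mut blocks = Vec::new();
    let mut remaining = chunk_len;
    let mut flags = CHUNK_START;
    while remaining > BLOCK_LEN {
        blocks.push((BLOCK_LEN, flags));
        remaining -= BLOCK_LEN;
        flags &= !CHUNK_START;
    }
    // The final block is zero-padded to 64 bytes but keeps its true length.
    blocks.push((remaining, flags | CHUNK_END));
    blocks
}

For example, block_schedule(150) yields [(64, CHUNK_START), (64, 0), (22, CHUNK_END)], and block_schedule(64) yields [(64, CHUNK_START | CHUNK_END)].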


@@ -827,24 +827,32 @@ blake3_guts_riscv64gcv_kernel:
// a3: counter
// a4: flags
// a5: aligned+transposed output [unused]
// a6: total chunks [unused]
// a7: remaining_bytes_in_last_chunk
blake3_guts_riscv64gcv_hash_blocks:
// t0 = full_blocks = (input_len + 1024 - 64) / 1024
// t0 := full_blocks := (input_len + 1024 - 64) / 1024
addi t0, a1, 1024 - 64
srli t0, t0, 10
// Load and transpose full message blocks.
// NOTE: If the final chunk is short, this could be 1 less than the
// total number of chunks, in which case this setup code and the kernel
// will leave a CV word undisturbed in each of v0-v7.
addi t0, a1, 1024 - 64
srli t0, t0, 10
vsetvli zero, t0, e32, m1, ta, ma
li t1, 1024
// NOTE: These loads could be misaligned. As far as I know, the Linux
// RISC-V ABI allows misaligned loads and stores. If we need to support
// an environment that doesn't allow them (or where they're
// unacceptably slow), we could add a fallback here.
vsetvli zero, t0, e32, m1, ta, ma
li t1, 1024
addi t2, a0, 32
vlsseg8e32.v v16, (a0), t1
addi a0, a0, 32
vlsseg8e32.v v24, (a0), t1
addi a0, a0, 32
addi a1, a1, -64
vlsseg8e32.v v24, (t2), t1
// If remaining_bytes_in_last_chunk is in 1..=63, there's a partial block
// at the end. Handle it out-of-line. If we take this branch, it will
// increment t0 by 1.
addi t1, a7, -1
li t2, 63
bltu t1, t2, handle_partial_block
partial_block_finished:
// load the counter
vsetvli zero, t0, e64, m2, ta, ma
vmv.v.x v8, a3
@@ -857,12 +865,26 @@ blake3_guts_riscv64gcv_hash_blocks:
vncvt.x.x.w v12, v8
li t1, 32
vnsrl.wx v13, v8, t1
// broadcast the block length
// TODO: handle partial blocks
// Broadcast the block length, then overwrite the last block's length
// to be ((min(64, remaining_bytes_in_last_chunk) - 1) % 64) + 1. That
// is: 64 if remaining_bytes_in_last_chunk >= 64
// else 64 if remaining_bytes_in_last_chunk is 0
// else remaining_bytes_in_last_chunk
li t1, 64
vmv.v.x v14, t1
// broadcast the flags
minu t1, t1, a7
addi t1, t1, -1
andi t1, t1, 63
addi t1, t1, 1
vslide1down.vx v14, v14, t1
// Broadcast the flags, then set CHUNK_END in the last block's flags if
// remaining_bytes_in_last_chunk is in 1..=64.
vmv.v.x v15, a4
addi t1, a7, -1
sltiu t1, t1, 64
slli t1, t1, 1 // CHUNK_END = 2
or t1, t1, a4
vslide1down.vx v15, v15, t1
// execute the kernel
mv t6, ra
call blake3_guts_riscv64gcv_kernel
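
To make the bit tricks above easier to check, here is a small Rust sketch (separate from the diff) of the same last-block length and flag selection, assuming CHUNK_END = 2 as noted in the assembly.

fn last_block_len(remaining_bytes_in_last_chunk: u64) -> u32 {
    // ((min(64, remaining) - 1) % 64) + 1, using wrapping arithmetic so that
    // remaining == 0 also yields 64, matching the minu/addi/andi sequence above.
    ((remaining_bytes_in_last_chunk.min(64).wrapping_sub(1) & 63) + 1) as u32
}

fn last_block_flags(flags: u32, remaining_bytes_in_last_chunk: u64) -> u32 {
    const CHUNK_END: u32 = 2;
    // CHUNK_END is set only when the last block of the last chunk is part of
    // this batch, i.e. remaining_bytes_in_last_chunk is in 1..=64.
    if (1u64..=64).contains(&remaining_bytes_in_last_chunk) {
        flags | CHUNK_END
    } else {
        flags
    }
}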
@@ -876,7 +898,80 @@ blake3_guts_riscv64gcv_hash_blocks:
vxor.vv v5, v5, v13
vxor.vv v6, v6, v14
vxor.vv v7, v7, v15
// Advance the input pointer and decrement input_len and
// remaining_bytes_in_last_chunk (which cannot go below zero).
addi a0, a0, 64
addi a1, a1, -64
addi a7, a7, -64
max a7, a7, zero
ret
handle_partial_block:
// The minimum VLEN is 128 bits, so we're guaranteed to be able to fit
// the block in v8-v11 with LMUL=4. Write 64 zero bytes before the
// load, so that the partial block is zero-padded.
li t1, 64
vsetvli zero, t1, e8, m4, ta, ma
vmv.v.i v8, 0
add t2, a0, a1
sub t2, t2, a7
vsetvli zero, a7, e8, m4, ta, ma
vle8.v v8, (t2)
// If VLEN is longer than 128 bits (16 bytes), then half or all of the
// block bytes will be in v8. Make sure they're split evenly across
// v8-v11.
csrr t1, vlenb
li t2, 64
bltu t1, t2, vlenb_less_than_64
vsetivli zero, 8, e32, m1, ta, ma
vslidedown.vi v9, v8, 8
vlenb_less_than_64:
li t2, 32
bltu t1, t2, vlenb_less_than_32
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.v v10, v9
vslidedown.vi v11, v9, 4
vslidedown.vi v9, v8, 4
vlenb_less_than_32:
// Shift each of the words of the padded partial block to the end of
// the corresponding message vector. t0 was previously the number of
// full blocks. Now we increment it, so that it's the number of all
// blocks (both full and partial).
mv t1, t0
addi t0, t0, 1
// Set vl to at least 4, because v8-v11 each have 4 message words.
// Setting vl shorter will make vslide1down clobber those words.
li t2, 4
maxu t2, t0, t2
vsetvli zero, t2, e32, m1, ta, ma
vslideup.vx v16, v8, t1
vslide1down.vx v8, v8, zero
vslideup.vx v17, v8, t1
vslide1down.vx v8, v8, zero
vslideup.vx v18, v8, t1
vslide1down.vx v8, v8, zero
vslideup.vx v19, v8, t1
vslideup.vx v20, v9, t1
vslide1down.vx v9, v9, zero
vslideup.vx v21, v9, t1
vslide1down.vx v9, v9, zero
vslideup.vx v22, v9, t1
vslide1down.vx v9, v9, zero
vslideup.vx v23, v9, t1
vslideup.vx v24, v10, t1
vslide1down.vx v10, v10, zero
vslideup.vx v25, v10, t1
vslide1down.vx v10, v10, zero
vslideup.vx v26, v10, t1
vslide1down.vx v10, v10, zero
vslideup.vx v27, v10, t1
vslideup.vx v28, v11, t1
vslide1down.vx v11, v11, zero
vslideup.vx v29, v11, t1
vslide1down.vx v11, v11, zero
vslideup.vx v30, v11, t1
vslide1down.vx v11, v11, zero
vslideup.vx v31, v11, t1
j partial_block_finished
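As a rough Rust model of what handle_partial_block accomplishes (the names and array shapes below are illustrative, not taken from the crate): the trailing 1..=63 input bytes are zero-padded to a full 64-byte block, and its 16 little-endian words are written into one extra column of the transposed message words, at index full_blocks.

const MAX_BLOCKS: usize = 16; // one block per chunk, up to MAX_SIMD_DEGREE

fn append_partial_block(
    transposed_msg: &mut [[u32; MAX_BLOCKS]; 16],
    full_blocks: usize,
    partial: &[u8], // the 1..=63 trailing bytes
) {
    let mut padded = [0u8; 64];
    padded[..partial.len()].copy_from_slice(partial);
    for word in 0..16 {
        let bytes: [u8; 4] = padded[4 * word..][..4].try_into().unwrap();
        transposed_msg[word][full_blocks] = u32::from_le_bytes(bytes);
    }
}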
// a0: input
// a1: input_len
@@ -886,9 +981,16 @@ blake3_guts_riscv64gcv_hash_blocks:
// a5: aligned+transposed output
.global blake3_guts_riscv64gcv_hash_chunks
blake3_guts_riscv64gcv_hash_chunks:
// keep the original num_chunks = (input_len+1023)/1024 in a6
// Save the original num_chunks = (input_len+1023)/1024 in a6.
addi a6, a1, 1023
srli a6, a6, 10
// Track the bytes remaining in the last chunk in a7. The initial value
// of this is ((input_len - 1) % 1024) + 1. (The input to this function
// is never empty.) It decrements by 64 with each call to
// blake3_guts_riscv64gcv_hash_blocks, but not below 0.
addi a7, a1, -1
andi a7, a7, 1023
addi a7, a7, 1
// broadcast the key to v0-7
vsetvli zero, a6, e32, m1, ta, ma
lw t0, 0(a2)
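
For reference, the remaining_bytes_in_last_chunk bookkeeping described above, written out as a Rust sketch (separate from the diff):

fn initial_remaining_bytes_in_last_chunk(input_len: usize) -> usize {
    debug_assert!(input_len > 0);
    // ((input_len - 1) % 1024) + 1: always in 1..=1024, and exactly 1024 when
    // the last chunk is full.
    ((input_len - 1) & 1023) + 1
}

fn decremented_remaining_bytes_in_last_chunk(remaining: usize) -> usize {
    // Each hash_blocks call consumes one 64-byte block per chunk, so the
    // counter drops by 64, but never below zero.
    remaining.saturating_sub(64)
}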


@@ -4,7 +4,7 @@
//! requires quite a lot of effort, including building Clang from master and building QEMU from a
//! custom branch. Please don't expect this code to be usable on real hardware for some time.
use crate::{CVBytes, Implementation, CHUNK_LEN};
use crate::{CVBytes, Implementation};
// NOTE: Keep this in sync with the same constant in assembly.
pub(crate) const MAX_SIMD_DEGREE: usize = 16;
@@ -28,33 +28,11 @@ extern "C" {
);
}
unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
if input_len % CHUNK_LEN == 0 {
blake3_guts_riscv64gcv_hash_chunks(
input,
input_len,
key,
counter,
flags,
transposed_output,
);
} else {
crate::portable::hash_chunks(input, input_len, key, counter, flags, transposed_output);
}
}
pub fn implementation() -> Implementation {
Implementation::new(
blake3_guts_riscv64gcv_degree,
crate::portable::compress,
hash_chunks,
blake3_guts_riscv64gcv_hash_chunks,
blake3_guts_riscv64gcv_hash_parents,
crate::portable::xof,
crate::portable::xof_xor,


@@ -107,8 +107,8 @@ pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) {
paint_test_input(aligned_input);
paint_test_input(unaligned_input);
// Try just below, equal to, and just above every whole number of chunks.
let mut input_2_lengths = vec![1];
let mut next_len = CHUNK_LEN;
let mut input_2_lengths = Vec::new();
let mut next_len = 2 * CHUNK_LEN;
loop {
// 95 is one whole block plus one interesting part of another
input_2_lengths.push(next_len - 95);