Mirror of https://github.com/BLAKE3-team/BLAKE3, synced 2024-05-04 10:56:08 +02:00.
remove a mis-optimization that hurt performance for uneven updates
If the total number of chunks hashed so far is e.g. 1, and update() is called with e.g. 8 more chunks, we can't compress all 8 together. We have to break the input up, to make sure that that 1 lone chunk CV gets merged with its proper sibling, and that in general the correct layout of the tree is preserved. What we should do is hash 1-2-4-1 chunks of input, using increasing powers of 2 (with some cleanup at the end). What we were doing was 2-2-2-2 chunks. This was the result of a mistaken optimization that got us stuck with an always-odd number of chunks so far. Fixes https://github.com/BLAKE3-team/BLAKE3/issues/69.
This commit is contained in:
parent
74b5fe9054
commit
8d84cfc0af
|
@ -475,3 +475,22 @@ fn bench_rayon_0512_kib(b: &mut Bencher) {
|
|||
fn bench_rayon_1024_kib(b: &mut Bencher) {
|
||||
bench_rayon(b, 1024 * KIB);
|
||||
}
|
||||
|
||||
// This checks that update() splits up its input in increasing powers of 2, so
|
||||
// that it can recover a high degree of parallelism when the number of bytes
|
||||
// hashed so far is uneven. The performance of this benchmark should be
|
||||
// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
|
||||
// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
|
||||
// performance was less than half.
|
||||
#[bench]
|
||||
fn bench_two_updates(b: &mut Bencher) {
|
||||
let len = 65536;
|
||||
let mut input = RandomInput::new(b, len);
|
||||
b.iter(|| {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
let input = input.get();
|
||||
hasher.update(&input[..1]);
|
||||
hasher.update(&input[1..]);
|
||||
hasher.finalize()
|
||||
});
|
||||
}
|
||||
|
|
20
c/blake3.c
20
c/blake3.c
|
@ -486,16 +486,22 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|||
while (input_len > BLAKE3_CHUNK_LEN) {
|
||||
size_t subtree_len = round_down_to_power_of_2(input_len);
|
||||
uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
|
||||
// Shrink the subtree_len until *half of it* evenly divides the count so
|
||||
// far. Why half? Because compress_subtree_to_parent_node will return a
|
||||
// pair of chaining values, each representing half of the input. As long as
|
||||
// those evenly divide the input so far, we're good. We know that
|
||||
// subtree_len itself is a power of 2, so we can use a bitmasking trick
|
||||
// instead of an actual remainder operation. (Note that if the caller
|
||||
// Shrink the subtree_len until it evenly divides the count so far. We know
|
||||
// that subtree_len itself is a power of 2, so we can use a bitmasking
|
||||
// trick instead of an actual remainder operation. (Note that if the caller
|
||||
// consistently passes power-of-2 inputs of the same size, as is hopefully
|
||||
// typical, this loop condition will always fail, and subtree_len will
|
||||
// always be the full length of the input.)
|
||||
while ((((uint64_t)((subtree_len / 2) - 1)) & count_so_far) != 0) {
|
||||
//
|
||||
// An aside: We don't have to shrink subtree_len quite this much. For
|
||||
// example, if count_so_far is 1, we could pass 2 chunks to
|
||||
// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
|
||||
// get the right answer in the end, and we might get to use 2-way SIMD
|
||||
// parallelism. The problem with this optimization, is that it gets us
|
||||
// stuck always hashing 2 chunks. The total number of chunks will remain
|
||||
// odd, and we'll never graduate to higher degrees of parallelism. See
|
||||
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
|
||||
while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
|
||||
subtree_len /= 2;
|
||||
}
|
||||
// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
|
||||
|
|
|
@ -332,3 +332,24 @@ fn bench_incremental_0512_kib(b: &mut Bencher) {
|
|||
fn bench_incremental_1024_kib(b: &mut Bencher) {
|
||||
bench_incremental(b, 1024 * KIB);
|
||||
}
|
||||
|
||||
// This checks that update() splits up its input in increasing powers of 2, so
|
||||
// that it can recover a high degree of parallelism when the number of bytes
|
||||
// hashed so far is uneven. The performance of this benchmark should be
|
||||
// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
|
||||
// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
|
||||
// performance was less than half.
|
||||
#[bench]
|
||||
fn bench_two_updates(b: &mut Bencher) {
|
||||
let len = 65536;
|
||||
let mut input = RandomInput::new(b, len);
|
||||
b.iter(|| {
|
||||
let mut hasher = blake3_c_rust_bindings::Hasher::new();
|
||||
let input = input.get();
|
||||
hasher.update(&input[..1]);
|
||||
hasher.update(&input[1..]);
|
||||
let mut out = [0; 32];
|
||||
hasher.finalize(&mut out);
|
||||
out
|
||||
});
|
||||
}
|
||||
|
|
29
src/lib.rs
29
src/lib.rs
|
@ -1023,17 +1023,24 @@ impl Hasher {
|
|||
debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
|
||||
let mut subtree_len = largest_power_of_two_leq(input.len());
|
||||
let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
|
||||
// Shrink the subtree_len until *half of it* evenly divides the
|
||||
// count so far. Why half? Because compress_subtree_to_parent_node
|
||||
// will return a pair of chaining values, each representing half of
|
||||
// the input. As long as those evenly divide the input so far,
|
||||
// we're good. We know that subtree_len itself is a power of 2, so
|
||||
// we can use a bitmasking trick instead of an actual remainder
|
||||
// operation. (Note that if the caller consistently passes
|
||||
// power-of-2 inputs of the same size, as is hopefully typical,
|
||||
// this loop condition will always fail, and subtree_len will
|
||||
// always be the full length of the input.)
|
||||
while ((subtree_len / 2) - 1) as u64 & count_so_far != 0 {
|
||||
// Shrink the subtree_len until it evenly divides the count so far.
|
||||
// We know that subtree_len itself is a power of 2, so we can use a
|
||||
// bitmasking trick instead of an actual remainder operation. (Note
|
||||
// that if the caller consistently passes power-of-2 inputs of the
|
||||
// same size, as is hopefully typical, this loop condition will
|
||||
// always fail, and subtree_len will always be the full length of
|
||||
// the input.)
|
||||
//
|
||||
// An aside: We don't have to shrink subtree_len quite this much.
|
||||
// For example, if count_so_far is 1, we could pass 2 chunks to
|
||||
// compress_subtree_to_parent_node. Since we'll get 2 CVs back,
|
||||
// we'll still get the right answer in the end, and we might get to
|
||||
// use 2-way SIMD parallelism. The problem with this optimization,
|
||||
// is that it gets us stuck always hashing 2 chunks. The total
|
||||
// number of chunks will remain odd, and we'll never graduate to
|
||||
// higher degrees of parallelism. See
|
||||
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
|
||||
while (subtree_len - 1) as u64 & count_so_far != 0 {
|
||||
subtree_len /= 2;
|
||||
}
|
||||
// The shrunken subtree_len might now be 1 chunk long. If so, hash
|
||||
|
|
Loading…
Reference in New Issue