diff --git a/benches/bench.rs b/benches/bench.rs
index 08aaf63..6726aa6 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -705,3 +705,11 @@ fn bench_xof_xor_kernel2(b: &mut Bencher) {
         );
     });
 }
+
+#[bench]
+fn bench_just_kernel2(b: &mut Bencher) {
+    b.bytes = 16 * 64;
+    b.iter(|| unsafe {
+        blake3::kernel2::just_kernel2();
+    });
+}
diff --git a/src/kernel2.rs b/src/kernel2.rs
index 33558de..cbc9818 100644
--- a/src/kernel2.rs
+++ b/src/kernel2.rs
@@ -815,8 +815,7 @@ global_asm!(
     "vprord zmm6, zmm6, 7",
     "vprord zmm7, zmm7, 7",
     "vprord zmm4, zmm4, 7",
-    // lower half final xors
-    // NOTE: upper half final xors done by XOF callers
+    // final xors
     "vpxord zmm0, zmm0, zmm8",
     "vpxord zmm1, zmm1, zmm9",
     "vpxord zmm2, zmm2, zmm10",
@@ -983,6 +982,59 @@ unsafe fn incrementing_counter(initial_value: u64) -> (__m512i, __m512i) {
     (low_words, high_words)
 }
 
+/// Calls `blake3_avx512_kernel2_16` once with every ZMM register zeroed and
+/// discards whatever the kernel leaves behind. Used by `bench_just_kernel2`.
+///
+/// # Safety
+///
+/// The CPU must support the `avx512f` and `avx512vl` target features.
+#[target_feature(enable = "avx512f,avx512vl")]
+pub unsafe fn just_kernel2() {
+    let zero = _mm512_set1_epi32(0);
+    // The kernel writes to its state registers (e.g. its final xors store
+    // into zmm0..), and asm! requires any register not declared as an output
+    // to hold its entry value on exit. Declare every register as
+    // `inout(..) => _` so the compiler re-materializes the zeros before each
+    // call instead of assuming they survived the previous one.
+    // NOTE(review): zmm16-zmm31 are clobbered conservatively; confirm whether
+    // the kernel also touches general-purpose or mask registers.
+    asm!(
+        "call blake3_avx512_kernel2_16",
+        inout("zmm0") zero => _,
+        inout("zmm1") zero => _,
+        inout("zmm2") zero => _,
+        inout("zmm3") zero => _,
+        inout("zmm4") zero => _,
+        inout("zmm5") zero => _,
+        inout("zmm6") zero => _,
+        inout("zmm7") zero => _,
+        inout("zmm8") zero => _,
+        inout("zmm9") zero => _,
+        inout("zmm10") zero => _,
+        inout("zmm11") zero => _,
+        inout("zmm12") zero => _,
+        inout("zmm13") zero => _,
+        inout("zmm14") zero => _,
+        inout("zmm15") zero => _,
+        inout("zmm16") zero => _,
+        inout("zmm17") zero => _,
+        inout("zmm18") zero => _,
+        inout("zmm19") zero => _,
+        inout("zmm20") zero => _,
+        inout("zmm21") zero => _,
+        inout("zmm22") zero => _,
+        inout("zmm23") zero => _,
+        inout("zmm24") zero => _,
+        inout("zmm25") zero => _,
+        inout("zmm26") zero => _,
+        inout("zmm27") zero => _,
+        inout("zmm28") zero => _,
+        inout("zmm29") zero => _,
+        inout("zmm30") zero => _,
+        inout("zmm31") zero => _,
+    );
+}
+
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
 unsafe fn xof_inner_16(