1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-27 12:46:02 +02:00

kernel2::parents_16

This commit is contained in:
Jack O'Connor 2022-12-17 12:37:24 -08:00
parent c9c63d54dc
commit 0c80427419
3 changed files with 149 additions and 9 deletions

View File

@ -1,4 +1,5 @@
#![feature(test)]
#![feature(stdsimd)]
extern crate test;
@ -8,6 +9,7 @@ use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
use blake3::platform::{Platform, MAX_SIMD_DEGREE};
use blake3::OUT_LEN;
use rand::prelude::*;
use std::mem;
use test::Bencher;
const KIB: usize = 1024;
@ -293,8 +295,8 @@ fn bench_many_parents_kernel(b: &mut Bencher) {
return;
}
use blake3::kernel::Words16;
let size = 16 * std::mem::size_of::<Words16>();
let alignment = std::mem::align_of::<Words16>();
let size = 16 * mem::size_of::<Words16>();
let alignment = mem::align_of::<Words16>();
assert_eq!(alignment, 64);
let mut input = RandomInput::new_aligned(b, size, alignment);
for _ in 0..100 {
@ -676,7 +678,7 @@ fn bench_chunks16_kernel2(b: &mut Bencher) {
return;
}
let mut input = RandomInput::new(b, 16 * CHUNK_LEN);
let key_words = [0; 8];
let key_words = [42; 8];
let counter = 0;
let flags = 0;
b.iter(|| unsafe {
@ -685,6 +687,33 @@ fn bench_chunks16_kernel2(b: &mut Bencher) {
});
}
#[bench]
fn bench_parents16_kernel2(b: &mut Bencher) {
if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
return;
}
b.bytes = 16 * BLOCK_LEN as u64;
let mut random_parent_bytes = [0; 16 * 64];
let mut rng = rand::thread_rng();
rng.fill_bytes(&mut random_parent_bytes);
let left_parent_bytes: [u8; 16 * 32] = random_parent_bytes[..16 * 32].try_into().unwrap();
let right_parent_bytes: [u8; 16 * 32] = random_parent_bytes[16 * 32..].try_into().unwrap();
let left_parent_vectors: [core::arch::x86_64::__m512i; 8] =
unsafe { mem::transmute(left_parent_bytes) };
let right_parent_vectors: [core::arch::x86_64::__m512i; 8] =
unsafe { mem::transmute(right_parent_bytes) };
let key_words = [42; 8];
let flags = 0;
b.iter(|| unsafe {
blake3::kernel2::parents_16(
&left_parent_vectors,
&right_parent_vectors,
&key_words,
flags,
);
});
}
#[bench]
fn bench_xof_kernel2(b: &mut Bencher) {
if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {

View File

@ -2448,7 +2448,7 @@ global_asm!(
"BLAKE3_AVX512_ODD_INDEXES:",
".long 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31",
"blake3_avx512_parents_16:",
// The first 8 out of 16 input message vectors, which are the transposed CVs of the first 8
// The first 8 out of 16 input message vectors, which are the transposed CVs of the first 16
// children, come in looking like this:
//
// a0, b0, c0, d0, e0, f0, g0, h0, i0, j0, k0, l0, m0, n0, o0, p0

View File

@ -1115,12 +1115,123 @@ fn test_chunks_16() {
#[target_feature(enable = "avx512f,avx512vl")]
pub unsafe fn parents_16(
_left_children: &[__m512i; 8],
_right_children: &[__m512i; 8],
_key: &[u32; 8],
_flags: u32,
left_children: &[__m512i; 8],
right_children: &[__m512i; 8],
key: &[u32; 8],
flags: u32,
) -> [__m512i; 8] {
todo!();
let even_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
let odd_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
let mut state_regs = [
_mm512_set1_epi32(key[0] as i32),
_mm512_set1_epi32(key[1] as i32),
_mm512_set1_epi32(key[2] as i32),
_mm512_set1_epi32(key[3] as i32),
_mm512_set1_epi32(key[4] as i32),
_mm512_set1_epi32(key[5] as i32),
_mm512_set1_epi32(key[6] as i32),
_mm512_set1_epi32(key[7] as i32),
];
asm!(
"call blake3_avx512_kernel2_16",
inout("zmm0") state_regs[0],
inout("zmm1") state_regs[1],
inout("zmm2") state_regs[2],
inout("zmm3") state_regs[3],
inout("zmm4") state_regs[4],
inout("zmm5") state_regs[5],
inout("zmm6") state_regs[6],
inout("zmm7") state_regs[7],
in("zmm8") _mm512_set1_epi32(IV[0] as i32),
in("zmm9") _mm512_set1_epi32(IV[1] as i32),
in("zmm10") _mm512_set1_epi32(IV[2] as i32),
in("zmm11") _mm512_set1_epi32(IV[3] as i32),
in("zmm12") _mm512_set1_epi32(0),
in("zmm13") _mm512_set1_epi32(0),
in("zmm14") _mm512_set1_epi32(BLOCK_LEN as i32),
in("zmm15") _mm512_set1_epi32((flags | crate::PARENT as u32) as i32),
in("zmm16") _mm512_permutex2var_epi32(left_children[0], even_indexes, right_children[0]),
in("zmm17") _mm512_permutex2var_epi32(left_children[1], even_indexes, right_children[1]),
in("zmm18") _mm512_permutex2var_epi32(left_children[2], even_indexes, right_children[2]),
in("zmm19") _mm512_permutex2var_epi32(left_children[3], even_indexes, right_children[3]),
in("zmm20") _mm512_permutex2var_epi32(left_children[4], even_indexes, right_children[4]),
in("zmm21") _mm512_permutex2var_epi32(left_children[5], even_indexes, right_children[5]),
in("zmm22") _mm512_permutex2var_epi32(left_children[6], even_indexes, right_children[6]),
in("zmm23") _mm512_permutex2var_epi32(left_children[7], even_indexes, right_children[7]),
in("zmm24") _mm512_permutex2var_epi32(left_children[0], odd_indexes, right_children[0]),
in("zmm25") _mm512_permutex2var_epi32(left_children[1], odd_indexes, right_children[1]),
in("zmm26") _mm512_permutex2var_epi32(left_children[2], odd_indexes, right_children[2]),
in("zmm27") _mm512_permutex2var_epi32(left_children[3], odd_indexes, right_children[3]),
in("zmm28") _mm512_permutex2var_epi32(left_children[4], odd_indexes, right_children[4]),
in("zmm29") _mm512_permutex2var_epi32(left_children[5], odd_indexes, right_children[5]),
in("zmm30") _mm512_permutex2var_epi32(left_children[6], odd_indexes, right_children[6]),
in("zmm31") _mm512_permutex2var_epi32(left_children[7], odd_indexes, right_children[7]),
);
state_regs
}
#[test]
fn test_parents_16() {
if !crate::platform::avx512_detected() {
return;
}
// the (untransposed) bytes of 32 concatenated child CVs
let mut child_bytes = [0; 32 * 32];
crate::test::paint_test_input(&mut child_bytes);
// the same bytes, reinterpreted as words
let child_words: [u32; 32 * 8] = core::array::from_fn(|word| {
u32::from_le_bytes(child_bytes[4 * word..][..4].try_into().unwrap())
});
// manually transpose the words into vector layout
let mut left_child_vecs = [[0u32; 16]; 8];
for cv in 0..16 {
for word in 0..8 {
left_child_vecs[word][cv] = child_words[8 * cv + word];
}
}
let mut right_child_vecs = [[0u32; 16]; 8];
for cv in 0..16 {
for word in 0..8 {
right_child_vecs[word][cv] = child_words[8 * (16 + cv) + word];
}
}
let left_children: [__m512i; 8] = unsafe { mem::transmute(left_child_vecs) };
let right_children: [__m512i; 8] = unsafe { mem::transmute(right_child_vecs) };
let key = [42, 43, 44, 45, 46, 47, 48, 49];
let outputs = unsafe {
parents_16(
&left_children,
&right_children,
&key,
crate::KEYED_HASH as u32,
)
};
let output_words: [[u32; 16]; 8] = unsafe { mem::transmute(outputs) };
let mut untransposed_output_words = [[0; 8]; 16];
for vec in 0..8 {
for word in 0..16 {
untransposed_output_words[word][vec] = output_words[vec][word];
}
}
let untransposed_output_bytes: [u8; 16 * 32] =
unsafe { mem::transmute(untransposed_output_words) };
let child_blocks: [&[u8; 64]; 16] =
core::array::from_fn(|block| child_bytes[64 * block..][..64].try_into().unwrap());
let mut expected = [0u8; 16 * 32];
crate::portable::hash_many(
&child_blocks,
&key,
0,
crate::IncrementCounter::No,
crate::PARENT | crate::KEYED_HASH,
0,
0,
&mut expected,
);
assert_eq!(expected, untransposed_output_bytes);
}
#[target_feature(enable = "avx512f,avx512vl")]