1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-26 15:56:02 +02:00

AVX-512 xof_xor

This commit is contained in:
Jack O'Connor 2023-07-16 20:24:10 -07:00
parent 7e85ceac47
commit e3171426bc
2 changed files with 76 additions and 6 deletions

View File

@ -25,6 +25,8 @@
.global _blake3_guts_avx512_compress_xof
.global blake3_guts_avx512_xof_16
.global _blake3_guts_avx512_xof_16
.global blake3_guts_avx512_xof_xor_16
.global _blake3_guts_avx512_xof_xor_16
#ifdef __APPLE__
.text
@ -3543,8 +3545,8 @@ blake3_guts_kernel_16_avx512:
// r8d: flags
// r9: out pointer
.p2align 6
_blake3_guts_avx512_xof_16:
blake3_guts_avx512_xof_16:
_blake3_guts_avx512_xof_inner_16:
blake3_guts_avx512_xof_inner_16:
// broadcast the block words
vpbroadcastd zmm16,DWORD PTR [rdi]
vpbroadcastd zmm17,DWORD PTR [rdi+0x4]
@ -3670,8 +3672,18 @@ blake3_guts_avx512_xof_16:
vshufi32x4 zmm13,zmm21,zmm29,0xdd
vshufi32x4 zmm14,zmm22,zmm30,0xdd
vshufi32x4 zmm15,zmm23,zmm31,0xdd
ret
// write out the untransposed state
// rdi: block pointer
// esi: block_len
// rdx: cv
// rcx: counter
// r8d: flags
// r9: out pointer
.p2align 6
_blake3_guts_avx512_xof_16:
blake3_guts_avx512_xof_16:
call blake3_guts_avx512_xof_inner_16
vmovdqu32 ZMMWORD PTR [r9],zmm0
vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1
vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2
@ -3690,6 +3702,50 @@ blake3_guts_avx512_xof_16:
vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15
ret
// XOR variant of blake3_guts_avx512_xof_16: instead of overwriting the output
// buffer, it XORs the 16 result vectors (zmm0-zmm15, 64 bytes each, 0x400
// bytes total) into the bytes already stored at the out pointer, in place.
//
// rdi: block pointer
// esi: block_len
// rdx: cv
// rcx: counter
// r8d: flags
// r9: out pointer (both read and written: 0x400 bytes starting at r9)
.p2align 6
// Two labels for the same entry point, matching the two .global names
// (presumably so the symbol resolves with or without a leading-underscore
// C symbol prefix -- confirm against the build targets).
_blake3_guts_avx512_xof_xor_16:
blake3_guts_avx512_xof_xor_16:
// Shared computation, also called by blake3_guts_avx512_xof_16. The code
// below relies on it returning the results in zmm0-zmm15 and on r9 still
// holding the out pointer afterwards.
call blake3_guts_avx512_xof_inner_16
// For each result vector N: XOR it with the 64 bytes currently at
// [r9 + N*0x40], then store the combined value back to the same location.
// vmovdqu32 is the unaligned store form, so no alignment of r9 is assumed here.
vpxord zmm0, zmm0, ZMMWORD PTR [r9]
vmovdqu32 ZMMWORD PTR [r9],zmm0
vpxord zmm1, zmm1, ZMMWORD PTR [r9+0x40]
vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1
vpxord zmm2, zmm2, ZMMWORD PTR [r9+0x80]
vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2
vpxord zmm3, zmm3, ZMMWORD PTR [r9+0xc0]
vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3
vpxord zmm4, zmm4, ZMMWORD PTR [r9+0x100]
vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4
vpxord zmm5, zmm5, ZMMWORD PTR [r9+0x140]
vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5
vpxord zmm6, zmm6, ZMMWORD PTR [r9+0x180]
vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6
vpxord zmm7, zmm7, ZMMWORD PTR [r9+0x1c0]
vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7
vpxord zmm8, zmm8, ZMMWORD PTR [r9+0x200]
vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8
vpxord zmm9, zmm9, ZMMWORD PTR [r9+0x240]
vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9
vpxord zmm10, zmm10, ZMMWORD PTR [r9+0x280]
vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10
vpxord zmm11, zmm11, ZMMWORD PTR [r9+0x2c0]
vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11
vpxord zmm12, zmm12, ZMMWORD PTR [r9+0x300]
vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12
vpxord zmm13, zmm13, ZMMWORD PTR [r9+0x340]
vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13
vpxord zmm14, zmm14, ZMMWORD PTR [r9+0x380]
vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14
vpxord zmm15, zmm15, ZMMWORD PTR [r9+0x3c0]
vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15
ret
#ifdef __APPLE__
.static_data
#else

View File

@ -27,6 +27,14 @@ extern "C" {
flags: u32,
out: *mut u8,
);
/// Hand-written AVX-512 assembly routine that XORs extended output into `out` in place.
///
/// The assembly loads, XORs and stores back 16 vectors of 64 bytes each
/// (0x400 bytes in total) starting at `out`; the existing contents of `out`
/// are combined with the new output rather than overwritten.
///
/// The arguments map to the registers documented on the assembly side, in
/// order: `block` (rdi), `block_len` (esi), `cv` (rdx), `counter` (rcx),
/// `flags` (r8d), `out` (r9).
///
/// # Safety
///
/// `out` must be valid for both reads and writes of 0x400 bytes. `block` and
/// `cv` must be valid for reads. NOTE(review): the routine uses AVX-512
/// instructions, so the caller is presumably responsible for checking CPU
/// support first -- confirm where that check lives.
fn blake3_guts_avx512_xof_xor_16(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
);
}
unsafe extern "C" fn hash_chunks(
@ -96,11 +104,17 @@ unsafe extern "C" fn xof_xor(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
mut counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
mut out: *mut u8,
mut out_len: usize,
) {
while out_len >= 16 * BLOCK_LEN {
blake3_guts_avx512_xof_xor_16(block, block_len, cv, counter, flags, out);
counter += 16;
out = out.add(16 * BLOCK_LEN);
out_len -= 16 * BLOCK_LEN;
}
crate::xof_xor_using_compress_xof(
blake3_guts_avx512_compress_xof,
block,