1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-26 15:56:02 +02:00

add AVX-512 compress/compress_xof

This commit is contained in:
Jack O'Connor 2023-07-16 13:09:38 -07:00
parent c30d039a5e
commit 9137475aa6
6 changed files with 437 additions and 23 deletions

View File

@ -99,6 +99,3 @@ rand = "0.8.0"
rand_chacha = "0.3.0"
reference_impl = { path = "./reference_impl" }
hmac = "0.12.0"
[build-dependencies]
cc = "1.0.4"

View File

@ -19,6 +19,10 @@
.global _blake3_compress_in_place_avx512
.global blake3_compress_xof_avx512
.global _blake3_compress_xof_avx512
.global blake3_guts_avx512_compress
.global _blake3_guts_avx512_compress
.global blake3_guts_avx512_compress_xof
.global _blake3_guts_avx512_compress_xof
#ifdef __APPLE__
.text
@ -2553,6 +2557,190 @@ blake3_compress_xof_avx512:
vmovdqu xmmword ptr [r9+0x30], xmm3
ret
// blake3_guts_avx512_compress: compress one 64-byte block and write the
// resulting 32 bytes to `out`.
//
// Arguments arrive in the registers listed next to each parameter:
//
// type CompressFn = unsafe extern "C" fn(
// block: *const BlockBytes, rdi
// block_len: u32, esi
// cv: *const CVBytes, rdx
// counter: u64, rcx
// flags: u32, r8d
// out: *mut CVBytes, r9
// );
//
// State layout during the rounds (four 32-bit lanes per register):
//   xmm0 = cv bytes 0..16        xmm1 = cv bytes 16..32
//   xmm2 = 16 bytes of BLAKE3_IV xmm3 = counter (64 bits), block_len, flags
// Message words live in xmm4..xmm7; xmm8 and xmm9 are temporaries.
// Registers written: rax, r8, xmm0-xmm9 (plus the arithmetic flags).
// vprord/vpxord on xmm registers are the 128-bit EVEX forms, which need
// AVX-512F together with AVX-512VL.
.p2align 6
_blake3_guts_avx512_compress:
blake3_guts_avx512_compress:
// NOTE(review): _CET_ENDBR is a macro defined outside this view; presumably an
// indirect-branch landing pad when CET is enabled -- confirm.
_CET_ENDBR
// Load the 32-byte chaining value (unaligned loads).
vmovdqu xmm0, xmmword ptr [rdx]
vmovdqu xmm1, xmmword ptr [rdx+0x10]
// rax = block_len | (flags << 32). Writing eax zero-extends into rax, and the
// shift discards whatever was in the upper half of r8.
mov eax, esi
shl r8, 32
add rax, r8
// xmm3 = { counter (low qword), block_len | flags << 32 (high qword) }.
vmovq xmm3, rcx
vmovq xmm4, rax
vpunpcklqdq xmm3, xmm3, xmm4
// BLAKE3_IV is defined elsewhere in this file; vmovaps is an aligned load.
vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
// Load the 64-byte block and de-interleave it: shuffle mask 136 (0x88) keeps
// the even-indexed words of the two sources, 221 (0xDD) the odd-indexed ones.
vmovups xmm8, xmmword ptr [rdi]
vmovups xmm9, xmmword ptr [rdi+0x10]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [rdi+0x20]
vmovups xmm9, xmmword ptr [rdi+0x30]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
// Rotate the lanes of the second-half message vectors by one position so they
// line up with the lane-rotated state used in the second half of each round.
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
// al counts the remaining rounds; the loop body below runs 7 times.
mov al, 7
9:
// First half of the round: add message words (xmm4, then xmm5) and mix the
// four rows with add / xor / rotate-right by 16, 12, 8 and 7.
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
// Rotate the lanes of rows xmm0, xmm3 and xmm2 by different amounts so the
// same lane-wise sequence now mixes a different grouping of state words.
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
// Second half of the round: same sequence with message words xmm6 and xmm7.
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
// Undo the lane rotation (0x39 reverses 0x93 and vice versa; 0x4E is its own
// inverse).
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
// Leave the loop after the 7th round; the message shuffle below is only needed
// when another round follows.
dec al
jz 9f
// Rearrange the message words held in xmm4..xmm7 for the next round, using
// xmm8 and xmm9 as scratch.
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
9:
// Fold the lower state rows into the upper ones and store the 32-byte result
// (unaligned stores).
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
vmovdqu xmmword ptr [r9], xmm0
vmovdqu xmmword ptr [r9+0x10], xmm1
ret
// blake3_guts_avx512_compress_xof: compress one 64-byte block and write the
// full 64-byte result to `out`. The round loop is the same as in
// blake3_guts_avx512_compress; only the finalization differs.
//
// Arguments arrive in the registers listed next to each parameter:
//
// type CompressXofFn = unsafe extern "C" fn(
// block: *const BlockBytes, rdi
// block_len: u32, esi
// cv: *const CVBytes, rdx
// counter: u64, rcx
// flags: u32, r8d
// out: *mut BlockBytes, r9
// );
//
// State layout during the rounds (four 32-bit lanes per register):
//   xmm0 = cv bytes 0..16        xmm1 = cv bytes 16..32
//   xmm2 = 16 bytes of BLAKE3_IV xmm3 = counter (64 bits), block_len, flags
// Message words live in xmm4..xmm7; xmm8 and xmm9 are temporaries.
// Registers written: rax, r8, xmm0-xmm9 (plus the arithmetic flags).
// The chaining value at rdx is read again after the rounds, and all of those
// reads happen before the first store to r9.
.p2align 6
_blake3_guts_avx512_compress_xof:
blake3_guts_avx512_compress_xof:
// NOTE(review): _CET_ENDBR is a macro defined outside this view; presumably an
// indirect-branch landing pad when CET is enabled -- confirm.
_CET_ENDBR
// Load the 32-byte chaining value (unaligned loads).
vmovdqu xmm0, xmmword ptr [rdx]
vmovdqu xmm1, xmmword ptr [rdx+0x10]
// rax = block_len | (flags << 32). Writing eax zero-extends into rax, and the
// shift discards whatever was in the upper half of r8.
mov eax, esi
shl r8, 32
add rax, r8
// xmm3 = { counter (low qword), block_len | flags << 32 (high qword) }.
vmovq xmm3, rcx
vmovq xmm4, rax
vpunpcklqdq xmm3, xmm3, xmm4
// BLAKE3_IV is defined elsewhere in this file; vmovaps is an aligned load.
vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
// Load the 64-byte block and de-interleave it: shuffle mask 136 (0x88) keeps
// the even-indexed words of the two sources, 221 (0xDD) the odd-indexed ones.
vmovups xmm8, xmmword ptr [rdi]
vmovups xmm9, xmmword ptr [rdi+0x10]
vshufps xmm4, xmm8, xmm9, 136
vshufps xmm5, xmm8, xmm9, 221
vmovups xmm8, xmmword ptr [rdi+0x20]
vmovups xmm9, xmmword ptr [rdi+0x30]
vshufps xmm6, xmm8, xmm9, 136
vshufps xmm7, xmm8, xmm9, 221
// Rotate the lanes of the second-half message vectors by one position so they
// line up with the lane-rotated state used in the second half of each round.
vpshufd xmm6, xmm6, 0x93
vpshufd xmm7, xmm7, 0x93
// al counts the remaining rounds; the loop body below runs 7 times.
mov al, 7
9:
// First half of the round: add message words (xmm4, then xmm5) and mix the
// four rows with add / xor / rotate-right by 16, 12, 8 and 7.
vpaddd xmm0, xmm0, xmm4
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm5
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
// Rotate the lanes of rows xmm0, xmm3 and xmm2 by different amounts so the
// same lane-wise sequence now mixes a different grouping of state words.
vpshufd xmm0, xmm0, 0x93
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x39
// Second half of the round: same sequence with message words xmm6 and xmm7.
vpaddd xmm0, xmm0, xmm6
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 16
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 12
vpaddd xmm0, xmm0, xmm7
vpaddd xmm0, xmm0, xmm1
vpxord xmm3, xmm3, xmm0
vprord xmm3, xmm3, 8
vpaddd xmm2, xmm2, xmm3
vpxord xmm1, xmm1, xmm2
vprord xmm1, xmm1, 7
// Undo the lane rotation (0x39 reverses 0x93 and vice versa; 0x4E is its own
// inverse).
vpshufd xmm0, xmm0, 0x39
vpshufd xmm3, xmm3, 0x4E
vpshufd xmm2, xmm2, 0x93
// Leave the loop after the 7th round; the message shuffle below is only needed
// when another round follows.
dec al
jz 9f
// Rearrange the message words held in xmm4..xmm7 for the next round, using
// xmm8 and xmm9 as scratch.
vshufps xmm8, xmm4, xmm5, 214
vpshufd xmm9, xmm4, 0x0F
vpshufd xmm4, xmm8, 0x39
vshufps xmm8, xmm6, xmm7, 250
vpblendd xmm9, xmm9, xmm8, 0xAA
vpunpcklqdq xmm8, xmm7, xmm5
vpblendd xmm8, xmm8, xmm6, 0x88
vpshufd xmm8, xmm8, 0x78
vpunpckhdq xmm5, xmm5, xmm7
vpunpckldq xmm6, xmm6, xmm5
vpshufd xmm7, xmm6, 0x1E
vmovdqa xmm5, xmm9
vmovdqa xmm6, xmm8
jmp 9b
9:
// First 32 output bytes: upper rows xor lower rows. The order matters: xmm0
// and xmm1 must consume the pre-feed-forward xmm2/xmm3 before those are
// modified below.
vpxor xmm0, xmm0, xmm2
vpxor xmm1, xmm1, xmm3
// Last 32 output bytes: lower rows xor the original chaining value, re-read
// from memory at rdx.
vpxor xmm2, xmm2, [rdx]
vpxor xmm3, xmm3, [rdx+0x10]
// Store all 64 bytes (unaligned stores).
vmovdqu xmmword ptr [r9], xmm0
vmovdqu xmmword ptr [r9+0x10], xmm1
vmovdqu xmmword ptr [r9+0x20], xmm2
vmovdqu xmmword ptr [r9+0x30], xmm3
ret
#ifdef __APPLE__
.static_data
#else

View File

@ -15,3 +15,10 @@ cfg-if = "1.0.0"
[dev-dependencies]
hex = "0.4.3"
reference_impl = { path = "../../reference_impl" }
[features]
default = ["std"]
std = []
[build-dependencies]
cc = "1.0.79"

View File

@ -148,20 +148,20 @@ fn build_sse2_sse41_avx2_assembly() {
println!("cargo:rustc-cfg=blake3_avx2_ffi");
let mut build = new_build();
if is_windows_msvc() {
build.file("c/blake3_sse2_x86-64_windows_msvc.asm");
build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
build.file("c/blake3_avx2_x86-64_windows_msvc.asm");
build.file("../../c/blake3_sse2_x86-64_windows_msvc.asm");
build.file("../../c/blake3_sse41_x86-64_windows_msvc.asm");
build.file("../../c/blake3_avx2_x86-64_windows_msvc.asm");
} else if is_windows_gnu() {
build.file("c/blake3_sse2_x86-64_windows_gnu.S");
build.file("c/blake3_sse41_x86-64_windows_gnu.S");
build.file("c/blake3_avx2_x86-64_windows_gnu.S");
build.file("../../c/blake3_sse2_x86-64_windows_gnu.S");
build.file("../../c/blake3_sse41_x86-64_windows_gnu.S");
build.file("../../c/blake3_avx2_x86-64_windows_gnu.S");
} else {
// All non-Windows implementations are assumed to support
// Linux-style assembly. These files do contain a small
// explicit workaround for macOS also.
build.file("c/blake3_sse2_x86-64_unix.S");
build.file("c/blake3_sse41_x86-64_unix.S");
build.file("c/blake3_avx2_x86-64_unix.S");
build.file("../../c/blake3_sse2_x86-64_unix.S");
build.file("../../c/blake3_sse41_x86-64_unix.S");
build.file("../../c/blake3_avx2_x86-64_unix.S");
}
build.compile("blake3_sse2_sse41_avx2_assembly");
}
@ -171,7 +171,7 @@ fn build_avx512_c_intrinsics() {
// implementation doesn't support those.
println!("cargo:rustc-cfg=blake3_avx512_ffi");
let mut build = new_build();
build.file("c/blake3_avx512.c");
build.file("../../c/blake3_avx512.c");
if is_windows_msvc() {
build.flag("/arch:AVX512");
} else {
@ -192,15 +192,15 @@ fn build_avx512_assembly() {
println!("cargo:rustc-cfg=blake3_avx512_ffi");
let mut build = new_build();
if is_windows_msvc() {
build.file("c/blake3_avx512_x86-64_windows_msvc.asm");
build.file("../../c/blake3_avx512_x86-64_windows_msvc.asm");
} else {
if is_windows_gnu() {
build.file("c/blake3_avx512_x86-64_windows_gnu.S");
build.file("../../c/blake3_avx512_x86-64_windows_gnu.S");
} else {
// All non-Windows implementations are assumed to support Linux-style
// assembly. These files do contain a small explicit workaround for
// macOS also.
build.file("c/blake3_avx512_x86-64_unix.S");
build.file("../../c/blake3_avx512_x86-64_unix.S");
}
// Older versions of Clang require these flags, even for assembly. See
// https://github.com/BLAKE3-team/BLAKE3/issues/79.
@ -215,7 +215,7 @@ fn build_neon_c_intrinsics() {
// Note that blake3_neon.c normally depends on the blake3_portable.c
// for the single-instance compression function, but we expose
// portable.rs over FFI instead. See ffi_neon.rs.
build.file("c/blake3_neon.c");
build.file("../../c/blake3_neon.c");
// ARMv7 platforms that support NEON generally need the following
// flags. AArch64 supports NEON by default and does not support -mfpu.
if is_armv7() {
@ -225,7 +225,7 @@ fn build_neon_c_intrinsics() {
build.compile("blake3_neon");
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
fn main() {
if is_pure() && is_neon() {
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
}
@ -266,12 +266,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("cargo:rerun-if-env-changed=CFLAGS");
// Ditto for source files, though these shouldn't change as often.
for file in std::fs::read_dir("c")? {
for file in std::fs::read_dir("../../c").unwrap() {
println!(
"cargo:rerun-if-changed={}",
file?.path().to_str().expect("utf-8")
file.unwrap().path().to_str().expect("utf-8")
);
}
Ok(())
}

212
rust/guts/src/avx512.rs Normal file
View File

@ -0,0 +1,212 @@
use crate::{BlockBytes, CVBytes, Implementation};
const DEGREE: usize = 16;
// Single-block compression routines provided by the AVX-512 assembly sources.
// Both take the same inputs and differ only in the size of the output buffer.
extern "C" {
/// Compresses one block and writes the result through `out`.
///
/// The Unix assembly implementation reads 64 bytes from `block` and 32 bytes
/// from `cv`, and stores 32 bytes to `out`.
fn blake3_guts_avx512_compress(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes,
);
/// Compresses one block and writes the extended result through `out`.
///
/// The Unix assembly implementation reads 64 bytes from `block` and 32 bytes
/// from `cv`, and stores 64 bytes to `out`.
fn blake3_guts_avx512_compress_xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes,
);
}
/// AVX-512 entry point for chunk hashing.
///
/// Forwards every argument unchanged to `crate::hash_chunks_using_compress`,
/// supplying `blake3_guts_avx512_compress` as the compression function.
///
/// # Safety
///
/// NOTE(review): the pointer and length requirements are whatever
/// `crate::hash_chunks_using_compress` demands; that helper is not visible
/// here, so confirm the contract there.
unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
crate::hash_chunks_using_compress(
blake3_guts_avx512_compress,
input,
input_len,
key,
counter,
flags,
transposed_output,
)
}
/// AVX-512 entry point for parent-node hashing.
///
/// Forwards every argument unchanged to `crate::hash_parents_using_compress`,
/// supplying `blake3_guts_avx512_compress` as the compression function.
///
/// # Safety
///
/// NOTE(review): the pointer requirements (including the permitted overlap of
/// `transposed_output` with `transposed_input`) are those of
/// `crate::hash_parents_using_compress`, which is not visible here -- confirm
/// there.
unsafe extern "C" fn hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
) {
crate::hash_parents_using_compress(
blake3_guts_avx512_compress,
transposed_input,
num_parents,
key,
flags,
transposed_output,
)
}
/// AVX-512 entry point for extendable output.
///
/// Forwards every argument unchanged to `crate::xof_using_compress_xof`,
/// supplying `blake3_guts_avx512_compress_xof` as the block function.
///
/// # Safety
///
/// NOTE(review): the requirements on `block`, `cv` and the `out`/`out_len`
/// pair are those of `crate::xof_using_compress_xof`, which is not visible
/// here -- confirm there.
unsafe extern "C" fn xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_using_compress_xof(
blake3_guts_avx512_compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
/// AVX-512 entry point for the XOR variant of extendable output.
///
/// Forwards every argument unchanged to `crate::xof_xor_using_compress_xof`,
/// supplying `blake3_guts_avx512_compress_xof` as the block function.
///
/// # Safety
///
/// NOTE(review): the requirements on `block`, `cv` and the `out`/`out_len`
/// pair are those of `crate::xof_xor_using_compress_xof`, which is not visible
/// here -- confirm there.
unsafe extern "C" fn xof_xor(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_xor_using_compress_xof(
blake3_guts_avx512_compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
/// AVX-512 entry point for the universal hash, which produces a 16-byte
/// output.
///
/// Forwards every argument unchanged to `crate::universal_hash_using_compress`,
/// supplying `blake3_guts_avx512_compress` as the compression function.
///
/// # Safety
///
/// NOTE(review): the pointer and length requirements are those of
/// `crate::universal_hash_using_compress`, which is not visible here --
/// confirm there.
unsafe extern "C" fn universal_hash(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
crate::universal_hash_using_compress(
blake3_guts_avx512_compress,
input,
input_len,
key,
counter,
out,
)
}
/// Reports whether the AVX-512 implementation may be used in this process.
///
/// The decision is made in three steps, in this order:
/// 1. the `no_avx512` feature (a testing aid) forces `false`;
/// 2. if both `avx512f` and `avx512vl` are enabled at compile time (for
///    example with `target-cpu=native`), the answer is `true` without any
///    runtime probing;
/// 3. otherwise, when the `std` feature is enabled, the CPU is probed at
///    runtime for the same two features; without `std` the answer is `false`.
fn supported() -> bool {
    let disabled_for_testing = cfg!(feature = "no_avx512");
    let enabled_statically = cfg!(all(target_feature = "avx512f", target_feature = "avx512vl"));

    if disabled_for_testing {
        return false;
    }
    if enabled_statically {
        return true;
    }

    // Runtime detection needs std; without it there is nothing left to try.
    #[cfg(feature = "std")]
    let detected_at_runtime =
        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl");
    #[cfg(not(feature = "std"))]
    let detected_at_runtime = false;

    detected_at_runtime
}
/// Builds the AVX-512 [`Implementation`], or returns `None` when
/// `supported()` reports that AVX-512 cannot be used.
///
/// The returned table reports `DEGREE` as its degree and wires in the
/// assembly `compress` routine together with this module's wrapper functions.
pub fn implementation() -> Option<Implementation> {
    if !supported() {
        return None;
    }
    let avx512 = Implementation::new(
        || DEGREE,
        blake3_guts_avx512_compress,
        hash_chunks,
        hash_parents,
        xof,
        xof_xor,
        universal_hash,
    );
    Some(avx512)
}
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `check` against the AVX-512 implementation, or does nothing when
    /// `implementation()` returns `None` (the test is then skipped silently).
    fn with_avx512(check: impl FnOnce(&crate::Implementation)) {
        if let Some(avx512) = implementation() {
            check(&avx512);
        }
    }

    #[test]
    fn test_compress_vs_portable() {
        with_avx512(|avx512| crate::test::test_compress_vs_portable(avx512));
    }

    #[test]
    fn test_compress_vs_reference() {
        with_avx512(|avx512| crate::test::test_compress_vs_reference(avx512));
    }

    #[test]
    fn test_hash_chunks_vs_portable() {
        with_avx512(|avx512| crate::test::test_hash_chunks_vs_portable(avx512));
    }

    #[test]
    fn test_hash_parents_vs_portable() {
        with_avx512(|avx512| crate::test::test_hash_parents_vs_portable(avx512));
    }

    #[test]
    fn test_chunks_and_parents_vs_reference() {
        with_avx512(|avx512| crate::test::test_chunks_and_parents_vs_reference(avx512));
    }

    #[test]
    fn test_xof_vs_portable() {
        with_avx512(|avx512| crate::test::test_xof_vs_portable(avx512));
    }

    #[test]
    fn test_xof_vs_reference() {
        with_avx512(|avx512| crate::test::test_xof_vs_reference(avx512));
    }

    #[test]
    fn test_universal_hash_vs_portable() {
        with_avx512(|avx512| crate::test::test_universal_hash_vs_portable(avx512));
    }

    #[test]
    fn test_universal_hash_vs_reference() {
        with_avx512(|avx512| crate::test::test_universal_hash_vs_reference(avx512));
    }
}

View File

@ -4,6 +4,8 @@ use core::mem;
use core::ptr;
use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod avx512;
pub mod portable;
#[cfg(test)]
@ -64,8 +66,18 @@ pub static DETECTED_IMPL: Implementation = Implementation::new(
universal_hash_init,
);
/// Picks the implementation to use on this machine: the AVX-512 one when
/// `avx512::implementation()` provides it (x86 and x86-64 builds only),
/// otherwise the portable one.
fn detect() -> Implementation {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let accelerated = avx512::implementation();
    // The avx512 module is not compiled on other architectures.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    let accelerated: Option<Implementation> = None;

    match accelerated {
        Some(avx512_impl) => avx512_impl,
        None => portable::implementation(),
    }
}
fn init_detected_impl() {
let detected = portable::implementation();
let detected = detect();
DETECTED_IMPL
.degree_ptr