1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-04-24 05:05:04 +02:00
BLAKE3/src/platform.rs
2024-03-09 16:56:59 -08:00

512 lines
16 KiB
Rust

use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
if #[cfg(blake3_avx512_ffi)] {
pub const MAX_SIMD_DEGREE: usize = 16;
} else {
pub const MAX_SIMD_DEGREE: usize = 8;
}
}
} else if #[cfg(blake3_neon)] {
pub const MAX_SIMD_DEGREE: usize = 4;
} else {
pub const MAX_SIMD_DEGREE: usize = 1;
}
}
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
// allowed to use cmp::max, so we have to hardcode this additional constant
// value. Get rid of this once cmp::max is a const fn.
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
if #[cfg(blake3_avx512_ffi)] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
} else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
}
}
} else if #[cfg(blake3_neon)] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
} else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
}
}
#[derive(Clone, Copy, Debug)]
pub enum Platform {
Portable,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE2,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2,
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX512,
#[cfg(blake3_neon)]
NEON,
}
impl Platform {
#[allow(unreachable_code)]
pub fn detect() -> Self {
#[cfg(miri)]
{
return Platform::Portable;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
#[cfg(blake3_avx512_ffi)]
{
if avx512_detected() {
return Platform::AVX512;
}
}
if avx2_detected() {
return Platform::AVX2;
}
if sse41_detected() {
return Platform::SSE41;
}
if sse2_detected() {
return Platform::SSE2;
}
}
// We don't use dynamic feature detection for NEON. If the "neon"
// feature is on, NEON is assumed to be supported.
#[cfg(blake3_neon)]
{
return Platform::NEON;
}
Platform::Portable
}
pub fn simd_degree(&self) -> usize {
let degree = match self {
Platform::Portable => 1,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8,
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => 16,
#[cfg(blake3_neon)]
Platform::NEON => 4,
};
debug_assert!(degree <= MAX_SIMD_DEGREE);
degree
}
pub fn compress_in_place(
&self,
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
match self {
Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
},
// No NEON compress_in_place() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
}
}
pub fn compress_xof(
&self,
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
match self {
Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::compress_xof(cv, block, block_len, counter, flags)
},
// No NEON compress_xof() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
}
}
// IMPLEMENTATION NOTE
// ===================
// hash_many() applies two optimizations. The critically important
// optimization is the high-performance parallel SIMD hashing mode,
// described in detail in the spec. This more than doubles throughput per
// thread. Another optimization is keeping the state vectors transposed
// from block to block within a chunk. When state vectors are transposed
// after every block, there's a small but measurable performance loss.
// Compressing chunks with a dedicated loop avoids this.
pub fn hash_many<const N: usize>(
&self,
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
match self {
Platform::Portable => portable::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => unsafe {
crate::sse41::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => unsafe {
crate::avx2::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Assumed to be safe if the "neon" feature is on.
#[cfg(blake3_neon)]
Platform::NEON => unsafe {
crate::neon::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
}
}
// Explicit platform constructors, for benchmarks.
pub fn portable() -> Self {
Self::Portable
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse2() -> Option<Self> {
if sse2_detected() {
Some(Self::SSE2)
} else {
None
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41() -> Option<Self> {
if sse41_detected() {
Some(Self::SSE41)
} else {
None
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx2() -> Option<Self> {
if avx2_detected() {
Some(Self::AVX2)
} else {
None
}
}
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx512() -> Option<Self> {
if avx512_detected() {
Some(Self::AVX512)
} else {
None
}
}
#[cfg(blake3_neon)]
pub fn neon() -> Option<Self> {
// Assumed to be safe if the "neon" feature is on.
Some(Self::NEON)
}
}
// Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx512") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx2") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "avx2")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("avx2") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse41") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "sse4.1")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("sse4.1") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse2") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "sse2")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("sse2") {
return true;
}
}
false
}
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
let mut out = [0; 8];
out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
out
}
#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
let mut out = [0; 16];
out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
out
}
#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
let mut out = [0; 32];
*array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
*array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
*array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
*array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
*array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
*array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
*array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
*array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
out
}
#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
let mut out = [0; 64];
*array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
*array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
*array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
*array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
*array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
*array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
*array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
*array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
*array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
*array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
*array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
*array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
*array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
*array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
*array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
*array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
out
}