use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE: usize = 4; } else { pub const MAX_SIMD_DEGREE: usize = 1; } } // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently // allowed to use cmp::max, so we have to hardcode this additional constant // value. Get rid of this once cmp::max is a const fn. cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 2; } } #[derive(Clone, Copy, Debug)] pub enum Platform { Portable, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE2, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(blake3_neon)] NEON, } impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] { if avx512_detected() { return Platform::AVX512; } } if avx2_detected() { return Platform::AVX2; } if sse41_detected() { return Platform::SSE41; } if sse2_detected() { return Platform::SSE2; } } // We don't use dynamic feature detection for NEON. If the "neon" // feature is on, NEON is assumed to be supported. #[cfg(blake3_neon)] { return Platform::NEON; } Platform::Portable } pub fn simd_degree(&self) -> usize { let degree = match self { Platform::Portable => 1, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(blake3_neon)] Platform::NEON => 4, }; debug_assert!(degree <= MAX_SIMD_DEGREE); degree } pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { match self { Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), } } pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { match self { Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), } } // IMPLEMENTATION NOTE // =================== // hash_many() applies two optimizations. The critically important // optimization is the high-performance parallel SIMD hashing mode, // described in detail in the spec. This more than doubles throughput per // thread. Another optimization is keeping the state vectors transposed // from block to block within a chunk. When state vectors are transposed // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. pub fn hash_many( &self, inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { match self { Platform::Portable => portable::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { crate::avx2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "neon" feature is on. #[cfg(blake3_neon)] Platform::NEON => unsafe { crate::neon::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, } } // Explicit platform constructors, for benchmarks. pub fn portable() -> Self { Self::Portable } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse2() -> Option { if sse2_detected() { Some(Self::SSE2) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse41() -> Option { if sse41_detected() { Some(Self::SSE41) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx2() -> Option { if avx2_detected() { Some(Self::AVX2) } else { None } } #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx512() -> Option { if avx512_detected() { Some(Self::AVX512) } else { None } } #[cfg(blake3_neon)] pub fn neon() -> Option { // Assumed to be safe if the "neon" feature is on. Some(Self::NEON) } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx512_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx2_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "avx2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx2") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn sse41_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse4.1")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse4.1") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_sse2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse2") { return true; } } false } #[inline(always)] pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { let mut out = [0; 8]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out } #[inline(always)] pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { let mut out = [0; 16]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); out } #[inline(always)] pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { let mut out = [0; 32]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); out } #[inline(always)] pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { let mut out = [0; 64]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); out }