1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-26 11:46:04 +02:00
BLAKE3/c/blake3_dispatch.c
Matthew Krupcale d91f20dd29 Start SSE2 implementation based on SSE4.1 version
Wire up basic functions and features for SSE2 support using the SSE4.1 version
as a basis without implementing the SSE2 instructions yet.

 * Cargo.toml: add no_sse2 feature
 * benches/bench.rs: wire SSE2 benchmarks
 * build.rs: add SSE2 rust intrinsics and assembly builds
 * c/Makefile.testing: add SSE2 C and assembly targets
 * c/README.md: add SSE2 to C build instructions
 * c/blake3_c_rust_bindings/build.rs: add SSE2 C rust binding builds
 * c/blake3_c_rust_bindings/src/lib.rs: add SSE2 C rust bindings
 * c/blake3_dispatch.c: add SSE2 C dispatch
 * c/blake3_impl.h: add SSE2 C function prototypes
 * c/blake3_sse2.c: add SSE2 C intrinsic file starting with SSE4.1 version
 * c/blake3_sse2_x86-64_{unix.S,windows_gnu.S,windows_msvc.asm}: add SSE2
   assembly files starting with SSE4.1 version
 * src/ffi_sse2.rs: add rust implementation using SSE2 C rust bindings
 * src/lib.rs: add SSE2 rust intrinsics and SSE2 C rust binding rust SSE2 module
   configurations
 * src/platform.rs: add SSE2 rust platform detection and dispatch
 * src/rust_sse2.rs: add SSE2 rust intrinsic file starting with SSE4.1 version
 * tools/instruction_set_support/src/main.rs: add SSE2 feature detection
2020-08-24 00:54:46 -04:00

271 lines
7.2 KiB
C

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#error "Unimplemented!"
#endif
#endif
#if defined(IS_X86)
static uint64_t xgetbv() {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
uint32_t eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64_t)edx << 32) | eax;
#endif
}
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
__cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
__cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features() {
if (g_cpu_features != UNDEFINED) {
return g_cpu_features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
enum cpu_feature features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
features |= SSE2;
#else
if (*edx & (1UL << 26))
features |= SSE2;
#endif
if (*ecx & (1UL << 0))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
if (*ecx & (1UL << 27)) { // OSXSAVE
const uint64_t mask = xgetbv();
if ((mask & 6) == 6) { // SSE and AVX states
if (*ecx & (1UL << 28))
features |= AVX;
if (max_id >= 7) {
cpuidex(regs, 7, 0);
if (*ebx & (1UL << 5))
features |= AVX2;
if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
if (*ebx & (1UL << 31))
features |= AVX512VL;
if (*ebx & (1UL << 16))
features |= AVX512F;
}
}
}
}
g_cpu_features = features;
return features;
#else
/* How to detect NEON? */
return 0;
#endif
}
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if defined(BLAKE3_USE_NEON)
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}
// The dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
return 8;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
return 4;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
return 4;
}
#endif
#endif
#if defined(BLAKE3_USE_NEON)
return 4;
#endif
return 1;
}