diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index af6c3da..95f9fb0 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -96,6 +96,7 @@ enum cpu_feature { AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, + ARM_NEON = 1 << 7, /* ... */ UNDEFINED = 1 << 30 }; @@ -155,8 +156,19 @@ static } ATOMIC_STORE(g_cpu_features, features); return features; +#elif defined(__aarch64__) + uint64_t id_aa64pfr0_el1; + __asm__ ("mrs %0, ID_AA64PFR0_EL1" : "=r" (id_aa64pfr0_el1)); + if(((id_aa64pfr0_el1 >> 20) & (1<<0 | 1<<1 | 1<<2 | 1 << 3)) != 15) { + // https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/ID-AA64PFR0-EL1--AArch64-Processor-Feature-Register-0?lang=en + // AdvSIMD field, bits [23:20] of ID_AA64PFR0_EL1: 0b1111 (15) = Advanced SIMD (NEON) not implemented; 0b0000 = implemented without half-precision (FP16) arithmetic; 0b0001 = implemented with FP16 arithmetic. NOTE(review): reading this ID register with MRS from user space presumably relies on the OS trapping and emulating the access (Linux advertises this via HWCAP_CPUID) - confirm on every supported aarch64 target. + features = ARM_NEON; + } else { + features = 0; + } + ATOMIC_STORE(g_cpu_features, features); + return features; #else - /* How to detect NEON? */ return 0; #endif } @@ -260,11 +272,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, } #endif #endif - #if BLAKE3_USE_NEON == 1 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; +#elif __aarch64__ +{ + const enum cpu_feature features = get_cpu_features(); + if(features & ARM_NEON) { + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; + } +} #endif blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, @@ -300,6 +320,11 @@ size_t blake3_simd_degree(void) { #endif #if BLAKE3_USE_NEON == 1 return 4; +#elif defined(__aarch64__) + const enum cpu_feature features = get_cpu_features(); + if(features & ARM_NEON) { + return 4; + } #endif return 1; } diff --git a/c/main.c b/c/main.c index 77cab58..c9b0e22 100644 --- a/c/main.c +++ b/c/main.c @@ -63,6 +63,7 @@ enum cpu_feature { AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, + 
ARM_NEON = 1 << 7, /* ... */ UNDEFINED = 1 << 30 };