From ba665ca9f115ad978e042f6b6ed3ec54a93fedae Mon Sep 17 00:00:00 2001 From: hanshenrik Date: Mon, 5 Feb 2024 15:15:53 +0100 Subject: [PATCH 1/5] runtime neon detection tested on Oracle Cloud's cheapest ARM VPS VM.Standard.A1.Flex --- c/blake3_dispatch.c | 22 ++++++++++++++++++++-- c/main.c | 1 + 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index af6c3da..fe72ee2 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -96,6 +96,7 @@ enum cpu_feature { AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, + ARM_NEON = 1 << 7, /* ... */ UNDEFINED = 1 << 30 }; @@ -155,8 +156,17 @@ static } ATOMIC_STORE(g_cpu_features, features); return features; +#elif defined(__aarch64__) + uint64_t id_aa64pfr0_el1; + __asm__ ("mrs %0, ID_AA64PFR0_EL1" : "=r" (id_aa64pfr0_el1)); + if((id_aa64pfr0_el1 >> 20) & 0xF) { + features |= ARM_NEON; + } else { + features = 0; + } + ATOMIC_STORE(g_cpu_features, features); + return features; #else - /* How to detect NEON? */ return 0; #endif } @@ -260,11 +270,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, } #endif #endif - #if BLAKE3_USE_NEON == 1 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; +#elif __aarch64__ +{ + const enum cpu_feature features = get_cpu_features(); + if(features & ARM_NEON) { + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; + } +} #endif blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, diff --git a/c/main.c b/c/main.c index 77cab58..c9b0e22 100644 --- a/c/main.c +++ b/c/main.c @@ -63,6 +63,7 @@ enum cpu_feature { AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, + ARM_NEON = 1 << 7, /* ... 
*/ UNDEFINED = 1 << 30 }; From 349f8300f9d2da29f3e622a67c45ae7c9b0dfdbc Mon Sep 17 00:00:00 2001 From: divinity76 Date: Mon, 5 Feb 2024 15:59:54 +0100 Subject: [PATCH 2/5] forgot about UNDEFINED --- c/blake3_dispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index fe72ee2..047ef08 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -160,7 +160,7 @@ static uint64_t id_aa64pfr0_el1; __asm__ ("mrs %0, ID_AA64PFR0_EL1" : "=r" (id_aa64pfr0_el1)); if((id_aa64pfr0_el1 >> 20) & 0xF) { - features |= ARM_NEON; + features = ARM_NEON; } else { features = 0; } From 21459753ca7b5593e9e517e571a127bfec159290 Mon Sep 17 00:00:00 2001 From: divinity76 Date: Mon, 5 Feb 2024 16:06:10 +0100 Subject: [PATCH 3/5] =?UTF-8?q?4=20first=20bits=20=C2=AF\=5F(=E3=83=84)=5F?= =?UTF-8?q?/=C2=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- c/blake3_dispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index 047ef08..1d8331c 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -159,7 +159,7 @@ static #elif defined(__aarch64__) uint64_t id_aa64pfr0_el1; __asm__ ("mrs %0, ID_AA64PFR0_EL1" : "=r" (id_aa64pfr0_el1)); - if((id_aa64pfr0_el1 >> 20) & 0xF) { + if((id_aa64pfr0_el1 >> 20) & (1<<0 | 1<<1 | 1<<2 | 1 << 3)) { features = ARM_NEON; } else { features = 0; From f9b332c61ca983b62da069c02a3c638ec834d452 Mon Sep 17 00:00:00 2001 From: divinity76 Date: Mon, 5 Feb 2024 16:40:09 +0100 Subject: [PATCH 4/5] != 15 https://github.com/BLAKE3-team/BLAKE3/pull/383#issuecomment-1927210402 --- c/blake3_dispatch.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index 1d8331c..438f47c 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -159,11 +159,13 @@ static #elif defined(__aarch64__) uint64_t id_aa64pfr0_el1; __asm__ ("mrs %0, 
ID_AA64PFR0_EL1" : "=r" (id_aa64pfr0_el1)); - if((id_aa64pfr0_el1 >> 20) & (1<<0 | 1<<1 | 1<<2 | 1 << 3)) { + if(((id_aa64pfr0_el1 >> 20) & (1<<0 | 1<<1 | 1<<2 | 1 << 3)) != 15) { + // https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/ID-AA64PFR0-EL1--AArch64-Processor-Feature-Register-0?lang=en + // 15 means not implemented, 0 means neon is present but float16 is missing, 1 means neon with float16 is present ? features = ARM_NEON; } else { features = 0; - } + } ATOMIC_STORE(g_cpu_features, features); return features; #else From 059ad2d9220dfa744f13cb79dc9945ff4a97fb4c Mon Sep 17 00:00:00 2001 From: divinity76 Date: Tue, 6 Feb 2024 11:52:38 +0100 Subject: [PATCH 5/5] forgot blake3_simd_degree() --- c/blake3_dispatch.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index 438f47c..95f9fb0 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -320,6 +320,11 @@ size_t blake3_simd_degree(void) { #endif #if BLAKE3_USE_NEON == 1 return 4; +#elif defined(__aarch64__) + const enum cpu_feature features = get_cpu_features(); + if(features & ARM_NEON) { + return 4; + } #endif return 1; }