mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-05-26 15:56:02 +02:00
slightly optimize neon loadu_128/storeu_128
vld1q_u8 and vst1q_u8 have no alignment requirements. This improves performance on Oracle Cloud's VM.Standard.A1.Flex by 1.15% on a 16*1024 input, from 13920 nanoseconds down to 13800 nanoseconds (approx).
This commit is contained in:
parent
8fc36186b8
commit
4df11ecd4f
|
@ -9,15 +9,11 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
||||||
// vld1q_u32 has alignment requirements. Don't use it.
|
return vreinterpretq_u32_u8(vld1q_u8(src));
|
||||||
uint32x4_t x;
|
|
||||||
memcpy(&x, src, 16);
|
|
||||||
return x;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
|
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
|
||||||
// vst1q_u32 has alignment requirements. Don't use it.
|
vst1q_u8(dest, vreinterpretq_u8_u32(src));
|
||||||
memcpy(dest, &src, 16);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
|
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
|
||||||
|
|
Loading…
Reference in New Issue