mirror of https://github.com/BLAKE2/BLAKE2 synced 2024-11-07 14:49:17 +01:00

refactor: Add SHA3 macros

This commit is contained in:
HowJmay 2021-06-02 00:19:54 +08:00
parent 09ad076f61
commit ebc3796576
3 changed files with 71 additions and 10 deletions

@@ -27,27 +27,88 @@
#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return three-way exclusive OR of the values
/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided as GCC inline assembly because
/// some compilers, notably Clang, lack support for the intrinsic.
/// \details VEOR3 requires the ARMv8 SHA3 extension.
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(_MSC_VER)
# error "Not implemented"
#else
uint64x2_t r;
__asm__ ("eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
:"=w" (r) : "w" (a), "w" (b), "w" (c));
return r;
#endif
}
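For comparison, the same three-way XOR can be built from baseline NEON when neither the intrinsic nor the EOR3 instruction is available; the fallback below is only an illustrative sketch and is not part of this commit:
/* Illustrative sketch, not part of this commit: two veorq_u64 calls give the
 * same result as EOR3 on targets without the SHA3 extension.
 * Assumes <arm_neon.h> is included, as in the surrounding file. */
static inline uint64x2_t veor3_fallback(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
    return veorq_u64(a, veorq_u64(b, c));
}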
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return two-way exclusive OR of the values, then rotated right by imm6
/// \details VXARQ() performs vxarq_u64(). VXARQ is provided as GCC inline assembly because
/// some compilers, notably Clang, lack support for the intrinsic.
/// \details VXARQ requires the ARMv8 SHA3 extension.
inline uint64x2_t VXARQ(uint64x2_t a, uint64x2_t b, const int imm6)
{
#if defined(_MSC_VER)
# error "Not implemented"
#else
uint64x2_t r;
__asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (imm6));
return r;
#endif
}
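Per 64-bit lane, XAR is an XOR followed by a rotate right by the immediate; the scalar reference below is an illustrative sketch (the helper name xar_ref is hypothetical) and is not part of this commit:
/* Illustrative sketch, not part of this commit: scalar semantics of XAR on
 * one 64-bit lane, i.e. rotate-right of (a ^ b) by imm6. */
#include <stdint.h>
static inline uint64_t xar_ref(uint64_t a, uint64_t b, unsigned imm6)
{
    uint64_t x = a ^ b;
    imm6 &= 63u;
    return imm6 ? (x >> imm6) | (x << (64u - imm6)) : x;
}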
/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated right by C
/// \details VXARQ() performs vxarq_u64(). VXARQ is provided as GCC inline assembly because
/// some compilers, notably Clang, lack support for the intrinsic.
/// \details VXARQ requires the ARMv8 SHA3 extension.
template <unsigned int C>
inline uint64x2_t VXARQ(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
# error "Not implemented"
#else
uint64x2_t r;
__asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (C));
return r;
#endif
}
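A brief usage note for the templated overload, illustrative only and not part of this commit:
/* Illustrative usage, not part of this commit: the rotate amount is a template
 * argument, so it stays a compile-time constant as required by the "I"
 * immediate constraint, e.g. VXARQ<32>(row4l, row1l) is equivalent to
 * VXARQ(row4l, row1l, 32). */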
#if defined(__ARM_FEATURE_SHA3)
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
- row4l = vxarq_u64(row4l, row1l, 32); row4h = vxarq_u64(row4h, row1h, 32); \
+ row4l = VXARQ(row4l, row1l, 32); row4h = VXARQ(row4h, row1h, 32); \
row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
- row2l = vxarq_u64(row2l, row3l, 24); row2h = vxarq_u64(row2h, row3h, 24);
+ row2l = VXARQ(row2l, row3l, 24); row2h = VXARQ(row2h, row3h, 24);
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
- row4l = vxarq_u64(row4l, row1l, 16); row4h = vxarq_u64(row4h, row1h, 16); \
+ row4l = VXARQ(row4l, row1l, 16); row4h = VXARQ(row4h, row1h, 16); \
row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
- row2l = vxarq_u64(row2l, row3l, 63); row2h = vxarq_u64(row2h, row3h, 63);
+ row2l = VXARQ(row2l, row3l, 63); row2h = VXARQ(row2h, row3h, 63);
#else
/* No SHA3 support */
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
- row4l = vxarq_u64(row4l, row1l, 32); row4h = vxarq_u64(row4h, row1h, 32); \
+ row4l = VXARQ(row4l, row1l, 32); row4h = VXARQ(row4h, row1h, 32); \
row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h);
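The rotation amounts 32, 24, 16 and 63 used by G1 and G2 are the BLAKE2b G-function rotations; the scalar reference below (with a hypothetical rotr64 helper) is an illustrative sketch, not part of this commit, where row1..row4 play the roles of a, b, c, d and b0, b1 supply the message words x, y:
/* Illustrative sketch, not part of this commit: scalar BLAKE2b G function.
 * G1 above corresponds to the first two lines, G2 to the last two. */
#include <stdint.h>
static inline uint64_t rotr64(uint64_t w, unsigned n)
{
    return (w >> n) | (w << (64u - n));
}
static void g_ref(uint64_t v[16], int a, int b, int c, int d, uint64_t x, uint64_t y)
{
    v[a] = v[a] + v[b] + x;  v[d] = rotr64(v[d] ^ v[a], 32);
    v[c] = v[c] + v[d];      v[b] = rotr64(v[b] ^ v[c], 24);
    v[a] = v[a] + v[b] + y;  v[d] = rotr64(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];      v[b] = rotr64(v[b] ^ v[c], 63);
}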

@@ -175,10 +175,10 @@ static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOC
ROUND( 11 );
#if defined(__ARM_FEATURE_SHA3)
- vst1q_u64(&S->h[0], veor3q_u64(h0, row1l, row3l));
- vst1q_u64(&S->h[2], veor3q_u64(h1, row1h, row3h));
- vst1q_u64(&S->h[4], veor3q_u64(h2, row2l, row4l));
- vst1q_u64(&S->h[6], veor3q_u64(h3, row2h, row4h));
+ vst1q_u64(&S->h[0], VEOR3(h0, row1l, row3l));
+ vst1q_u64(&S->h[2], VEOR3(h1, row1h, row3h));
+ vst1q_u64(&S->h[4], VEOR3(h2, row2l, row4l));
+ vst1q_u64(&S->h[6], VEOR3(h3, row2h, row4h));
#else
vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));

@@ -1,5 +1,5 @@
CC=gcc
- CFLAGS=-march=armv8-a -O3 -I../testvectors -Wall -Wextra -std=c89 -pedantic -Wno-long-long
+ CFLAGS=-march=armv8.2-a+sha3 -O3 -I../testvectors -Wall -Wextra -std=c89 -pedantic -Wno-long-long
BLAKEBINS=blake2s blake2b blake2sp blake2bp blake2xs blake2xb
all: $(BLAKEBINS) check
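The new -march=armv8.2-a+sha3 flag makes the compiler define the ACLE feature macro __ARM_FEATURE_SHA3, which is what selects the EOR3/XAR code paths above; a compile-time sanity check along these lines (illustrative sketch, not part of this commit) fails early if the extension was not enabled:
/* Illustrative sketch, not part of this commit: fail the build if the SHA3
 * feature macro is missing, e.g. when -march lacks +sha3. */
#if !defined(__ARM_FEATURE_SHA3)
#  error "ARM SHA3 extension not enabled; build with -march=armv8.2-a+sha3"
#endif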