From c33a8462d1e1770f91a1aa4c4854ae000ed865ae Mon Sep 17 00:00:00 2001 From: Matthew Krupcale Date: Tue, 25 Aug 2020 12:26:15 -0400 Subject: [PATCH] Write _mm_blend_epi16 emulation without multiplication Use _mm_and_si128 and _mm_cmpeq_epi16 rather than the expensive _mm_mullo_epi16 multiplication with _mm_srai_epi16, which the compiler may not be able to optimize. --- c/blake3_sse2.c | 9 +++++---- src/rust_sse2.rs | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/c/blake3_sse2.c b/c/blake3_sse2.c index de33d62..1592966 100644 --- a/c/blake3_sse2.c +++ b/c/blake3_sse2.c @@ -79,10 +79,11 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { } INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { - __m128i mask = _mm_set1_epi16(imm8); - mask = _mm_mullo_epi16(mask, set4(0x40008000, 0x10002000, 0x04000800, 0x01000200)); - mask = _mm_srai_epi16(mask, 15); - return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], diff --git a/src/rust_sse2.rs b/src/rust_sse2.rs index 3084ed1..366ab21 100644 --- a/src/rust_sse2.rs +++ b/src/rust_sse2.rs @@ -139,9 +139,10 @@ unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m12 #[inline(always)] unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); let mut mask = _mm_set1_epi16(imm8 as i16); - mask = _mm_mullo_epi16(mask, set4(0x40008000, 0x10002000, 0x04000800, 0x01000200)); - mask = _mm_srai_epi16(mask, 15); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); _mm_or_si128(_mm_and_si128(mask, b), 
_mm_andnot_si128(mask, a)) }