1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-10 08:36:18 +02:00

streamline load_counters

avx2 before:

        mov     eax, esi
        neg     rax
        vmovq   xmm0, rax
        vpbroadcastq    ymm0, xmm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
        vmovq   xmm2, rdi
        vpbroadcastq    ymm1, xmm2
        vpaddq  ymm1, ymm0, ymm1
        vmovdqa ymm0, ymmword ptr [rip + .LCPI1_1] # ymm0 = [0,2,4,6,4,6,6,7]
        vpermd  ymm3, ymm0, ymm1
        mov     r8d, eax
        and     r8d, 5
        add     r8, rdi
        mov     esi, eax
        and     esi, 6
        add     rsi, rdi
        and     eax, 7
        vpshufd xmm4, xmm3, 231         # xmm4 = xmm3[3,1,2,3]
        vpinsrd xmm4, xmm4, r8d, 1
        add     rax, rdi
        vpinsrd xmm4, xmm4, esi, 2
        vpinsrd xmm4, xmm4, eax, 3
        vpshufd xmm3, xmm3, 144         # xmm3 = xmm3[0,0,1,2]
        vpinsrd xmm3, xmm3, edi, 0
        vmovdqa xmmword ptr [rdx], xmm3
        vmovdqa xmmword ptr [rdx + 16], xmm4
        vpermq  ymm3, ymm1, 144         # ymm3 = ymm1[0,0,1,2]
        vpblendd        ymm2, ymm3, ymm2, 3 # ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
        vpsrlq  ymm2, ymm2, 32
        vpermd  ymm2, ymm0, ymm2
        vextracti128    xmm1, ymm1, 1
        vmovq   xmm3, rax
        vmovq   xmm4, rsi
        vpunpcklqdq     xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0]
        vmovq   xmm4, r8
        vpalignr        xmm1, xmm4, xmm1, 8 # xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
        vinserti128     ymm1, ymm1, xmm3, 1
        vpsrlq  ymm1, ymm1, 32
        vpermd  ymm0, ymm0, ymm1

avx2 after:

        neg     esi
        vmovd   xmm0, esi
        vpbroadcastd    ymm0, xmm0
        vmovd   xmm1, edi
        vpbroadcastd    ymm1, xmm1
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpaddd  ymm1, ymm1, ymm0
        vpbroadcastd    ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
        vpor    ymm0, ymm0, ymm2
        vpxor   ymm2, ymm1, ymm2
        vpcmpgtd        ymm0, ymm0, ymm2
        shr     rdi, 32
        vmovd   xmm2, edi
        vpbroadcastd    ymm2, xmm2
        vpsubd  ymm0, ymm2, ymm0
This commit is contained in:
Samuel Neves 2020-01-23 11:58:36 +00:00
parent de1458c565
commit a830ab2661
3 changed files with 30 additions and 42 deletions

View File

@@ -19,12 +19,6 @@ INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
-INLINE __m256i set8(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
-uint32_t f, uint32_t g, uint32_t h) {
-return _mm256_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d,
-(int32_t)e, (int32_t)f, (int32_t)g, (int32_t)h);
-}
INLINE __m256i rot16(__m256i x) {
return _mm256_shuffle_epi8(
x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
@@ -32,7 +26,7 @@ INLINE __m256i rot16(__m256i x) {
}
INLINE __m256i rot12(__m256i x) {
-return xorv(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
+return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
}
INLINE __m256i rot8(__m256i x) {
@@ -42,7 +36,7 @@ INLINE __m256i rot8(__m256i x) {
}
INLINE __m256i rot7(__m256i x) {
-return xorv(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
}
INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
@@ -221,18 +215,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
-__m256i *out_low, __m256i *out_high) {
-uint64_t mask = (increment_counter ? ~0 : 0);
-*out_low = set8(
-counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
-counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)),
-counter_low(counter + (mask & 4)), counter_low(counter + (mask & 5)),
-counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)));
-*out_high = set8(
-counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
-counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)),
-counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)),
-counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)));
+__m256i *out_lo, __m256i *out_hi) {
+const __m256i mask = _mm256_set1_epi32(-(uint32_t)increment_counter);
+const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+const __m256i add1 = _mm256_and_si256(mask, add0);
+__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
+__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
+_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
+__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
+*out_lo = l;
+*out_hi = h;
}
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,

View File

@@ -1044,20 +1044,14 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
INLINE void load_counters16(uint64_t counter, bool increment_counter,
__m512i *out_lo, __m512i *out_hi) {
-uint64_t mask = (increment_counter ? ~0 : 0);
-__m512i mask_vec = _mm512_set1_epi64(mask);
-__m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
-deltas_a = _mm512_and_si512(mask_vec, deltas_a);
-__m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
-deltas_b = _mm512_and_si512(mask_vec, deltas_b);
-__m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a);
-__m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b);
-__m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
-22, 24, 26, 28, 30);
-__m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
-23, 25, 27, 29, 31);
-*out_lo = _mm512_permutex2var_epi32(a, lo_indexes, b);
-*out_hi = _mm512_permutex2var_epi32(a, hi_indexes, b);
+const __m512i mask = _mm512_set1_epi32(-(uint32_t)increment_counter);
+const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+const __m512i add1 = _mm512_and_si512(mask, add0);
+__m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1);
+__mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
+__m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1));
+*out_lo = l;
+*out_hi = h;
}
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,

View File

@@ -438,14 +438,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
-__m128i *out_low, __m128i *out_high) {
-uint64_t mask = (increment_counter ? ~0 : 0);
-*out_low = set4(
-counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
-counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
-*out_high = set4(
-counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
-counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+__m128i *out_lo, __m128i *out_hi) {
+const __m128i mask = _mm_set1_epi32(-(uint32_t)increment_counter);
+const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+const __m128i add1 = _mm_and_si128(mask, add0);
+__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
+__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
+_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
+__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
+*out_lo = l;
+*out_hi = h;
}
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,