1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-04-19 16:33:59 +02:00

Do not require AVX512DQ

Whereas vinserti64x4 is present on AVX512F, vinserti32x8 requires
AVX512DQ, which we do not test for. At this point there is not
much risk of incompatibility, since Skylake-X chips have all the
requires instruction sets, but let's be precise about this.
This commit is contained in:
Samuel Neves 2020-04-12 11:36:10 +01:00
parent 370ba3644a
commit 7ef795d62e
4 changed files with 53 additions and 53 deletions

View File

@ -82,15 +82,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+0x20]
@ -102,15 +102,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@ -144,15 +144,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@ -172,15 +172,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@ -2039,7 +2039,7 @@ blake3_hash_many_avx512:
vpermq ymm14, ymm14, 0xDC
vpermq ymm15, ymm15, 0xDC
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
vinserti32x8 zmm13, zmm14, ymm15, 0x01
vinserti64x4 zmm13, zmm14, ymm15, 0x01
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12

View File

@ -96,15 +96,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+0x20]
@ -116,15 +116,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@ -158,15 +158,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@ -186,15 +186,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
vpermq ymm14, ymm14, 0xDC
vpermq ymm15, ymm15, 0xDC
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
vinserti32x8 zmm13, zmm14, ymm15, 0x01
vinserti64x4 zmm13, zmm14, ymm15, 0x01
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12

View File

@ -99,15 +99,15 @@ innerloop16:
mov r14, qword ptr [rdi+50H]
mov r15, qword ptr [rdi+58H]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+20H]
@ -119,15 +119,15 @@ innerloop16:
mov r14, qword ptr [rdi+70H]
mov r15, qword ptr [rdi+78H]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0]
@ -161,15 +161,15 @@ innerloop16:
mov r14, qword ptr [rdi+50H]
mov r15, qword ptr [rdi+58H]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 byte ptr [r8+rdx+80H]
@ -189,15 +189,15 @@ innerloop16:
mov r14, qword ptr [rdi+70H]
mov r15, qword ptr [rdi+78H]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 byte ptr [r8+rdx+80H]
@ -2073,7 +2073,7 @@ final7blocks:
vpermq ymm14, ymm14, 0DCH
vpermq ymm15, ymm15, 0DCH
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
vinserti32x8 zmm13, zmm14, ymm15, 01H
vinserti64x4 zmm13, zmm14, ymm15, 01H
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12

View File

@ -182,7 +182,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512F) {
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
@ -223,7 +223,7 @@ size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512F) {
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif