mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-04-19 16:33:59 +02:00
Do not require AVX512DQ
Whereas vinserti64x4 is part of AVX512F, vinserti32x8 requires AVX512DQ, which we do not test for. At this point there is not much risk of incompatibility, since Skylake-X chips have all the required instruction sets, but let's be precise about this.
This commit is contained in:
parent
370ba3644a
commit
7ef795d62e
|
@ -82,15 +82,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x50]
|
||||
mov r15, qword ptr [rdi+0x58]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm8, zmm16, zmm17
|
||||
vpunpckhqdq zmm9, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm10, zmm18, zmm19
|
||||
vpunpckhqdq zmm11, zmm18, zmm19
|
||||
mov r8, qword ptr [rdi+0x20]
|
||||
|
@ -102,15 +102,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x70]
|
||||
mov r15, qword ptr [rdi+0x78]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm12, zmm16, zmm17
|
||||
vpunpckhqdq zmm13, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm14, zmm18, zmm19
|
||||
vpunpckhqdq zmm15, zmm18, zmm19
|
||||
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
||||
|
@ -144,15 +144,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x50]
|
||||
mov r15, qword ptr [rdi+0x58]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm8, zmm24, zmm25
|
||||
vpunpckhqdq zmm9, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm10, zmm24, zmm25
|
||||
vpunpckhqdq zmm11, zmm24, zmm25
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
|
@ -172,15 +172,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x70]
|
||||
mov r15, qword ptr [rdi+0x78]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm12, zmm24, zmm25
|
||||
vpunpckhqdq zmm13, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm14, zmm24, zmm25
|
||||
vpunpckhqdq zmm15, zmm24, zmm25
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
|
@ -2039,7 +2039,7 @@ blake3_hash_many_avx512:
|
|||
vpermq ymm14, ymm14, 0xDC
|
||||
vpermq ymm15, ymm15, 0xDC
|
||||
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
vinserti32x8 zmm13, zmm14, ymm15, 0x01
|
||||
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
||||
mov eax, 17476
|
||||
kmovw k2, eax
|
||||
vpblendmd zmm13 {k2}, zmm13, zmm12
|
||||
|
|
|
@ -96,15 +96,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x50]
|
||||
mov r15, qword ptr [rdi+0x58]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm8, zmm16, zmm17
|
||||
vpunpckhqdq zmm9, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm10, zmm18, zmm19
|
||||
vpunpckhqdq zmm11, zmm18, zmm19
|
||||
mov r8, qword ptr [rdi+0x20]
|
||||
|
@ -116,15 +116,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x70]
|
||||
mov r15, qword ptr [rdi+0x78]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm12, zmm16, zmm17
|
||||
vpunpckhqdq zmm13, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
||||
vpunpcklqdq zmm14, zmm18, zmm19
|
||||
vpunpckhqdq zmm15, zmm18, zmm19
|
||||
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
||||
|
@ -158,15 +158,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x50]
|
||||
mov r15, qword ptr [rdi+0x58]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm8, zmm24, zmm25
|
||||
vpunpckhqdq zmm9, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm10, zmm24, zmm25
|
||||
vpunpckhqdq zmm11, zmm24, zmm25
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
|
@ -186,15 +186,15 @@ blake3_hash_many_avx512:
|
|||
mov r14, qword ptr [rdi+0x70]
|
||||
mov r15, qword ptr [rdi+0x78]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm12, zmm24, zmm25
|
||||
vpunpckhqdq zmm13, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
||||
vpunpcklqdq zmm14, zmm24, zmm25
|
||||
vpunpckhqdq zmm15, zmm24, zmm25
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
|
@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
|
|||
vpermq ymm14, ymm14, 0xDC
|
||||
vpermq ymm15, ymm15, 0xDC
|
||||
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
vinserti32x8 zmm13, zmm14, ymm15, 0x01
|
||||
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
||||
mov eax, 17476
|
||||
kmovw k2, eax
|
||||
vpblendmd zmm13 {k2}, zmm13, zmm12
|
||||
|
|
|
@ -99,15 +99,15 @@ innerloop16:
|
|||
mov r14, qword ptr [rdi+50H]
|
||||
mov r15, qword ptr [rdi+58H]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
||||
vpunpcklqdq zmm8, zmm16, zmm17
|
||||
vpunpckhqdq zmm9, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
||||
vpunpcklqdq zmm10, zmm18, zmm19
|
||||
vpunpckhqdq zmm11, zmm18, zmm19
|
||||
mov r8, qword ptr [rdi+20H]
|
||||
|
@ -119,15 +119,15 @@ innerloop16:
|
|||
mov r14, qword ptr [rdi+70H]
|
||||
mov r15, qword ptr [rdi+78H]
|
||||
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
||||
vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
||||
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
||||
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
||||
vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
||||
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
||||
vpunpcklqdq zmm12, zmm16, zmm17
|
||||
vpunpckhqdq zmm13, zmm16, zmm17
|
||||
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
||||
vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
||||
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
||||
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
||||
vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
||||
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
||||
vpunpcklqdq zmm14, zmm18, zmm19
|
||||
vpunpckhqdq zmm15, zmm18, zmm19
|
||||
vmovdqa32 zmm27, zmmword ptr [INDEX0]
|
||||
|
@ -161,15 +161,15 @@ innerloop16:
|
|||
mov r14, qword ptr [rdi+50H]
|
||||
mov r15, qword ptr [rdi+58H]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
||||
vpunpcklqdq zmm8, zmm24, zmm25
|
||||
vpunpckhqdq zmm9, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
||||
vpunpcklqdq zmm10, zmm24, zmm25
|
||||
vpunpckhqdq zmm11, zmm24, zmm25
|
||||
prefetcht0 byte ptr [r8+rdx+80H]
|
||||
|
@ -189,15 +189,15 @@ innerloop16:
|
|||
mov r14, qword ptr [rdi+70H]
|
||||
mov r15, qword ptr [rdi+78H]
|
||||
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
||||
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
||||
vpunpcklqdq zmm12, zmm24, zmm25
|
||||
vpunpckhqdq zmm13, zmm24, zmm25
|
||||
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
||||
vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
||||
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
||||
vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
||||
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
||||
vpunpcklqdq zmm14, zmm24, zmm25
|
||||
vpunpckhqdq zmm15, zmm24, zmm25
|
||||
prefetcht0 byte ptr [r8+rdx+80H]
|
||||
|
@ -2073,7 +2073,7 @@ final7blocks:
|
|||
vpermq ymm14, ymm14, 0DCH
|
||||
vpermq ymm15, ymm15, 0DCH
|
||||
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
|
||||
vinserti32x8 zmm13, zmm14, ymm15, 01H
|
||||
vinserti64x4 zmm13, zmm14, ymm15, 01H
|
||||
mov eax, 17476
|
||||
kmovw k2, eax
|
||||
vpblendmd zmm13 {k2}, zmm13, zmm12
|
||||
|
|
|
@ -182,7 +182,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|||
#if defined(IS_X86)
|
||||
const enum cpu_feature features = get_cpu_features();
|
||||
#if !defined(BLAKE3_NO_AVX512)
|
||||
if (features & AVX512F) {
|
||||
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
||||
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
||||
increment_counter, flags, flags_start, flags_end,
|
||||
out);
|
||||
|
@ -223,7 +223,7 @@ size_t blake3_simd_degree(void) {
|
|||
#if defined(IS_X86)
|
||||
const enum cpu_feature features = get_cpu_features();
|
||||
#if !defined(BLAKE3_NO_AVX512)
|
||||
if (features & AVX512F) {
|
||||
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
||||
return 16;
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue