mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-05-22 07:56:04 +02:00
C: asm: simplify pinsrd emulation
Use punpckldq and punpcklqdq (punpckl{,q}dq) instead of pinsrw.
This commit is contained in:
parent
c592e9a3f6
commit
47e415c7f1
|
@ -1656,24 +1656,12 @@ blake3_hash_many_sse2:
|
|||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
mov eax, dword ptr [rsp+0x120]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+0x120], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+0x120]
|
||||
punpckldq xmm13, xmm14
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+0x114]
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [rsp+0x124], 2
|
||||
pinsrw xmm14, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm14, eax, 5
|
||||
movd xmm13, dword ptr [rsp+0x124]
|
||||
punpckldq xmm14, xmm13
|
||||
movaps xmmword ptr [rsp+0x10], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
|
@ -1714,14 +1702,14 @@ blake3_hash_many_sse2:
|
|||
pshufd xmm14, xmm14, 0x93
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 0x93
|
||||
shl rax, 0x20
|
||||
or rax, 0x40
|
||||
movd xmm3, rax
|
||||
movdqa xmmword ptr [rsp+0x20], xmm3
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+0x10]
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
pinsrw xmm11, eax, 6
|
||||
pinsrw xmm11, r14d, 7
|
||||
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
|
||||
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
|
@ -1930,14 +1918,8 @@ blake3_hash_many_sse2:
|
|||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
mov eax, dword ptr [rsp+0x120]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+0x120], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+0x120]
|
||||
punpckldq xmm13, xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+0x40]
|
||||
or eax, r13d
|
||||
|
@ -1949,11 +1931,11 @@ blake3_hash_many_sse2:
|
|||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm3, xmm13
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
shl rax, 32
|
||||
or rax, 64
|
||||
movd xmm12, rax
|
||||
movdqa xmm3, xmm13
|
||||
punpcklqdq xmm3, xmm12
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm8, xmm4
|
||||
|
|
|
@ -1667,24 +1667,12 @@ blake3_hash_many_sse2:
|
|||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
mov eax, dword ptr [rsp+0x120]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+0x120], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+0x120]
|
||||
punpckldq xmm13, xmm14
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+0x114]
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [rsp+0x124], 2
|
||||
pinsrw xmm14, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm14, eax, 5
|
||||
movd xmm13, dword ptr [rsp+0x124]
|
||||
punpckldq xmm14, xmm13
|
||||
movaps xmmword ptr [rsp+0x10], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
|
@ -1725,14 +1713,14 @@ blake3_hash_many_sse2:
|
|||
pshufd xmm14, xmm14, 0x93
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 0x93
|
||||
shl rax, 0x20
|
||||
or rax, 0x40
|
||||
movd xmm3, rax
|
||||
movdqa xmmword ptr [rsp+0x20], xmm3
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+0x10]
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
pinsrw xmm11, eax, 6
|
||||
pinsrw xmm11, r14d, 7
|
||||
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
|
||||
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
|
@ -1941,14 +1929,8 @@ blake3_hash_many_sse2:
|
|||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
mov eax, dword ptr [rsp+0x120]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+0x120], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+0x120]
|
||||
punpckldq xmm13, xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+0x80]
|
||||
or eax, r13d
|
||||
|
@ -1960,11 +1942,11 @@ blake3_hash_many_sse2:
|
|||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm3, xmm13
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
shl rax, 32
|
||||
or rax, 64
|
||||
movd xmm12, rax
|
||||
movdqa xmm3, xmm13
|
||||
punpcklqdq xmm3, xmm12
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm8, xmm4
|
||||
|
|
|
@ -1668,24 +1668,12 @@ final3blocks:
|
|||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+110H]
|
||||
mov eax, dword ptr [rsp+120H]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+120H], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+120H]
|
||||
punpckldq xmm13, xmm14
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+114H]
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [rsp+124H], 2
|
||||
pinsrw xmm14, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
|
||||
sar eax, 16
|
||||
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN], 4
|
||||
pinsrw xmm14, eax, 5
|
||||
movd xmm13, dword ptr [rsp+124H]
|
||||
punpckldq xmm14, xmm13
|
||||
movaps xmmword ptr [rsp+10H], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+8H]
|
||||
|
@ -1726,14 +1714,14 @@ innerloop2:
|
|||
pshufd xmm14, xmm14, 93H
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 93H
|
||||
shl rax, 20H
|
||||
or rax, 40H
|
||||
movd xmm3, rax
|
||||
movdqa xmmword ptr [rsp+20H], xmm3
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+10H]
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
pinsrw xmm11, eax, 6
|
||||
pinsrw xmm11, r14d, 7
|
||||
punpcklqdq xmm3, xmmword ptr [rsp+20H]
|
||||
punpcklqdq xmm11, xmmword ptr [rsp+20H]
|
||||
mov al, 7
|
||||
roundloop2:
|
||||
paddd xmm0, xmm4
|
||||
|
@ -1942,14 +1930,8 @@ final1block:
|
|||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+10H]
|
||||
movd xmm13, dword ptr [rsp+110H]
|
||||
mov eax, dword ptr [rsp+120H]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [rsp+120H], 2
|
||||
pinsrw xmm13, eax, 3
|
||||
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
|
||||
sar eax, 16
|
||||
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
|
||||
pinsrw xmm13, eax, 5
|
||||
movd xmm14, dword ptr [rsp+120H]
|
||||
punpckldq xmm13, xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+80H]
|
||||
or eax, r13d
|
||||
|
@ -1961,11 +1943,11 @@ innerloop1:
|
|||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV]
|
||||
movaps xmm3, xmm13
|
||||
mov r14d, eax
|
||||
sar r14d, 16
|
||||
pinsrw xmm3, eax, 6
|
||||
pinsrw xmm3, r14d, 7
|
||||
shl rax, 32
|
||||
or rax, 64
|
||||
movd xmm12, rax
|
||||
movdqa xmm3, xmm13
|
||||
punpcklqdq xmm3, xmm12
|
||||
movups xmm4, xmmword ptr [r8+rdx-40H]
|
||||
movups xmm5, xmmword ptr [r8+rdx-30H]
|
||||
movaps xmm8, xmm4
|
||||
|
|
Loading…
Reference in New Issue