1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-22 07:56:04 +02:00

C: asm: simplify pinsrd emulation

Use punpckldq and punpcklqdq (written punpckl{,q}dq in shorthand) instead of pairs of pinsrw to emulate pinsrd.
This commit is contained in:
Matthew Krupcale 2020-08-31 00:21:47 -04:00
parent c592e9a3f6
commit 47e415c7f1
3 changed files with 51 additions and 105 deletions

View File

@ -1656,24 +1656,12 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
mov eax, dword ptr [rsp+0x120]
sar eax, 16
pinsrw xmm13, word ptr [rsp+0x120], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm14, word ptr [rsp+0x124], 2
pinsrw xmm14, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm14, eax, 5
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@ -1714,14 +1702,14 @@ blake3_hash_many_sse2:
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
shl rax, 0x20
or rax, 0x40
movd xmm3, rax
movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
pinsrw xmm11, eax, 6
pinsrw xmm11, r14d, 7
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
mov al, 7
9:
paddd xmm0, xmm4
@ -1930,14 +1918,8 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
mov eax, dword ptr [rsp+0x120]
sar eax, 16
pinsrw xmm13, word ptr [rsp+0x120], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
@ -1949,11 +1931,11 @@ blake3_hash_many_sse2:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
shl rax, 32
or rax, 64
movd xmm12, rax
movdqa xmm3, xmm13
punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4

View File

@ -1667,24 +1667,12 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
mov eax, dword ptr [rsp+0x120]
sar eax, 16
pinsrw xmm13, word ptr [rsp+0x120], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm14, word ptr [rsp+0x124], 2
pinsrw xmm14, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm14, eax, 5
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@ -1725,14 +1713,14 @@ blake3_hash_many_sse2:
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
shl rax, 0x20
or rax, 0x40
movd xmm3, rax
movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
pinsrw xmm11, eax, 6
pinsrw xmm11, r14d, 7
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
mov al, 7
9:
paddd xmm0, xmm4
@ -1941,14 +1929,8 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
mov eax, dword ptr [rsp+0x120]
sar eax, 16
pinsrw xmm13, word ptr [rsp+0x120], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
@ -1960,11 +1942,11 @@ blake3_hash_many_sse2:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
shl rax, 32
or rax, 64
movd xmm12, rax
movdqa xmm3, xmm13
punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4

View File

@ -1668,24 +1668,12 @@ final3blocks:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+110H]
mov eax, dword ptr [rsp+120H]
sar eax, 16
pinsrw xmm13, word ptr [rsp+120H], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+120H]
punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+114H]
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
sar eax, 16
pinsrw xmm14, word ptr [rsp+124H], 2
pinsrw xmm14, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
sar eax, 16
pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN], 4
pinsrw xmm14, eax, 5
movd xmm13, dword ptr [rsp+124H]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+10H], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
@ -1726,14 +1714,14 @@ innerloop2:
pshufd xmm14, xmm14, 93H
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 93H
shl rax, 20H
or rax, 40H
movd xmm3, rax
movdqa xmmword ptr [rsp+20H], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+10H]
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
pinsrw xmm11, eax, 6
pinsrw xmm11, r14d, 7
punpcklqdq xmm3, xmmword ptr [rsp+20H]
punpcklqdq xmm11, xmmword ptr [rsp+20H]
mov al, 7
roundloop2:
paddd xmm0, xmm4
@ -1942,14 +1930,8 @@ final1block:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movd xmm13, dword ptr [rsp+110H]
mov eax, dword ptr [rsp+120H]
sar eax, 16
pinsrw xmm13, word ptr [rsp+120H], 2
pinsrw xmm13, eax, 3
mov eax, dword ptr [BLAKE3_BLOCK_LEN]
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
pinsrw xmm13, eax, 5
movd xmm14, dword ptr [rsp+120H]
punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
@ -1961,11 +1943,11 @@ innerloop1:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
movaps xmm3, xmm13
mov r14d, eax
sar r14d, 16
pinsrw xmm3, eax, 6
pinsrw xmm3, r14d, 7
shl rax, 32
or rax, 64
movd xmm12, rax
movdqa xmm3, xmm13
punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm8, xmm4