From 47e415c7f19d97b3a39720f9c892288e82d4bd99 Mon Sep 17 00:00:00 2001 From: Matthew Krupcale Date: Mon, 31 Aug 2020 00:21:47 -0400 Subject: [PATCH] C: asm: simplify pinsrd emulation Use punpckl{,q}dq instead of pinsrw. --- c/blake3_sse2_x86-64_unix.S | 52 +++++++++------------------ c/blake3_sse2_x86-64_windows_gnu.S | 52 +++++++++------------------ c/blake3_sse2_x86-64_windows_msvc.asm | 52 +++++++++------------------ 3 files changed, 51 insertions(+), 105 deletions(-) diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S index 2dcf879..8b26125 100644 --- a/c/blake3_sse2_x86-64_unix.S +++ b/c/blake3_sse2_x86-64_unix.S @@ -1656,24 +1656,12 @@ blake3_hash_many_sse2: movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] - mov eax, dword ptr [rsp+0x120] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+0x120], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm14, word ptr [rsp+0x124], 2 - pinsrw xmm14, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm14, eax, 5 + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] @@ -1714,14 +1702,14 @@ blake3_hash_many_sse2: pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movd xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 - pinsrw xmm11, eax, 6 - pinsrw xmm11, r14d, 7 + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 @@ -1930,14 +1918,8 @@ blake3_hash_many_sse2: movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] - mov eax, dword ptr [rsp+0x120] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+0x120], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d @@ -1949,11 +1931,11 @@ blake3_hash_many_sse2: cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S index a01c23c..b2ee40d 100644 --- a/c/blake3_sse2_x86-64_windows_gnu.S +++ b/c/blake3_sse2_x86-64_windows_gnu.S @@ -1667,24 +1667,12 @@ blake3_hash_many_sse2: movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] - mov eax, dword ptr [rsp+0x120] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+0x120], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm14, word ptr [rsp+0x124], 2 - pinsrw xmm14, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm14, eax, 5 + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] @@ -1725,14 +1713,14 @@ blake3_hash_many_sse2: pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movd xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 - pinsrw xmm11, eax, 6 - pinsrw xmm11, r14d, 7 + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 @@ -1941,14 +1929,8 @@ blake3_hash_many_sse2: movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] - mov eax, dword ptr [rsp+0x120] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+0x120], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d @@ -1960,11 +1942,11 @@ blake3_hash_many_sse2: cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm index da510d8..70a3044 100644 --- a/c/blake3_sse2_x86-64_windows_msvc.asm +++ b/c/blake3_sse2_x86-64_windows_msvc.asm @@ -1668,24 +1668,12 @@ final3blocks: movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] - mov eax, dword ptr [rsp+120H] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+120H], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] - mov eax, dword ptr [BLAKE3_BLOCK_LEN] - sar eax, 16 - pinsrw xmm14, word ptr [rsp+124H], 2 - pinsrw xmm14, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN] - sar eax, 16 - pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN], 4 - pinsrw xmm14, eax, 5 + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] @@ -1726,14 +1714,14 @@ innerloop2: pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 - pinsrw xmm11, eax, 6 - pinsrw xmm11, r14d, 7 + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] mov al, 7 roundloop2: paddd xmm0, xmm4 @@ -1942,14 +1930,8 @@ final1block: movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] - mov eax, dword ptr [rsp+120H] - sar eax, 16 - pinsrw xmm13, word ptr [rsp+120H], 2 - pinsrw xmm13, eax, 3 - mov eax, dword ptr [BLAKE3_BLOCK_LEN] - sar eax, 16 - pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 - pinsrw xmm13, eax, 5 + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d @@ -1961,11 +1943,11 @@ innerloop1: cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] - movaps xmm3, xmm13 - mov r14d, eax - sar r14d, 16 - pinsrw xmm3, eax, 6 - pinsrw xmm3, r14d, 7 + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4