Samuel Neves
a830ab2661
streamline load_counters
...
avx2 before:
# NOTE(review): inputs appear to be rdi (64-bit value), esi (flag) and rdx
# (destination pointer) - presumably the arguments of load_counters; confirm
# against the C source. The listing stops before ymm2/ymm0 are consumed.
#
# rax = -(zero-extended esi): 0 for esi == 0, all-ones for esi == 1. It is
# broadcast to the four qword lanes of ymm0 and ANDed with the .LCPI1_0
# constant (whose contents are not part of this listing).
mov eax, esi
neg rax
vmovq xmm0, rax
vpbroadcastq ymm0, xmm0
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
# ymm1 = rdi broadcast to four qwords, plus the masked constants (64-bit adds).
vmovq xmm2, rdi
vpbroadcastq ymm1, xmm2
vpaddq ymm1, ymm0, ymm1
# Indices 0,2,4,6 gather the low dword of each qword lane of ymm1 into xmm3.
vmovdqa ymm0, ymmword ptr [rip + .LCPI1_1] # ymm0 = [0,2,4,6,4,6,6,7]
vpermd ymm3, ymm0, ymm1
# Scalar path for three more 64-bit sums: r8 = rdi + (eax & 5),
# rsi = rdi + (eax & 6), rax = rdi + (eax & 7), where eax still holds the low
# half of the negated flag.
mov r8d, eax
and r8d, 5
add r8, rdi
mov esi, eax
and esi, 6
add rsi, rdi
and eax, 7
# xmm4 = [xmm3[3], r8d, esi, eax]; the add that completes rax is interleaved.
vpshufd xmm4, xmm3, 231 # xmm4 = xmm3[3,1,2,3]
vpinsrd xmm4, xmm4, r8d, 1
add rax, rdi
vpinsrd xmm4, xmm4, esi, 2
vpinsrd xmm4, xmm4, eax, 3
# xmm3 = [edi, xmm3[0], xmm3[1], xmm3[2]].
vpshufd xmm3, xmm3, 144 # xmm3 = xmm3[0,0,1,2]
vpinsrd xmm3, xmm3, edi, 0
# Store the eight low dwords to [rdx] .. [rdx + 31]; vmovdqa requires the
# destination to be 16-byte aligned.
vmovdqa xmmword ptr [rdx], xmm3
vmovdqa xmmword ptr [rdx + 16], xmm4
# High halves, first group: build qwords [rdi, ymm1[0], ymm1[1], ymm1[2]],
# shift each right by 32, then compact with the same index vector in ymm0.
vpermq ymm3, ymm1, 144 # ymm3 = ymm1[0,0,1,2]
vpblendd ymm2, ymm3, ymm2, 3 # ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
vpsrlq ymm2, ymm2, 32
vpermd ymm2, ymm0, ymm2
# High halves, second group: build qwords [ymm1[3], r8, rsi, rax] from the
# upper 128 bits of ymm1 and the scalar sums, then shift and compact likewise.
vextracti128 xmm1, ymm1, 1
vmovq xmm3, rax
vmovq xmm4, rsi
vpunpcklqdq xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0]
vmovq xmm4, r8
vpalignr xmm1, xmm4, xmm1, 8 # xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
vinserti128 ymm1, ymm1, xmm3, 1
vpsrlq ymm1, ymm1, 32
vpermd ymm0, ymm0, ymm1
avx2 after:
# NOTE(review): inputs appear to be rdi (64-bit value) and esi (flag) -
# presumably the arguments of load_counters; confirm against the C source.
# At the end of the listing ymm1 holds the eight 32-bit low words and ymm0 the
# eight 32-bit high words; no stores are shown.
#
# esi = -esi (0 for 0, all-ones for 1), broadcast to all eight dword lanes.
neg esi
vmovd xmm0, esi
vpbroadcastd ymm0, xmm0
# ymm1 = low 32 bits of rdi in every lane.
vmovd xmm1, edi
vpbroadcastd ymm1, xmm1
# ymm0 = per-lane addend: the .LCPI0_0 constant (not part of this listing)
# gated by the broadcast mask. ymm1 = low word + addend, wrapping at 32 bits.
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
vpaddd ymm1, ymm1, ymm0
# Carry detection: AVX2 only has a signed dword compare, so both operands are
# biased by 0x80000000 to get an unsigned "addend > sum" test, which is true
# exactly when the 32-bit add wrapped. The addend is biased with OR rather
# than XOR, which is equivalent only if bit 31 of every addend is clear -
# TODO confirm against the .LCPI0_0 values.
vpbroadcastd ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
vpor ymm0, ymm0, ymm2
vpxor ymm2, ymm1, ymm2
vpcmpgtd ymm0, ymm0, ymm2
# ymm2 = high 32 bits of rdi in every lane.
shr rdi, 32
vmovd xmm2, edi
vpbroadcastd ymm2, xmm2
# The compare result is -1 in lanes that carried and 0 elsewhere, so
# subtracting it adds the carry into the high word.
vpsubd ymm0, ymm2, ymm0
2020-01-23 12:17:43 +00:00