mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2025-01-21 15:50:01 +01:00
The SSE2 patch introduced xmm10 as a temporary register for one of the rotations, but xmm6-xmm15 are callee-save registers on Windows, and SSE4.1 was only saving the registers it used. The minimal fix is to use one of the saved registers instead of xmm10. See https://github.com/BLAKE3-team/BLAKE3/issues/206.
2351 lines
69 KiB
NASM
2351 lines
69 KiB
NASM
; Symbols made visible to the linker. Three routines are declared, each under
; two spellings: the plain name and the same name with a leading underscore.
; NOTE(review): presumably the underscore variants exist so callers using
; either symbol-naming convention can link -- confirm against the callers.
public _blake3_hash_many_sse2
|
|
public blake3_hash_many_sse2
|
|
public blake3_compress_in_place_sse2
|
|
public _blake3_compress_in_place_sse2
|
|
public blake3_compress_xof_sse2
|
|
public _blake3_compress_xof_sse2
|
|
|
|
; Open the _TEXT segment: 16-byte segment alignment, class 'CODE'.
_TEXT SEGMENT ALIGN(16) 'CODE'
|
|
|
|
; Place the procedure label that follows on a 16-byte boundary.
ALIGN 16
|
|
blake3_hash_many_sse2 PROC
|
|
_blake3_hash_many_sse2 PROC
|
|
push r15
|
|
push r14
|
|
push r13
|
|
push r12
|
|
push rsi
|
|
push rdi
|
|
push rbx
|
|
push rbp
|
|
mov rbp, rsp
|
|
sub rsp, 528
|
|
and rsp, 0FFFFFFFFFFFFFFC0H
|
|
movdqa xmmword ptr [rsp+170H], xmm6
|
|
movdqa xmmword ptr [rsp+180H], xmm7
|
|
movdqa xmmword ptr [rsp+190H], xmm8
|
|
movdqa xmmword ptr [rsp+1A0H], xmm9
|
|
movdqa xmmword ptr [rsp+1B0H], xmm10
|
|
movdqa xmmword ptr [rsp+1C0H], xmm11
|
|
movdqa xmmword ptr [rsp+1D0H], xmm12
|
|
movdqa xmmword ptr [rsp+1E0H], xmm13
|
|
movdqa xmmword ptr [rsp+1F0H], xmm14
|
|
movdqa xmmword ptr [rsp+200H], xmm15
|
|
mov rdi, rcx
|
|
mov rsi, rdx
|
|
mov rdx, r8
|
|
mov rcx, r9
|
|
mov r8, qword ptr [rbp+68H]
|
|
movzx r9, byte ptr [rbp+70H]
|
|
neg r9d
|
|
movd xmm0, r9d
|
|
pshufd xmm0, xmm0, 00H
|
|
movdqa xmmword ptr [rsp+130H], xmm0
|
|
movdqa xmm1, xmm0
|
|
pand xmm1, xmmword ptr [ADD0]
|
|
pand xmm0, xmmword ptr [ADD1]
|
|
movdqa xmmword ptr [rsp+150H], xmm0
|
|
movd xmm0, r8d
|
|
pshufd xmm0, xmm0, 00H
|
|
paddd xmm0, xmm1
|
|
movdqa xmmword ptr [rsp+110H], xmm0
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
|
|
pcmpgtd xmm1, xmm0
|
|
shr r8, 32
|
|
movd xmm2, r8d
|
|
pshufd xmm2, xmm2, 00H
|
|
psubd xmm2, xmm1
|
|
movdqa xmmword ptr [rsp+120H], xmm2
|
|
mov rbx, qword ptr [rbp+90H]
|
|
mov r15, rdx
|
|
shl r15, 6
|
|
movzx r13d, byte ptr [rbp+78H]
|
|
movzx r12d, byte ptr [rbp+88H]
|
|
cmp rsi, 4
|
|
jc final3blocks
|
|
outerloop4:
|
|
movdqu xmm3, xmmword ptr [rcx]
|
|
pshufd xmm0, xmm3, 00H
|
|
pshufd xmm1, xmm3, 55H
|
|
pshufd xmm2, xmm3, 0AAH
|
|
pshufd xmm3, xmm3, 0FFH
|
|
movdqu xmm7, xmmword ptr [rcx+10H]
|
|
pshufd xmm4, xmm7, 00H
|
|
pshufd xmm5, xmm7, 55H
|
|
pshufd xmm6, xmm7, 0AAH
|
|
pshufd xmm7, xmm7, 0FFH
|
|
mov r8, qword ptr [rdi]
|
|
mov r9, qword ptr [rdi+8H]
|
|
mov r10, qword ptr [rdi+10H]
|
|
mov r11, qword ptr [rdi+18H]
|
|
movzx eax, byte ptr [rbp+80H]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
innerloop4:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movdqu xmm8, xmmword ptr [r8+rdx-40H]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-40H]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-40H]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-40H]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp], xmm8
|
|
movdqa xmmword ptr [rsp+10H], xmm9
|
|
movdqa xmmword ptr [rsp+20H], xmm12
|
|
movdqa xmmword ptr [rsp+30H], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-30H]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-30H]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-30H]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-30H]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+40H], xmm8
|
|
movdqa xmmword ptr [rsp+50H], xmm9
|
|
movdqa xmmword ptr [rsp+60H], xmm12
|
|
movdqa xmmword ptr [rsp+70H], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-20H]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-20H]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-20H]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-20H]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+80H], xmm8
|
|
movdqa xmmword ptr [rsp+90H], xmm9
|
|
movdqa xmmword ptr [rsp+0A0H], xmm12
|
|
movdqa xmmword ptr [rsp+0B0H], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-10H]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-10H]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-10H]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-10H]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+0C0H], xmm8
|
|
movdqa xmmword ptr [rsp+0D0H], xmm9
|
|
movdqa xmmword ptr [rsp+0E0H], xmm12
|
|
movdqa xmmword ptr [rsp+0F0H], xmm13
|
|
movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
|
|
movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
|
|
movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
|
|
movdqa xmm12, xmmword ptr [rsp+110H]
|
|
movdqa xmm13, xmmword ptr [rsp+120H]
|
|
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
|
|
movd xmm15, eax
|
|
pshufd xmm15, xmm15, 00H
|
|
prefetcht0 byte ptr [r8+rdx+80H]
|
|
prefetcht0 byte ptr [r9+rdx+80H]
|
|
prefetcht0 byte ptr [r10+rdx+80H]
|
|
prefetcht0 byte ptr [r11+rdx+80H]
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
paddd xmm1, xmmword ptr [rsp+20H]
|
|
paddd xmm2, xmmword ptr [rsp+40H]
|
|
paddd xmm3, xmmword ptr [rsp+60H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+10H]
|
|
paddd xmm1, xmmword ptr [rsp+30H]
|
|
paddd xmm2, xmmword ptr [rsp+50H]
|
|
paddd xmm3, xmmword ptr [rsp+70H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+80H]
|
|
paddd xmm1, xmmword ptr [rsp+0A0H]
|
|
paddd xmm2, xmmword ptr [rsp+0C0H]
|
|
paddd xmm3, xmmword ptr [rsp+0E0H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+90H]
|
|
paddd xmm1, xmmword ptr [rsp+0B0H]
|
|
paddd xmm2, xmmword ptr [rsp+0D0H]
|
|
paddd xmm3, xmmword ptr [rsp+0F0H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+20H]
|
|
paddd xmm1, xmmword ptr [rsp+30H]
|
|
paddd xmm2, xmmword ptr [rsp+70H]
|
|
paddd xmm3, xmmword ptr [rsp+40H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+60H]
|
|
paddd xmm1, xmmword ptr [rsp+0A0H]
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
paddd xmm3, xmmword ptr [rsp+0D0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+10H]
|
|
paddd xmm1, xmmword ptr [rsp+0C0H]
|
|
paddd xmm2, xmmword ptr [rsp+90H]
|
|
paddd xmm3, xmmword ptr [rsp+0F0H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0B0H]
|
|
paddd xmm1, xmmword ptr [rsp+50H]
|
|
paddd xmm2, xmmword ptr [rsp+0E0H]
|
|
paddd xmm3, xmmword ptr [rsp+80H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+30H]
|
|
paddd xmm1, xmmword ptr [rsp+0A0H]
|
|
paddd xmm2, xmmword ptr [rsp+0D0H]
|
|
paddd xmm3, xmmword ptr [rsp+70H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+40H]
|
|
paddd xmm1, xmmword ptr [rsp+0C0H]
|
|
paddd xmm2, xmmword ptr [rsp+20H]
|
|
paddd xmm3, xmmword ptr [rsp+0E0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+60H]
|
|
paddd xmm1, xmmword ptr [rsp+90H]
|
|
paddd xmm2, xmmword ptr [rsp+0B0H]
|
|
paddd xmm3, xmmword ptr [rsp+80H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+50H]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+0F0H]
|
|
paddd xmm3, xmmword ptr [rsp+10H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0A0H]
|
|
paddd xmm1, xmmword ptr [rsp+0C0H]
|
|
paddd xmm2, xmmword ptr [rsp+0E0H]
|
|
paddd xmm3, xmmword ptr [rsp+0D0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+70H]
|
|
paddd xmm1, xmmword ptr [rsp+90H]
|
|
paddd xmm2, xmmword ptr [rsp+30H]
|
|
paddd xmm3, xmmword ptr [rsp+0F0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+40H]
|
|
paddd xmm1, xmmword ptr [rsp+0B0H]
|
|
paddd xmm2, xmmword ptr [rsp+50H]
|
|
paddd xmm3, xmmword ptr [rsp+10H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
paddd xmm1, xmmword ptr [rsp+20H]
|
|
paddd xmm2, xmmword ptr [rsp+80H]
|
|
paddd xmm3, xmmword ptr [rsp+60H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0C0H]
|
|
paddd xmm1, xmmword ptr [rsp+90H]
|
|
paddd xmm2, xmmword ptr [rsp+0F0H]
|
|
paddd xmm3, xmmword ptr [rsp+0E0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0D0H]
|
|
paddd xmm1, xmmword ptr [rsp+0B0H]
|
|
paddd xmm2, xmmword ptr [rsp+0A0H]
|
|
paddd xmm3, xmmword ptr [rsp+80H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+70H]
|
|
paddd xmm1, xmmword ptr [rsp+50H]
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
paddd xmm3, xmmword ptr [rsp+60H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+20H]
|
|
paddd xmm1, xmmword ptr [rsp+30H]
|
|
paddd xmm2, xmmword ptr [rsp+10H]
|
|
paddd xmm3, xmmword ptr [rsp+40H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+90H]
|
|
paddd xmm1, xmmword ptr [rsp+0B0H]
|
|
paddd xmm2, xmmword ptr [rsp+80H]
|
|
paddd xmm3, xmmword ptr [rsp+0F0H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0E0H]
|
|
paddd xmm1, xmmword ptr [rsp+50H]
|
|
paddd xmm2, xmmword ptr [rsp+0C0H]
|
|
paddd xmm3, xmmword ptr [rsp+10H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0D0H]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+20H]
|
|
paddd xmm3, xmmword ptr [rsp+40H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+30H]
|
|
paddd xmm1, xmmword ptr [rsp+0A0H]
|
|
paddd xmm2, xmmword ptr [rsp+60H]
|
|
paddd xmm3, xmmword ptr [rsp+70H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0B0H]
|
|
paddd xmm1, xmmword ptr [rsp+50H]
|
|
paddd xmm2, xmmword ptr [rsp+10H]
|
|
paddd xmm3, xmmword ptr [rsp+80H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0F0H]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+90H]
|
|
paddd xmm3, xmmword ptr [rsp+60H]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0E0H]
|
|
paddd xmm1, xmmword ptr [rsp+20H]
|
|
paddd xmm2, xmmword ptr [rsp+30H]
|
|
paddd xmm3, xmmword ptr [rsp+70H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0B1H
|
|
pshufhw xmm15, xmm15, 0B1H
|
|
pshuflw xmm12, xmm12, 0B1H
|
|
pshufhw xmm12, xmm12, 0B1H
|
|
pshuflw xmm13, xmm13, 0B1H
|
|
pshufhw xmm13, xmm13, 0B1H
|
|
pshuflw xmm14, xmm14, 0B1H
|
|
pshufhw xmm14, xmm14, 0B1H
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+100H], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0A0H]
|
|
paddd xmm1, xmmword ptr [rsp+0C0H]
|
|
paddd xmm2, xmmword ptr [rsp+40H]
|
|
paddd xmm3, xmmword ptr [rsp+0D0H]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+100H]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
pxor xmm0, xmm8
|
|
pxor xmm1, xmm9
|
|
pxor xmm2, xmm10
|
|
pxor xmm3, xmm11
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
pxor xmm4, xmm12
|
|
pxor xmm5, xmm13
|
|
pxor xmm6, xmm14
|
|
pxor xmm7, xmm15
|
|
mov eax, r13d
|
|
jne innerloop4
|
|
movdqa xmm9, xmm0
|
|
punpckldq xmm0, xmm1
|
|
punpckhdq xmm9, xmm1
|
|
movdqa xmm11, xmm2
|
|
punpckldq xmm2, xmm3
|
|
punpckhdq xmm11, xmm3
|
|
movdqa xmm1, xmm0
|
|
punpcklqdq xmm0, xmm2
|
|
punpckhqdq xmm1, xmm2
|
|
movdqa xmm3, xmm9
|
|
punpcklqdq xmm9, xmm11
|
|
punpckhqdq xmm3, xmm11
|
|
movdqu xmmword ptr [rbx], xmm0
|
|
movdqu xmmword ptr [rbx+20H], xmm1
|
|
movdqu xmmword ptr [rbx+40H], xmm9
|
|
movdqu xmmword ptr [rbx+60H], xmm3
|
|
movdqa xmm9, xmm4
|
|
punpckldq xmm4, xmm5
|
|
punpckhdq xmm9, xmm5
|
|
movdqa xmm11, xmm6
|
|
punpckldq xmm6, xmm7
|
|
punpckhdq xmm11, xmm7
|
|
movdqa xmm5, xmm4
|
|
punpcklqdq xmm4, xmm6
|
|
punpckhqdq xmm5, xmm6
|
|
movdqa xmm7, xmm9
|
|
punpcklqdq xmm9, xmm11
|
|
punpckhqdq xmm7, xmm11
|
|
movdqu xmmword ptr [rbx+10H], xmm4
|
|
movdqu xmmword ptr [rbx+30H], xmm5
|
|
movdqu xmmword ptr [rbx+50H], xmm9
|
|
movdqu xmmword ptr [rbx+70H], xmm7
|
|
movdqa xmm1, xmmword ptr [rsp+110H]
|
|
movdqa xmm0, xmm1
|
|
paddd xmm1, xmmword ptr [rsp+150H]
|
|
movdqa xmmword ptr [rsp+110H], xmm1
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
|
|
pcmpgtd xmm0, xmm1
|
|
movdqa xmm1, xmmword ptr [rsp+120H]
|
|
psubd xmm1, xmm0
|
|
movdqa xmmword ptr [rsp+120H], xmm1
|
|
add rbx, 128
|
|
add rdi, 32
|
|
sub rsi, 4
|
|
cmp rsi, 4
|
|
jnc outerloop4
|
|
test rsi, rsi
|
|
jne final3blocks
|
|
unwind:
|
|
movdqa xmm6, xmmword ptr [rsp+170H]
|
|
movdqa xmm7, xmmword ptr [rsp+180H]
|
|
movdqa xmm8, xmmword ptr [rsp+190H]
|
|
movdqa xmm9, xmmword ptr [rsp+1A0H]
|
|
movdqa xmm10, xmmword ptr [rsp+1B0H]
|
|
movdqa xmm11, xmmword ptr [rsp+1C0H]
|
|
movdqa xmm12, xmmword ptr [rsp+1D0H]
|
|
movdqa xmm13, xmmword ptr [rsp+1E0H]
|
|
movdqa xmm14, xmmword ptr [rsp+1F0H]
|
|
movdqa xmm15, xmmword ptr [rsp+200H]
|
|
mov rsp, rbp
|
|
pop rbp
|
|
pop rbx
|
|
pop rdi
|
|
pop rsi
|
|
pop r12
|
|
pop r13
|
|
pop r14
|
|
pop r15
|
|
ret
|
|
ALIGN 16
|
|
final3blocks:
|
|
test esi, 2H
|
|
je final1block
|
|
movups xmm0, xmmword ptr [rcx]
|
|
movups xmm1, xmmword ptr [rcx+10H]
|
|
movaps xmm8, xmm0
|
|
movaps xmm9, xmm1
|
|
movd xmm13, dword ptr [rsp+110H]
|
|
movd xmm14, dword ptr [rsp+120H]
|
|
punpckldq xmm13, xmm14
|
|
movaps xmmword ptr [rsp], xmm13
|
|
movd xmm14, dword ptr [rsp+114H]
|
|
movd xmm13, dword ptr [rsp+124H]
|
|
punpckldq xmm14, xmm13
|
|
movaps xmmword ptr [rsp+10H], xmm14
|
|
mov r8, qword ptr [rdi]
|
|
mov r9, qword ptr [rdi+8H]
|
|
movzx eax, byte ptr [rbp+80H]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
innerloop2:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV]
|
|
movaps xmm10, xmm2
|
|
movups xmm4, xmmword ptr [r8+rdx-40H]
|
|
movups xmm5, xmmword ptr [r8+rdx-30H]
|
|
movaps xmm3, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm3, xmm5, 221
|
|
movaps xmm5, xmm3
|
|
movups xmm6, xmmword ptr [r8+rdx-20H]
|
|
movups xmm7, xmmword ptr [r8+rdx-10H]
|
|
movaps xmm3, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 93H
|
|
shufps xmm3, xmm7, 221
|
|
pshufd xmm7, xmm3, 93H
|
|
movups xmm12, xmmword ptr [r9+rdx-40H]
|
|
movups xmm13, xmmword ptr [r9+rdx-30H]
|
|
movaps xmm11, xmm12
|
|
shufps xmm12, xmm13, 136
|
|
shufps xmm11, xmm13, 221
|
|
movaps xmm13, xmm11
|
|
movups xmm14, xmmword ptr [r9+rdx-20H]
|
|
movups xmm15, xmmword ptr [r9+rdx-10H]
|
|
movaps xmm11, xmm14
|
|
shufps xmm14, xmm15, 136
|
|
pshufd xmm14, xmm14, 93H
|
|
shufps xmm11, xmm15, 221
|
|
pshufd xmm15, xmm11, 93H
|
|
shl rax, 20H
|
|
or rax, 40H
|
|
movd xmm3, rax
|
|
movdqa xmmword ptr [rsp+20H], xmm3
|
|
movaps xmm3, xmmword ptr [rsp]
|
|
movaps xmm11, xmmword ptr [rsp+10H]
|
|
punpcklqdq xmm3, xmmword ptr [rsp+20H]
|
|
punpcklqdq xmm11, xmmword ptr [rsp+20H]
|
|
mov al, 7
|
|
roundloop2:
|
|
paddd xmm0, xmm4
|
|
paddd xmm8, xmm12
|
|
movaps xmmword ptr [rsp+20H], xmm4
|
|
movaps xmmword ptr [rsp+30H], xmm12
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
pshuflw xmm3, xmm3, 0B1H
|
|
pshufhw xmm3, xmm3, 0B1H
|
|
pshuflw xmm11, xmm11, 0B1H
|
|
pshufhw xmm11, xmm11, 0B1H
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm4, 12
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 20
|
|
psrld xmm4, 12
|
|
por xmm9, xmm4
|
|
paddd xmm0, xmm5
|
|
paddd xmm8, xmm13
|
|
movaps xmmword ptr [rsp+40H], xmm5
|
|
movaps xmmword ptr [rsp+50H], xmm13
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
movdqa xmm13, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm13, 24
|
|
pxor xmm3, xmm13
|
|
movdqa xmm13, xmm11
|
|
psrld xmm11, 8
|
|
pslld xmm13, 24
|
|
pxor xmm11, xmm13
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm4, 7
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 25
|
|
psrld xmm4, 7
|
|
por xmm9, xmm4
|
|
pshufd xmm0, xmm0, 93H
|
|
pshufd xmm8, xmm8, 93H
|
|
pshufd xmm3, xmm3, 4EH
|
|
pshufd xmm11, xmm11, 4EH
|
|
pshufd xmm2, xmm2, 39H
|
|
pshufd xmm10, xmm10, 39H
|
|
paddd xmm0, xmm6
|
|
paddd xmm8, xmm14
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
pshuflw xmm3, xmm3, 0B1H
|
|
pshufhw xmm3, xmm3, 0B1H
|
|
pshuflw xmm11, xmm11, 0B1H
|
|
pshufhw xmm11, xmm11, 0B1H
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm4, 12
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 20
|
|
psrld xmm4, 12
|
|
por xmm9, xmm4
|
|
paddd xmm0, xmm7
|
|
paddd xmm8, xmm15
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
movdqa xmm13, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm13, 24
|
|
pxor xmm3, xmm13
|
|
movdqa xmm13, xmm11
|
|
psrld xmm11, 8
|
|
pslld xmm13, 24
|
|
pxor xmm11, xmm13
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm4, 7
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 25
|
|
psrld xmm4, 7
|
|
por xmm9, xmm4
|
|
pshufd xmm0, xmm0, 39H
|
|
pshufd xmm8, xmm8, 39H
|
|
pshufd xmm3, xmm3, 4EH
|
|
pshufd xmm11, xmm11, 4EH
|
|
pshufd xmm2, xmm2, 93H
|
|
pshufd xmm10, xmm10, 93H
|
|
dec al
|
|
je endroundloop2
|
|
movdqa xmm12, xmmword ptr [rsp+20H]
|
|
movdqa xmm5, xmmword ptr [rsp+40H]
|
|
pshufd xmm13, xmm12, 0FH
|
|
shufps xmm12, xmm5, 214
|
|
pshufd xmm4, xmm12, 39H
|
|
movdqa xmm12, xmm6
|
|
shufps xmm12, xmm7, 250
|
|
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
|
|
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
|
|
por xmm13, xmm12
|
|
movdqa xmmword ptr [rsp+20H], xmm13
|
|
movdqa xmm12, xmm7
|
|
punpcklqdq xmm12, xmm5
|
|
movdqa xmm13, xmm6
|
|
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
|
|
pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
|
|
por xmm12, xmm13
|
|
pshufd xmm12, xmm12, 78H
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 1EH
|
|
movdqa xmmword ptr [rsp+40H], xmm12
|
|
movdqa xmm5, xmmword ptr [rsp+30H]
|
|
movdqa xmm13, xmmword ptr [rsp+50H]
|
|
pshufd xmm6, xmm5, 0FH
|
|
shufps xmm5, xmm13, 214
|
|
pshufd xmm12, xmm5, 39H
|
|
movdqa xmm5, xmm14
|
|
shufps xmm5, xmm15, 250
|
|
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
|
|
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
|
|
por xmm6, xmm5
|
|
movdqa xmm5, xmm15
|
|
punpcklqdq xmm5, xmm13
|
|
movdqa xmmword ptr [rsp+30H], xmm2
|
|
movdqa xmm2, xmm14
|
|
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
|
|
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
|
|
por xmm5, xmm2
|
|
movdqa xmm2, xmmword ptr [rsp+30H]
|
|
pshufd xmm5, xmm5, 78H
|
|
punpckhdq xmm13, xmm15
|
|
punpckldq xmm14, xmm13
|
|
pshufd xmm15, xmm14, 1EH
|
|
movdqa xmm13, xmm6
|
|
movdqa xmm14, xmm5
|
|
movdqa xmm5, xmmword ptr [rsp+20H]
|
|
movdqa xmm6, xmmword ptr [rsp+40H]
|
|
jmp roundloop2
|
|
endroundloop2:
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
pxor xmm8, xmm10
|
|
pxor xmm9, xmm11
|
|
mov eax, r13d
|
|
cmp rdx, r15
|
|
jne innerloop2
|
|
movups xmmword ptr [rbx], xmm0
|
|
movups xmmword ptr [rbx+10H], xmm1
|
|
movups xmmword ptr [rbx+20H], xmm8
|
|
movups xmmword ptr [rbx+30H], xmm9
|
|
mov eax, dword ptr [rsp+130H]
|
|
neg eax
|
|
mov r10d, dword ptr [rsp+110H+8*rax]
|
|
mov r11d, dword ptr [rsp+120H+8*rax]
|
|
mov dword ptr [rsp+110H], r10d
|
|
mov dword ptr [rsp+120H], r11d
|
|
add rdi, 16
|
|
add rbx, 64
|
|
sub rsi, 2
|
|
final1block:
|
|
test esi, 1H
|
|
je unwind
|
|
movups xmm0, xmmword ptr [rcx]
|
|
movups xmm1, xmmword ptr [rcx+10H]
|
|
movd xmm13, dword ptr [rsp+110H]
|
|
movd xmm14, dword ptr [rsp+120H]
|
|
punpckldq xmm13, xmm14
|
|
mov r8, qword ptr [rdi]
|
|
movzx eax, byte ptr [rbp+80H]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
innerloop1:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV]
|
|
shl rax, 32
|
|
or rax, 64
|
|
movd xmm12, rax
|
|
movdqa xmm3, xmm13
|
|
punpcklqdq xmm3, xmm12
|
|
movups xmm4, xmmword ptr [r8+rdx-40H]
|
|
movups xmm5, xmmword ptr [r8+rdx-30H]
|
|
movaps xmm8, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm8, xmm5, 221
|
|
movaps xmm5, xmm8
|
|
movups xmm6, xmmword ptr [r8+rdx-20H]
|
|
movups xmm7, xmmword ptr [r8+rdx-10H]
|
|
movaps xmm8, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 93H
|
|
shufps xmm8, xmm7, 221
|
|
pshufd xmm7, xmm8, 93H
|
|
mov al, 7
|
|
roundloop1:
|
|
paddd xmm0, xmm4
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0B1H
|
|
pshufhw xmm3, xmm3, 0B1H
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm5
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 93H
|
|
pshufd xmm3, xmm3, 4EH
|
|
pshufd xmm2, xmm2, 39H
|
|
paddd xmm0, xmm6
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0B1H
|
|
pshufhw xmm3, xmm3, 0B1H
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm7
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 39H
|
|
pshufd xmm3, xmm3, 4EH
|
|
pshufd xmm2, xmm2, 93H
|
|
dec al
|
|
jz endroundloop1
|
|
movdqa xmm8, xmm4
|
|
shufps xmm8, xmm5, 214
|
|
pshufd xmm9, xmm4, 0FH
|
|
pshufd xmm4, xmm8, 39H
|
|
movdqa xmm8, xmm6
|
|
shufps xmm8, xmm7, 250
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
|
|
por xmm9, xmm8
|
|
movdqa xmm8, xmm7
|
|
punpcklqdq xmm8, xmm5
|
|
movdqa xmm10, xmm6
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
|
|
por xmm8, xmm10
|
|
pshufd xmm8, xmm8, 78H
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 1EH
|
|
movdqa xmm5, xmm9
|
|
movdqa xmm6, xmm8
|
|
jmp roundloop1
|
|
endroundloop1:
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
mov eax, r13d
|
|
cmp rdx, r15
|
|
jne innerloop1
|
|
movups xmmword ptr [rbx], xmm0
|
|
movups xmmword ptr [rbx+10H], xmm1
|
|
jmp unwind
|
|
_blake3_hash_many_sse2 ENDP
|
|
blake3_hash_many_sse2 ENDP
|
|
|
|
ALIGN   16
;-----------------------------------------------------------------------
; blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2
;
; Runs seven rounds of the compression function on one 64-byte block and
; writes the resulting 32 bytes back over the input state.
;
; ABI:   Microsoft x64
; In:    rcx            = pointer to 32 bytes of state (read, then overwritten)
;        rdx            = pointer to 64 bytes of message (read only, may be
;                         unaligned: it is loaded with movups)
;        r8b            = low byte used as the third word of state row 3
;        r9             = 64-bit value used as the first two words of row 3
;        [rsp+28H]      = fifth argument (at entry); its low byte becomes the
;                         fourth word of row 3
;        NOTE(review): in the BLAKE3 C sources these are presumably
;        cv, block, block_len, counter and flags - confirm against the header.
; Out:   32 bytes stored at [rcx]
; Clobb: rax, r8, xmm0-xmm5, flags
; Saved: xmm6-xmm9, xmm11, xmm14, xmm15 (callee-saved on Windows) are spilled
;        to the local frame and reloaded before returning.  xmm15 is never
;        written by this routine; its save slot is kept so the frame layout
;        stays as it was.
; Stack: 120 bytes = 7 * 16 bytes of spill space + 8 bytes that bring rsp
;        back to a 16-byte boundary, which the movdqa spills require.
;
; The entry point is aligned to 16 bytes, matching the other exported
; procedures in this file.
;
; Register roles inside the round loop:
;        xmm0..xmm3     state rows 0..3
;        xmm4..xmm7     message words, re-permuted after every round
;        xmm11, xmm14   scratch for the shift-based rotations
;        xmm8, xmm9     scratch for the message permutation (with xmm14)
;        al             rounds remaining
;-----------------------------------------------------------------------
blake3_compress_in_place_sse2 PROC
_blake3_compress_in_place_sse2 PROC
        ; --- prologue: spill the callee-saved vector registers we touch ---
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; --- build the 4x4 state ---
        movups  xmm0, xmmword ptr [rcx]             ; row 0 = state[0..3]
        movups  xmm1, xmmword ptr [rcx+10H]         ; row 1 = state[4..7]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]       ; row 2 = constants
        ; fifth argument: 28H at entry + 78H frame = 0A0H
        movzx   eax, byte ptr [rsp+0A0H]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax                             ; r8 = (arg5 << 32) | arg3
        movd    xmm3, r9                            ; 64-bit move (ml64 spelling)
        movd    xmm4, r8
        punpcklqdq xmm3, xmm4                       ; row 3 = r9.lo, r9.hi, arg3, arg5

        ; --- load the message and split it into even / odd words ---
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 88H                     ; xmm4 = m0, m2, m4, m6
        shufps  xmm8, xmm5, 0DDH
        movaps  xmm5, xmm8                          ; xmm5 = m1, m3, m5, m7
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 88H
        pshufd  xmm6, xmm6, 93H                     ; xmm6 = m14, m8, m10, m12
        shufps  xmm8, xmm7, 0DDH
        pshufd  xmm7, xmm8, 93H                     ; xmm7 = m15, m9, m11, m13

        mov     al, 7                               ; seven rounds

compress_in_place_round:
        ; --- first half-round: mix with xmm4 then xmm5 ---
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H                    ; swap 16-bit halves of each
        pshufhw xmm3, xmm3, 0B1H                    ;   dword = rotate by 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                         ; row 1 rotated right by 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                         ; row 3 rotated right by 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                         ; row 1 rotated right by 7

        ; rotate the lanes of rows 0, 3 and 2 by one, two and three positions
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H

        ; --- second half-round: mix with xmm6 then xmm7 ---
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        ; undo the lane rotation applied before the second half-round
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H

        dec     al
        jz      compress_in_place_finish

        ; --- permute the message words in xmm4..xmm7 for the next round ---
        ; Each pand/pand/por triple merges two registers under a pair of
        ; complementary lane masks (the PBLENDW_* constants).
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0D6H
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H                     ; next xmm4
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0FAH
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
        por     xmm9, xmm8                          ; next xmm5 (held in xmm9)
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm14, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
        por     xmm8, xmm14
        pshufd  xmm8, xmm8, 78H                     ; next xmm6 (held in xmm8)
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH                     ; next xmm7
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     compress_in_place_round

compress_in_place_finish:
        ; --- fold rows 2/3 into rows 0/1 and store the new state ---
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+10H], xmm1

        ; --- epilogue: reload the callee-saved registers, drop the frame ---
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_in_place_sse2 ENDP
blake3_compress_in_place_sse2 ENDP
|
|
|
|
ALIGN 16
|
|
;-----------------------------------------------------------------------
; blake3_compress_xof_sse2 / _blake3_compress_xof_sse2
;
; Runs seven rounds of the compression function on one 64-byte block and
; writes a 64-byte result to a separate output buffer.  The input state at
; [rcx] is only read.
;
; ABI:   Microsoft x64
; In:    rcx            = pointer to 32 bytes of state (read only)
;        rdx            = pointer to 64 bytes of message (loaded with movups)
;        r8b            = low byte used as the third word of state row 3
;        r9             = 64-bit value used as the first two words of row 3
;        [rsp+28H]      = fifth argument (at entry); low byte becomes the
;                         fourth word of row 3
;        [rsp+30H]      = sixth argument (at entry); pointer that receives
;                         the 64 output bytes
;        NOTE(review): in the BLAKE3 C sources these are presumably
;        cv, block, block_len, counter, flags and out - confirm.
; Out:   [out+00H] = row0 ^ row2        [out+20H] = row2 ^ state[0..3]
;        [out+10H] = row1 ^ row3        [out+30H] = row3 ^ state[4..7]
; Clobb: rax, r8, r10, xmm0-xmm5, flags
; Saved: xmm6-xmm9, xmm11, xmm14, xmm15 are spilled and reloaded; xmm15 is
;        never written here.
; Stack: 120 bytes = 7 * 16 spill bytes + 8 to keep the spills 16-aligned.
;
; Register roles inside the round loop:
;        xmm0..xmm3     state rows 0..3
;        xmm4..xmm7     message words, re-permuted after every round
;        xmm11, xmm14   scratch for the shift-based rotations
;        xmm8, xmm9     scratch for the message permutation (with xmm14)
;        al             rounds remaining
;-----------------------------------------------------------------------
blake3_compress_xof_sse2 PROC
_blake3_compress_xof_sse2 PROC
        ; --- prologue ---
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; --- build the 4x4 state ---
        movups  xmm0, xmmword ptr [rcx]             ; row 0 = state[0..3]
        movups  xmm1, xmmword ptr [rcx+10H]         ; row 1 = state[4..7]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]       ; row 2 = constants
        ; stack arguments: entry offset + 78H frame
        movzx   eax, byte ptr [rsp+0A0H]            ; fifth argument (byte)
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+0A8H]           ; sixth argument: output ptr
        shl     rax, 32
        add     r8, rax                             ; r8 = (arg5 << 32) | arg3
        movd    xmm3, r9                            ; 64-bit move (ml64 spelling)
        movd    xmm4, r8
        punpcklqdq xmm3, xmm4                       ; row 3 = r9.lo, r9.hi, arg3, arg5

        ; --- load the message and split it into even / odd words ---
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 88H                     ; xmm4 = m0, m2, m4, m6
        shufps  xmm8, xmm5, 0DDH
        movaps  xmm5, xmm8                          ; xmm5 = m1, m3, m5, m7
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 88H
        pshufd  xmm6, xmm6, 93H                     ; xmm6 = m14, m8, m10, m12
        shufps  xmm8, xmm7, 0DDH
        pshufd  xmm7, xmm8, 93H                     ; xmm7 = m15, m9, m11, m13

        mov     al, 7                               ; seven rounds

compress_xof_round:
        ; --- first half-round: mix with xmm4 then xmm5 ---
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H                    ; rotate each dword by 16
        pshufhw xmm3, xmm3, 0B1H
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                         ; rotate right by 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                         ; rotate right by 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                         ; rotate right by 7

        ; rotate the lanes of rows 0, 3 and 2 by one, two and three positions
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H

        ; --- second half-round: mix with xmm6 then xmm7 ---
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0B1H
        pshufhw xmm3, xmm3, 0B1H
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        ; undo the lane rotation applied before the second half-round
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H

        dec     al
        jz      compress_xof_finish

        ; --- permute the message words in xmm4..xmm7 for the next round ---
        ; pand/pand/por with complementary PBLENDW_* masks merges two
        ; registers lane by lane.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0D6H
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H                     ; next xmm4
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0FAH
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
        por     xmm9, xmm8                          ; next xmm5 (held in xmm9)
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm14, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
        por     xmm8, xmm14
        pshufd  xmm8, xmm8, 78H                     ; next xmm6 (held in xmm8)
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH                     ; next xmm7
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     compress_xof_round

compress_xof_finish:
        ; --- produce all four output rows ---
        movdqu  xmm4, xmmword ptr [rcx]             ; reload the input state
        movdqu  xmm5, xmmword ptr [rcx+10H]
        pxor    xmm0, xmm2                          ; out[0] = row0 ^ row2
        pxor    xmm1, xmm3                          ; out[1] = row1 ^ row3
        pxor    xmm2, xmm4                          ; out[2] = row2 ^ state[0..3]
        pxor    xmm3, xmm5                          ; out[3] = row3 ^ state[4..7]
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+10H], xmm1
        movups  xmmword ptr [r10+20H], xmm2
        movups  xmmword ptr [r10+30H], xmm3

        ; --- epilogue ---
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_xof_sse2 ENDP
blake3_compress_xof_sse2 ENDP
|
|
|
|
_TEXT ENDS
|
|
|
|
|
|
;-----------------------------------------------------------------------
; Read-only constants, placed in .rdata.
;
; The segment starts on a 64-byte boundary and every table below is a
; multiple of 16 bytes long, so each label is 16-byte aligned.  That
; matters because the code reads these tables with aligned instructions
; (movaps, and pand/pxor with memory operands).  Do not change the size
; or order of an entry without re-checking the alignment of the ones
; that follow it.
;-----------------------------------------------------------------------
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN   64

; Four 32-bit constants loaded as state row 2 by the compress routines.
BLAKE3_IV:
        dd      6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH

; Per-lane offsets 0..3.
ADD0:
        dd      0, 1, 2, 3

; The value 4 in every lane.
ADD1:
        dd      4, 4, 4, 4

; Each of the four BLAKE3_IV words broadcast to all four lanes.
BLAKE3_IV_0:
        dd      6A09E667H, 6A09E667H, 6A09E667H, 6A09E667H

BLAKE3_IV_1:
        dd      0BB67AE85H, 0BB67AE85H, 0BB67AE85H, 0BB67AE85H

BLAKE3_IV_2:
        dd      3C6EF372H, 3C6EF372H, 3C6EF372H, 3C6EF372H

BLAKE3_IV_3:
        dd      0A54FF53AH, 0A54FF53AH, 0A54FF53AH, 0A54FF53AH

; The value 64 in every lane.
BLAKE3_BLOCK_LEN:
        dd      64, 64, 64, 64

; Sign bit in every lane; XORing both operands with it lets the signed
; pcmpgtd act as an unsigned compare.  Eight dwords (32 bytes) are emitted;
; the references visible in this file load it as a 16-byte xmmword.
CMP_MSB_MASK:
        dd      8 dup (80000000H)

; Lane-select masks used with pand/por to merge two registers.
; NOTE(review): the names suggest they stand in for the matching pblendw
; immediates, which are not available in SSE2 - confirm.
PBLENDW_0x33_MASK:                              ; keep lanes 0 and 2
        dd      0FFFFFFFFH, 0, 0FFFFFFFFH, 0
PBLENDW_0xCC_MASK:                              ; keep lanes 1 and 3
        dd      0, 0FFFFFFFFH, 0, 0FFFFFFFFH
PBLENDW_0x3F_MASK:                              ; keep lanes 0, 1 and 2
        dd      0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 0
PBLENDW_0xC0_MASK:                              ; keep lane 3
        dd      0, 0, 0, 0FFFFFFFFH

_RDATA ENDS
|
|
END
|