1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-04-25 06:35:17 +02:00
BLAKE3/c/blake3_sse2_x86-64_windows_...
Jack O'Connor 371b5483c9 fix incorrect output / undefined behavior in Windows SSE2 assembly
The SSE2 patch introduced xmm10 as a temporary register for one of the
rotations, but xmm6-xmm15 are callee-save registers on Windows, and
SSE4.1 was only saving the registers it used. The minimal fix is to use
one of the saved registers instead of xmm10.

See https://github.com/BLAKE3-team/BLAKE3/issues/206.
2021-11-05 12:25:44 -04:00

2351 lines
69 KiB
NASM

public _blake3_hash_many_sse2
public blake3_hash_many_sse2
public blake3_compress_in_place_sse2
public _blake3_compress_in_place_sse2
public blake3_compress_xof_sse2
public _blake3_compress_xof_sse2
_TEXT SEGMENT ALIGN(16) 'CODE'
ALIGN 16
blake3_hash_many_sse2 PROC
_blake3_hash_many_sse2 PROC
push r15
push r14
push r13
push r12
push rsi
push rdi
push rbx
push rbp
mov rbp, rsp
sub rsp, 528
and rsp, 0FFFFFFFFFFFFFFC0H
movdqa xmmword ptr [rsp+170H], xmm6
movdqa xmmword ptr [rsp+180H], xmm7
movdqa xmmword ptr [rsp+190H], xmm8
movdqa xmmword ptr [rsp+1A0H], xmm9
movdqa xmmword ptr [rsp+1B0H], xmm10
movdqa xmmword ptr [rsp+1C0H], xmm11
movdqa xmmword ptr [rsp+1D0H], xmm12
movdqa xmmword ptr [rsp+1E0H], xmm13
movdqa xmmword ptr [rsp+1F0H], xmm14
movdqa xmmword ptr [rsp+200H], xmm15
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
mov r8, qword ptr [rbp+68H]
movzx r9, byte ptr [rbp+70H]
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 00H
movdqa xmmword ptr [rsp+130H], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0]
pand xmm0, xmmword ptr [ADD1]
movdqa xmmword ptr [rsp+150H], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 00H
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+110H], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 00H
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+120H], xmm2
mov rbx, qword ptr [rbp+90H]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+78H]
movzx r12d, byte ptr [rbp+88H]
cmp rsi, 4
jc final3blocks
outerloop4:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 00H
pshufd xmm1, xmm3, 55H
pshufd xmm2, xmm3, 0AAH
pshufd xmm3, xmm3, 0FFH
movdqu xmm7, xmmword ptr [rcx+10H]
pshufd xmm4, xmm7, 00H
pshufd xmm5, xmm7, 55H
pshufd xmm6, xmm7, 0AAH
pshufd xmm7, xmm7, 0FFH
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
mov r10, qword ptr [rdi+10H]
mov r11, qword ptr [rdi+18H]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop4:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-40H]
movdqu xmm9, xmmword ptr [r9+rdx-40H]
movdqu xmm10, xmmword ptr [r10+rdx-40H]
movdqu xmm11, xmmword ptr [r11+rdx-40H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+10H], xmm9
movdqa xmmword ptr [rsp+20H], xmm12
movdqa xmmword ptr [rsp+30H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-30H]
movdqu xmm9, xmmword ptr [r9+rdx-30H]
movdqu xmm10, xmmword ptr [r10+rdx-30H]
movdqu xmm11, xmmword ptr [r11+rdx-30H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+40H], xmm8
movdqa xmmword ptr [rsp+50H], xmm9
movdqa xmmword ptr [rsp+60H], xmm12
movdqa xmmword ptr [rsp+70H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-20H]
movdqu xmm9, xmmword ptr [r9+rdx-20H]
movdqu xmm10, xmmword ptr [r10+rdx-20H]
movdqu xmm11, xmmword ptr [r11+rdx-20H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+80H], xmm8
movdqa xmmword ptr [rsp+90H], xmm9
movdqa xmmword ptr [rsp+0A0H], xmm12
movdqa xmmword ptr [rsp+0B0H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-10H]
movdqu xmm9, xmmword ptr [r9+rdx-10H]
movdqu xmm10, xmmword ptr [r10+rdx-10H]
movdqu xmm11, xmmword ptr [r11+rdx-10H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0C0H], xmm8
movdqa xmmword ptr [rsp+0D0H], xmm9
movdqa xmmword ptr [rsp+0E0H], xmm12
movdqa xmmword ptr [rsp+0F0H], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
movdqa xmm12, xmmword ptr [rsp+110H]
movdqa xmm13, xmmword ptr [rsp+120H]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
movd xmm15, eax
pshufd xmm15, xmm15, 00H
prefetcht0 byte ptr [r8+rdx+80H]
prefetcht0 byte ptr [r9+rdx+80H]
prefetcht0 byte ptr [r10+rdx+80H]
prefetcht0 byte ptr [r11+rdx+80H]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+40H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+10H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+50H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+80H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+0C0H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+90H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+0D0H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+20H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+70H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+60H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+10H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+90H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0B0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+0E0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+30H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+0D0H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+40H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+20H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+60H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+0B0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+50H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0F0H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0A0H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+0E0H]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+70H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+30H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+40H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+50H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+80H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0C0H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+0F0H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0D0H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+0A0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+70H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+20H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+10H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+90H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+80H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0E0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+0C0H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0D0H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+20H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+30H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+60H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0B0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+10H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0F0H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+90H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0E0H]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+30H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
pshuflw xmm15, xmm15, 0B1H
pshufhw xmm15, xmm15, 0B1H
pshuflw xmm12, xmm12, 0B1H
pshufhw xmm12, xmm12, 0B1H
pshuflw xmm13, xmm13, 0B1H
pshufhw xmm13, xmm13, 0B1H
pshuflw xmm14, xmm14, 0B1H
pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0A0H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+40H]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmm15
psrld xmm15, 8
pslld xmm8, 24
pxor xmm15, xmm8
movdqa xmm8, xmm12
psrld xmm12, 8
pslld xmm8, 24
pxor xmm12, xmm8
movdqa xmm8, xmm13
psrld xmm13, 8
pslld xmm8, 24
pxor xmm13, xmm8
movdqa xmm8, xmm14
psrld xmm14, 8
pslld xmm8, 24
pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne innerloop4
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+20H], xmm1
movdqu xmmword ptr [rbx+40H], xmm9
movdqu xmmword ptr [rbx+60H], xmm3
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
movdqu xmmword ptr [rbx+10H], xmm4
movdqu xmmword ptr [rbx+30H], xmm5
movdqu xmmword ptr [rbx+50H], xmm9
movdqu xmmword ptr [rbx+70H], xmm7
movdqa xmm1, xmmword ptr [rsp+110H]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+150H]
movdqa xmmword ptr [rsp+110H], xmm1
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
pcmpgtd xmm0, xmm1
movdqa xmm1, xmmword ptr [rsp+120H]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+120H], xmm1
add rbx, 128
add rdi, 32
sub rsi, 4
cmp rsi, 4
jnc outerloop4
test rsi, rsi
jne final3blocks
unwind:
movdqa xmm6, xmmword ptr [rsp+170H]
movdqa xmm7, xmmword ptr [rsp+180H]
movdqa xmm8, xmmword ptr [rsp+190H]
movdqa xmm9, xmmword ptr [rsp+1A0H]
movdqa xmm10, xmmword ptr [rsp+1B0H]
movdqa xmm11, xmmword ptr [rsp+1C0H]
movdqa xmm12, xmmword ptr [rsp+1D0H]
movdqa xmm13, xmmword ptr [rsp+1E0H]
movdqa xmm14, xmmword ptr [rsp+1F0H]
movdqa xmm15, xmmword ptr [rsp+200H]
mov rsp, rbp
pop rbp
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
ret
ALIGN 16
final3blocks:
test esi, 2H
je final1block
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+110H]
movd xmm14, dword ptr [rsp+120H]
punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+114H]
movd xmm13, dword ptr [rsp+124H]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+10H], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
movaps xmm10, xmm2
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
movups xmm6, xmmword ptr [r8+rdx-20H]
movups xmm7, xmmword ptr [r8+rdx-10H]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 93H
movups xmm12, xmmword ptr [r9+rdx-40H]
movups xmm13, xmmword ptr [r9+rdx-30H]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-20H]
movups xmm15, xmmword ptr [r9+rdx-10H]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 93H
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 93H
shl rax, 20H
or rax, 40H
movd xmm3, rax
movdqa xmmword ptr [rsp+20H], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+10H]
punpcklqdq xmm3, xmmword ptr [rsp+20H]
punpcklqdq xmm11, xmmword ptr [rsp+20H]
mov al, 7
roundloop2:
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+20H], xmm4
movaps xmmword ptr [rsp+30H], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
pshuflw xmm11, xmm11, 0B1H
pshufhw xmm11, xmm11, 0B1H
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+40H], xmm5
movaps xmmword ptr [rsp+50H], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movdqa xmm13, xmm3
psrld xmm3, 8
pslld xmm13, 24
pxor xmm3, xmm13
movdqa xmm13, xmm11
psrld xmm11, 8
pslld xmm13, 24
pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 93H
pshufd xmm8, xmm8, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm11, xmm11, 4EH
pshufd xmm2, xmm2, 39H
pshufd xmm10, xmm10, 39H
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
pshuflw xmm11, xmm11, 0B1H
pshufhw xmm11, xmm11, 0B1H
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movdqa xmm13, xmm3
psrld xmm3, 8
pslld xmm13, 24
pxor xmm3, xmm13
movdqa xmm13, xmm11
psrld xmm11, 8
pslld xmm13, 24
pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 39H
pshufd xmm8, xmm8, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm11, xmm11, 4EH
pshufd xmm2, xmm2, 93H
pshufd xmm10, xmm10, 93H
dec al
je endroundloop2
movdqa xmm12, xmmword ptr [rsp+20H]
movdqa xmm5, xmmword ptr [rsp+40H]
pshufd xmm13, xmm12, 0FH
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 39H
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm13, xmm12
movdqa xmmword ptr [rsp+20H], xmm13
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
movdqa xmm13, xmm6
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm12, xmm13
pshufd xmm12, xmm12, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmmword ptr [rsp+40H], xmm12
movdqa xmm5, xmmword ptr [rsp+30H]
movdqa xmm13, xmmword ptr [rsp+50H]
pshufd xmm6, xmm5, 0FH
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 39H
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
movdqa xmmword ptr [rsp+30H], xmm2
movdqa xmm2, xmm14
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+30H]
pshufd xmm5, xmm5, 78H
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 1EH
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+20H]
movdqa xmm6, xmmword ptr [rsp+40H]
jmp roundloop2
endroundloop2:
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
mov eax, r13d
cmp rdx, r15
jne innerloop2
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+10H], xmm1
movups xmmword ptr [rbx+20H], xmm8
movups xmmword ptr [rbx+30H], xmm9
mov eax, dword ptr [rsp+130H]
neg eax
mov r10d, dword ptr [rsp+110H+8*rax]
mov r11d, dword ptr [rsp+120H+8*rax]
mov dword ptr [rsp+110H], r10d
mov dword ptr [rsp+120H], r11d
add rdi, 16
add rbx, 64
sub rsi, 2
final1block:
test esi, 1H
je unwind
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movd xmm13, dword ptr [rsp+110H]
movd xmm14, dword ptr [rsp+120H]
punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop1:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
shl rax, 32
or rax, 64
movd xmm12, rax
movdqa xmm3, xmm13
punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [r8+rdx-20H]
movups xmm7, xmmword ptr [r8+rdx-10H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
mov al, 7
roundloop1:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz endroundloop1
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp roundloop1
endroundloop1:
pxor xmm0, xmm2
pxor xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne innerloop1
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+10H], xmm1
jmp unwind
_blake3_hash_many_sse2 ENDP
blake3_hash_many_sse2 ENDP
blake3_compress_in_place_sse2 PROC
_blake3_compress_in_place_sse2 PROC
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
shl rax, 32
add r8, rax
movd xmm3, r9
movd xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rcx], xmm0
movups xmmword ptr [rcx+10H], xmm1
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_in_place_sse2 ENDP
blake3_compress_in_place_sse2 ENDP
ALIGN 16
blake3_compress_xof_sse2 PROC
_blake3_compress_xof_sse2 PROC
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
mov r10, qword ptr [rsp+0A8H]
shl rax, 32
add r8, rax
movd xmm3, r9
movd xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshuflw xmm3, xmm3, 0B1H
pshufhw xmm3, xmm3, 0B1H
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
movdqa xmm14, xmm3
psrld xmm3, 8
pslld xmm14, 24
pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
movdqu xmm4, xmmword ptr [rcx]
movdqu xmm5, xmmword ptr [rcx+10H]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r10], xmm0
movups xmmword ptr [r10+10H], xmm1
movups xmmword ptr [r10+20H], xmm2
movups xmmword ptr [r10+30H], xmm3
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_xof_sse2 ENDP
blake3_compress_xof_sse2 ENDP
_TEXT ENDS
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN 64
BLAKE3_IV:
dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
ADD0:
dd 0, 1, 2, 3
ADD1:
dd 4 dup (4)
BLAKE3_IV_0:
dd 4 dup (6A09E667H)
BLAKE3_IV_1:
dd 4 dup (0BB67AE85H)
BLAKE3_IV_2:
dd 4 dup (3C6EF372H)
BLAKE3_IV_3:
dd 4 dup (0A54FF53AH)
BLAKE3_BLOCK_LEN:
dd 4 dup (64)
CMP_MSB_MASK:
dd 8 dup(80000000H)
PBLENDW_0x33_MASK:
dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
PBLENDW_0xCC_MASK:
dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
PBLENDW_0x3F_MASK:
dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
PBLENDW_0xC0_MASK:
dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
_RDATA ENDS
END