public _blake3_hash_many_sse2 public blake3_hash_many_sse2 public blake3_compress_in_place_sse2 public _blake3_compress_in_place_sse2 public blake3_compress_xof_sse2 public _blake3_compress_xof_sse2 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_sse2 PROC _blake3_hash_many_sse2 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0FFFFFFFFFFFFFFC0H movdqa xmmword ptr [rsp+170H], xmm6 movdqa xmmword ptr [rsp+180H], xmm7 movdqa xmmword ptr [rsp+190H], xmm8 movdqa xmmword ptr [rsp+1A0H], xmm9 movdqa xmmword ptr [rsp+1B0H], xmm10 movdqa xmmword ptr [rsp+1C0H], xmm11 movdqa xmmword ptr [rsp+1D0H], xmm12 movdqa xmmword ptr [rsp+1E0H], xmm13 movdqa xmmword ptr [rsp+1F0H], xmm14 movdqa xmmword ptr [rsp+200H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 00H movdqa xmmword ptr [rsp+130H], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0] pand xmm0, xmmword ptr [ADD1] movdqa xmmword ptr [rsp+150H], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 00H paddd xmm0, xmm1 movdqa xmmword ptr [rsp+110H], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 00H psubd xmm2, xmm1 movdqa xmmword ptr [rsp+120H], xmm2 mov rbx, qword ptr [rbp+90H] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] cmp rsi, 4 jc final3blocks outerloop4: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 00H pshufd xmm1, xmm3, 55H pshufd xmm2, xmm3, 0AAH pshufd xmm3, xmm3, 0FFH movdqu xmm7, xmmword ptr [rcx+10H] pshufd xmm4, xmm7, 00H pshufd xmm5, xmm7, 55H pshufd xmm6, xmm7, 0AAH pshufd xmm7, xmm7, 0FFH mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-40H] movdqu xmm9, xmmword ptr [r9+rdx-40H] movdqu xmm10, xmmword ptr [r10+rdx-40H] movdqu xmm11, xmmword ptr [r11+rdx-40H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+10H], xmm9 movdqa xmmword ptr [rsp+20H], xmm12 movdqa xmmword ptr [rsp+30H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-30H] movdqu xmm9, xmmword ptr [r9+rdx-30H] movdqu xmm10, xmmword ptr [r10+rdx-30H] movdqu xmm11, xmmword ptr [r11+rdx-30H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+40H], xmm8 movdqa xmmword ptr [rsp+50H], xmm9 movdqa xmmword ptr [rsp+60H], xmm12 movdqa xmmword ptr [rsp+70H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-20H] movdqu xmm9, xmmword ptr [r9+rdx-20H] movdqu xmm10, xmmword ptr [r10+rdx-20H] movdqu xmm11, xmmword ptr [r11+rdx-20H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+80H], xmm8 movdqa xmmword ptr [rsp+90H], xmm9 movdqa xmmword ptr [rsp+0A0H], xmm12 movdqa xmmword ptr [rsp+0B0H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-10H] movdqu xmm9, xmmword ptr [r9+rdx-10H] movdqu xmm10, xmmword ptr [r10+rdx-10H] movdqu xmm11, xmmword ptr [r11+rdx-10H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0C0H], xmm8 movdqa xmmword ptr [rsp+0D0H], xmm9 movdqa xmmword ptr [rsp+0E0H], xmm12 movdqa xmmword ptr [rsp+0F0H], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] movdqa xmm10, xmmword ptr [BLAKE3_IV_2] movdqa xmm11, xmmword ptr [BLAKE3_IV_3] movdqa xmm12, xmmword ptr [rsp+110H] movdqa xmm13, xmmword ptr [rsp+120H] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] movd xmm15, eax pshufd xmm15, xmm15, 00H prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+80H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+70H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0B0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+50H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0C0H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0A0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+60H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0F0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne innerloop4 movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+20H], xmm1 movdqu xmmword ptr [rbx+40H], xmm9 movdqu xmmword ptr [rbx+60H], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+10H], xmm4 movdqu xmmword ptr [rbx+30H], xmm5 movdqu xmmword ptr [rbx+50H], xmm9 movdqu xmmword ptr [rbx+70H], xmm7 movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+150H] movdqa xmmword ptr [rsp+110H], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+120H] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+120H], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc outerloop4 test rsi, rsi jne final3blocks unwind: movdqa xmm6, xmmword ptr [rsp+170H] movdqa xmm7, xmmword ptr [rsp+180H] movdqa xmm8, xmmword ptr [rsp+190H] movdqa xmm9, xmmword ptr [rsp+1A0H] movdqa xmm10, xmmword ptr [rsp+1B0H] movdqa xmm11, xmmword ptr [rsp+1C0H] movdqa xmm12, xmmword ptr [rsp+1D0H] movdqa xmm13, xmmword ptr [rsp+1E0H] movdqa xmm14, xmmword ptr [rsp+1F0H] movdqa xmm15, xmmword ptr [rsp+200H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final3blocks: test esi, 2H je final1block movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] movd xmm13, dword ptr [rsp+124H] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 93H movups xmm12, xmmword ptr [r9+rdx-40H] movups xmm13, xmmword ptr [r9+rdx-30H] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-20H] movups xmm15, xmmword ptr [r9+rdx-10H] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H shl rax, 20H or rax, 40H movd xmm3, rax movdqa xmmword ptr [rsp+20H], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] punpcklqdq xmm3, xmmword ptr [rsp+20H] punpcklqdq xmm11, xmmword ptr [rsp+20H] mov al, 7 roundloop2: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+20H], xmm4 movaps xmmword ptr [rsp+30H], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+40H], xmm5 movaps xmmword ptr [rsp+50H], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 93H pshufd xmm8, xmm8, 93H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 39H pshufd xmm10, xmm10, 39H paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 39H pshufd xmm8, xmm8, 39H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 93H pshufd xmm10, xmm10, 93H dec al je endroundloop2 movdqa xmm12, xmmword ptr [rsp+20H] movdqa xmm5, xmmword ptr [rsp+40H] pshufd xmm13, xmm12, 0FH shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] por xmm13, xmm12 movdqa xmmword ptr [rsp+20H], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] por xmm12, xmm13 pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmmword ptr [rsp+40H], xmm12 movdqa xmm5, xmmword ptr [rsp+30H] movdqa xmm13, xmmword ptr [rsp+50H] pshufd xmm6, xmm5, 0FH shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+30H], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+30H] pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 1EH movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+20H] movdqa xmm6, xmmword ptr [rsp+40H] jmp roundloop2 endroundloop2: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne innerloop2 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 movups xmmword ptr [rbx+20H], xmm8 movups xmmword ptr [rbx+30H], xmm9 mov eax, dword ptr [rsp+130H] neg eax mov r10d, dword ptr [rsp+110H+8*rax] mov r11d, dword ptr [rsp+120H+8*rax] mov dword ptr [rsp+110H], r10d mov dword ptr [rsp+120H], r11d add rdi, 16 add rbx, 64 sub rsi, 2 final1block: test esi, 1H je unwind movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] shl rax, 32 or rax, 64 movd xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz endroundloop1 movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp roundloop1 endroundloop1: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_sse2 ENDP blake3_hash_many_sse2 ENDP blake3_compress_in_place_sse2 PROC _blake3_compress_in_place_sse2 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+10H], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_in_place_sse2 ENDP blake3_compress_in_place_sse2 ENDP ALIGN 16 blake3_compress_xof_sse2 PROC _blake3_compress_xof_sse2 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b mov r10, qword ptr [rsp+0A8H] shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+10H] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+10H], xmm1 movups xmmword ptr [r10+20H], xmm2 movups xmmword ptr [r10+30H], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_xof_sse2 ENDP blake3_compress_xof_sse2 ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH ADD0: dd 0, 1, 2, 3 ADD1: dd 4 dup (4) BLAKE3_IV_0: dd 4 dup (6A09E667H) BLAKE3_IV_1: dd 4 dup (0BB67AE85H) BLAKE3_IV_2: dd 4 dup (3C6EF372H) BLAKE3_IV_3: dd 4 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 4 dup (64) CMP_MSB_MASK: dd 8 dup(80000000H) PBLENDW_0x33_MASK: dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H PBLENDW_0xCC_MASK: dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH PBLENDW_0x3F_MASK: dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H PBLENDW_0xC0_MASK: dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH _RDATA ENDS END