diff --git a/c/blake3_sse41_x86-64_windows_gnu.S b/c/blake3_sse41_x86-64_windows_gnu.S index ca6b363..60d0a40 100644 --- a/c/blake3_sse41_x86-64_windows_gnu.S +++ b/c/blake3_sse41_x86-64_windows_gnu.S @@ -1800,15 +1800,18 @@ blake3_hash_many_sse41: .p2align 6 blake3_compress_in_place_sse41: _blake3_compress_in_place_sse41: - sub rsp, 72 + sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0x70] + movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b shl rax, 32 add r8, rax @@ -1906,24 +1909,30 @@ _blake3_compress_in_place_sse41: movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] - add rsp, 72 + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 ret .p2align 6 _blake3_compress_xof_sse41: blake3_compress_xof_sse41: - sub rsp, 72 + sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0x70] + movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b - mov r10, qword ptr [rsp+0x78] + mov r10, qword ptr [rsp+0xA8] shl rax, 32 add r8, rax movq xmm3, r9 @@ -2026,7 +2035,10 @@ blake3_compress_xof_sse41: movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] - add rsp, 72 + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 ret diff --git a/c/blake3_sse41_x86-64_windows_msvc.asm b/c/blake3_sse41_x86-64_windows_msvc.asm index 57b2297..87001e4 100644 --- a/c/blake3_sse41_x86-64_windows_msvc.asm +++ b/c/blake3_sse41_x86-64_windows_msvc.asm @@ -1802,15 +1802,18 @@ blake3_hash_many_sse41 ENDP blake3_compress_in_place_sse41 PROC _blake3_compress_in_place_sse41 PROC - sub rsp, 72 + sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+70H] + movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b shl rax, 32 add r8, rax @@ -1908,7 +1911,10 @@ _blake3_compress_in_place_sse41 PROC movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] - add rsp, 72 + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 ret _blake3_compress_in_place_sse41 ENDP blake3_compress_in_place_sse41 ENDP @@ -1916,17 +1922,20 @@ blake3_compress_in_place_sse41 ENDP ALIGN 16 blake3_compress_xof_sse41 PROC _blake3_compress_xof_sse41 PROC - sub rsp, 72 + sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+70H] + movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b - mov r10, qword ptr [rsp+78H] + mov r10, qword ptr [rsp+0A8H] shl rax, 32 add r8, rax movq xmm3, r9 @@ -2029,7 +2038,10 @@ _blake3_compress_xof_sse41 PROC movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] - add rsp, 72 + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 ret _blake3_compress_xof_sse41 ENDP blake3_compress_xof_sse41 ENDP