1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-04-19 20:44:11 +02:00

fix incorrect output / undefined behavior in Windows SSE2 assembly

The SSE2 patch introduced xmm10 as a temporary register for one of the
rotations, but xmm6-xmm15 are callee-save registers on Windows, and
SSE4.1 was only saving the registers it used. The minimal fix is to use
one of the saved registers instead of xmm10.

See https://github.com/BLAKE3-team/BLAKE3/issues/206.
This commit is contained in:
Jack O'Connor 2021-11-05 00:17:05 -04:00
parent 04571021fb
commit 371b5483c9
3 changed files with 42 additions and 12 deletions

View File

@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5

View File

@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmm6
movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5

View File

@ -564,3 +564,33 @@ fn test_hex_encoding_decoding() {
#[cfg(feature = "std")]
assert_eq!(_result.to_string(), "invalid hex character: 0x80");
}
// This test is a mimized failure case for the Windows SSE2 bug described in
// https://github.com/BLAKE3-team/BLAKE3/issues/206.
//
// Before that issue was fixed, this test would fail on Windows in the following configuration:
//
// cargo test --features=no_avx512,no_avx2,no_sse41 --release
//
// Bugs like this one (stomping on a caller's register) are very sensitive to the details of
// surrounding code, so it's not especially likely that this test will catch another bug (or even
// the same bug) in the future. Still, there's no harm in keeping it.
#[test]
fn test_issue_206_windows_sse2() {
// This stupid loop has to be here to trigger the bug. I don't know why.
for _ in &[0] {
// The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also
// doesn't repro with an all-zero input.
let input = &[0xff; 65];
let expected_hash = [
183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179,
245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161,
];
// This throwaway call has to be here to trigger the bug.
crate::Hasher::new().update(input);
// This assert fails when the bug is triggered.
assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash);
}
}