1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-05-23 04:36:05 +02:00

C: asm: simplify pblendw emulation

Use statically calculated ~mask. This reduces the number of moves and registers necessary at the expense of an extra memory load. This is probably a good trade-off since we are not bound by memory uops in this loop.
This commit is contained in:
Matthew Krupcale 2020-08-31 11:36:01 -04:00
parent 47e415c7f1
commit be2da69b6b
3 changed files with 108 additions and 183 deletions

View File

@ -1833,24 +1833,17 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
movdqa xmmword ptr [rsp+0x20], xmm2
movdqa xmmword ptr [rsp+0x40], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm13
pand xmm3, xmm12
movdqa xmm13, xmm3
por xmm13, xmm2
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm12
pand xmm3, xmm6
movdqa xmm12, xmm3
movdqa xmmword ptr [rsp+0x20], xmm2
movdqa xmm2, xmm6
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+0x20]
movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@ -1864,24 +1857,17 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
movdqa xmmword ptr [rsp+0x30], xmm2
movdqa xmmword ptr [rsp+0x50], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm6
pand xmm3, xmm5
movdqa xmm6, xmm3
por xmm6, xmm2
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm5
pand xmm3, xmm14
movdqa xmm5, xmm3
movdqa xmmword ptr [rsp+0x30], xmm2
movdqa xmm2, xmm14
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+0x30]
movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@ -2013,19 +1999,14 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2133,19 +2114,14 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2252,19 +2228,14 @@ _blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2312,7 +2283,11 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
/* 0x80000000 in each of the four 32-bit lanes, i.e. only the most-significant
 * bit of every dword is set. The instructions that consume this constant are
 * not visible in this excerpt. */
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Select masks used to emulate the SSE4.1 pblendw instruction with pand/por.
 *
 * PBLENDW_<imm>_MASK holds all-ones in every 32-bit lane whose two 16-bit
 * words are selected by pblendw immediate <imm>, and zeros in the other lanes.
 * 0x33/0xCC and 0x3F/0xC0 are bitwise complements of each other, so a blend
 * "pblendw dst, src, 0xCC" is expressed as:
 *
 *     pand dst, [PBLENDW_0x33_MASK]    keep lanes 0 and 2 of dst
 *     pand src, [PBLENDW_0xCC_MASK]    keep lanes 1 and 3 of src
 *     por  dst, src
 *
 * Unlike a real pblendw this overwrites src, so callers copy src to a scratch
 * register first when its value is still needed.
 *
 * NOTE(review): these are used as legacy-SSE pand memory operands, which
 * require 16-byte alignment; the alignment directive is outside this
 * excerpt -- confirm.
 */
/* lanes 0 and 2 set (pblendw words 0,1,4,5) */
PBLENDW_0x33_MASK:
.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
/* lanes 1 and 3 set (pblendw words 2,3,6,7); complement of 0x33 */
PBLENDW_0xCC_MASK:
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
/* lanes 0..2 set (pblendw words 0..5) */
PBLENDW_0x3F_MASK:
.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
/* lane 3 set (pblendw words 6,7); complement of 0x3F */
PBLENDW_0xC0_MASK:
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF

View File

@ -1844,24 +1844,17 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
movdqa xmmword ptr [rsp+0x20], xmm2
movdqa xmmword ptr [rsp+0x40], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm13
pand xmm3, xmm12
movdqa xmm13, xmm3
por xmm13, xmm2
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm12
pand xmm3, xmm6
movdqa xmm12, xmm3
movdqa xmmword ptr [rsp+0x20], xmm2
movdqa xmm2, xmm6
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+0x20]
movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@ -1875,24 +1868,17 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
movdqa xmmword ptr [rsp+0x30], xmm2
movdqa xmmword ptr [rsp+0x50], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm6
pand xmm3, xmm5
movdqa xmm6, xmm3
por xmm6, xmm2
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm3, xmm2
pandn xmm2, xmm5
pand xmm3, xmm14
movdqa xmm5, xmm3
movdqa xmmword ptr [rsp+0x30], xmm2
movdqa xmm2, xmm14
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+0x30]
movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@ -2024,19 +2010,14 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2153,19 +2134,14 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2289,19 +2265,14 @@ blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@ -2353,7 +2324,11 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
/* 0x80000000 in each of the four 32-bit lanes, i.e. only the most-significant
 * bit of every dword is set. The instructions that consume this constant are
 * not visible in this excerpt. */
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Select masks used to emulate the SSE4.1 pblendw instruction with pand/por.
 *
 * PBLENDW_<imm>_MASK holds all-ones in every 32-bit lane whose two 16-bit
 * words are selected by pblendw immediate <imm>, and zeros in the other lanes.
 * 0x33/0xCC and 0x3F/0xC0 are bitwise complements of each other, so a blend
 * "pblendw dst, src, 0xC0" is expressed as:
 *
 *     pand dst, [PBLENDW_0x3F_MASK]    keep lanes 0..2 of dst
 *     pand src, [PBLENDW_0xC0_MASK]    keep lane 3 of src
 *     por  dst, src
 *
 * Unlike a real pblendw this overwrites src, so callers copy src to a scratch
 * register first when its value is still needed.
 *
 * NOTE(review): these are used as legacy-SSE pand memory operands, which
 * require 16-byte alignment; the alignment directive is outside this
 * excerpt -- confirm.
 */
/* lanes 0 and 2 set (pblendw words 0,1,4,5) */
PBLENDW_0x33_MASK:
.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
/* lanes 1 and 3 set (pblendw words 2,3,6,7); complement of 0x33 */
PBLENDW_0xCC_MASK:
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
/* lanes 0..2 set (pblendw words 0..5) */
PBLENDW_0x3F_MASK:
.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
/* lane 3 set (pblendw words 6,7); complement of 0x3F */
PBLENDW_0xC0_MASK:
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF

View File

@ -1845,24 +1845,17 @@ roundloop2:
pshufd xmm4, xmm12, 39H
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
movdqa xmmword ptr [rsp+20H], xmm2
movdqa xmmword ptr [rsp+40H], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
movdqa xmm3, xmm2
pandn xmm2, xmm13
pand xmm3, xmm12
movdqa xmm13, xmm3
por xmm13, xmm2
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
movdqa xmm3, xmm2
pandn xmm2, xmm12
pand xmm3, xmm6
movdqa xmm12, xmm3
movdqa xmmword ptr [rsp+20H], xmm2
movdqa xmm2, xmm6
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+20H]
movdqa xmm3, xmmword ptr [rsp+40H]
pshufd xmm12, xmm12, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@ -1876,24 +1869,17 @@ roundloop2:
pshufd xmm12, xmm5, 39H
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
movdqa xmmword ptr [rsp+30H], xmm2
movdqa xmmword ptr [rsp+50H], xmm3
movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
movdqa xmm3, xmm2
pandn xmm2, xmm6
pand xmm3, xmm5
movdqa xmm6, xmm3
por xmm6, xmm2
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
movdqa xmm3, xmm2
pandn xmm2, xmm5
pand xmm3, xmm14
movdqa xmm5, xmm3
movdqa xmmword ptr [rsp+30H], xmm2
movdqa xmm2, xmm14
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+30H]
movdqa xmm3, xmmword ptr [rsp+50H]
pshufd xmm5, xmm5, 78H
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@ -2025,19 +2011,14 @@ roundloop1:
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@ -2155,19 +2136,14 @@ _blake3_compress_in_place_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@ -2292,19 +2268,14 @@ _blake3_compress_xof_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm9
pand xmm11, xmm8
movdqa xmm9, xmm11
por xmm9, xmm10
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
movdqa xmm11, xmm10
pandn xmm10, xmm8
pand xmm11, xmm6
movdqa xmm8, xmm11
movdqa xmm10, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@ -2368,8 +2339,12 @@ BLAKE3_BLOCK_LEN:
; Eight dwords (32 bytes), each 80000000H, i.e. only the most-significant bit
; of every 32-bit lane is set. The instructions that consume this constant are
; not visible in this excerpt.
CMP_MSB_MASK:
dd 8 dup(80000000H)
; Select masks used to emulate the SSE4.1 pblendw instruction with pand/por.
;
; PBLENDW_<imm>_MASK holds all-ones in every 32-bit lane whose two 16-bit
; words are selected by pblendw immediate <imm>, and zeros in the other lanes.
; 0x33/0xCC and 0x3F/0xC0 are bitwise complements of each other, so a blend
; "pblendw dst, src, 0CCH" is expressed as:
;
;     pand dst, xmmword ptr [PBLENDW_0x33_MASK]    ; keep lanes 0 and 2 of dst
;     pand src, xmmword ptr [PBLENDW_0xCC_MASK]    ; keep lanes 1 and 3 of src
;     por  dst, src
;
; Unlike a real pblendw this overwrites src, so callers copy src to a scratch
; register first when its value is still needed.
;
; NOTE(review): these are used as legacy-SSE pand memory operands, which
; require 16-byte alignment; the segment/ALIGN directive is outside this
; excerpt -- confirm.

; lanes 0 and 2 set (pblendw words 0,1,4,5)
PBLENDW_0x33_MASK:
dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
; lanes 1 and 3 set (pblendw words 2,3,6,7); complement of 0x33
PBLENDW_0xCC_MASK:
dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
; lanes 0..2 set (pblendw words 0..5)
PBLENDW_0x3F_MASK:
dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
; lane 3 set (pblendw words 6,7); complement of 0x3F
PBLENDW_0xC0_MASK:
dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH