mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2025-01-21 15:50:01 +01:00
3977 lines
127 KiB
ArmAsm
3977 lines
127 KiB
ArmAsm
# This file is generated by asm.py. Don't edit this file directly.
|
|
# ---------------------------------------------------------------------------
# blake3_sse2_kernel_2d_1 -- SSE2-only core: 7 rounds over one 4x4 dword state.
#
# Private helper (not exported); reached via `call` from the
# blake3_sse2_* entry points.
#
# In:   xmm0..xmm3 = state rows 0..3 (four 32-bit lanes each)
#       xmm4..xmm7 = the four message vectors consumed by the first round
# Out:  xmm0 = row0 ^ row2, xmm1 = row1 ^ row3 (after the last round)
#       xmm2, xmm3 = rows 2 and 3 after the last round
# Clobbers: xmm4-xmm11.  Uses no stack and no general-purpose registers.
#
# The seven rounds are emitted by the macros below; they expand to the same
# straight-line instruction sequence as a fully unrolled listing.
# ---------------------------------------------------------------------------

# Rotate every 32-bit lane of \reg right by \rbits (\lbits must be
# 32 - \rbits).  xmm11 is the scratch register.
.macro B3_SSE2_ROTR reg, lbits, rbits
        movdqa  xmm11, \reg
        pslld   \reg, \lbits
        psrld   xmm11, \rbits
        por     \reg, xmm11
.endm

# One mixing pass over all four lanes: add message vector \mfirst, rotate
# by 16 and 12, add message vector \msecond, rotate by 8 and 7.
.macro B3_SSE2_G mfirst, msecond
        paddd   xmm0, \mfirst
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1        # swap 16-bit halves of each dword:
        pshufhw xmm3, xmm3, 0xB1        #   a rotate by 16 without pshufb
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        B3_SSE2_ROTR xmm1, 20, 12
        paddd   xmm0, \msecond
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        B3_SSE2_ROTR xmm3, 24, 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        B3_SSE2_ROTR xmm1, 25, 7
.endm

# One full round: mix with xmm4/xmm5, rotate rows 0/3/2 by 1/2/3 lanes
# (diagonalize), mix with xmm6/xmm7, rotate the rows back.
.macro B3_SSE2_ROUND
        B3_SSE2_G xmm4, xmm5
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        B3_SSE2_G xmm6, xmm7
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
.endm

# Re-order the message vectors xmm4..xmm7 for the next round.  The
# pand/pand/por triples stand in for pblendw (imm 0xCC and 0xC0), which
# SSE2 lacks.  Scratch: xmm8, xmm9, xmm10.
.macro B3_SSE2_PERMUTE
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
.endm

blake3_sse2_kernel_2d_1:
        # Rounds 1-6: each is followed by a message permutation.
.rept 6
        B3_SSE2_ROUND
        B3_SSE2_PERMUTE
.endr
        # Round 7: no permutation afterwards, the message is not used again.
        B3_SSE2_ROUND
        # Fold the lower rows into the upper rows.
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        ret

# The helper macros are private to this routine.
.purgem B3_SSE2_PERMUTE
.purgem B3_SSE2_ROUND
.purgem B3_SSE2_G
.purgem B3_SSE2_ROTR
|
|
# ---------------------------------------------------------------------------
# blake3_sse2_compress -- run the SSE2 kernel once and store the result in
# place.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read, then overwritten)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       NOTE(review): these presumably are BLAKE3's chaining value, block,
#       counter, block length and flags -- confirm against the C prototype.
# Out:  [rdi .. rdi+31] = xmm0:xmm1 as returned by the kernel
# Clobbers: rcx, r8, flags, xmm0-xmm11 (xmm9-xmm11 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse2_compress
blake3_sse2_compress:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE2 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse2_kernel_2d_1

        # Write the folded rows back over the caller's state.
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
|
|
# ---------------------------------------------------------------------------
# blake3_sse2_xof_stream_1 -- run the SSE2 kernel once and write all 64
# bytes of output to a separate buffer.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read only)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       r9  = pointer to a 64-byte output buffer (unaligned stores are used)
#       NOTE(review): rdi/rsi/rdx/ecx/r8 presumably are BLAKE3's chaining
#       value, block, counter, block length and flags -- confirm against
#       the C prototype.
# Out:  [r9 +  0] = xmm0 from the kernel
#       [r9 + 16] = xmm1 from the kernel
#       [r9 + 32] = kernel xmm2 ^ [rdi]
#       [r9 + 48] = kernel xmm3 ^ [rdi+16]
# Clobbers: rcx, r8, flags, xmm0-xmm11 (xmm9-xmm11 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse2_xof_stream_1
blake3_sse2_xof_stream_1:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE2 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse2_kernel_2d_1

        # The kernel overwrote xmm4/xmm5, so reload the input state and
        # fold it into rows 2-3 for the second half of the output.
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
|
|
# ---------------------------------------------------------------------------
# blake3_sse2_xof_xor_1 -- run the SSE2 kernel once and XOR all 64 bytes of
# output into an existing buffer.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read only)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       r9  = pointer to a 64-byte buffer that is read, XORed and rewritten
#       NOTE(review): rdi/rsi/rdx/ecx/r8 presumably are BLAKE3's chaining
#       value, block, counter, block length and flags -- confirm against
#       the C prototype.
# Out:  [r9 +  0] ^= xmm0 from the kernel
#       [r9 + 16] ^= xmm1 from the kernel
#       [r9 + 32] ^= kernel xmm2 ^ [rdi]
#       [r9 + 48] ^= kernel xmm3 ^ [rdi+16]
# Clobbers: rcx, r8, flags, xmm0-xmm11 (xmm9-xmm11 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse2_xof_xor_1
blake3_sse2_xof_xor_1:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE2 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse2_kernel_2d_1

        # The kernel overwrote xmm4/xmm5, so reload the input state and
        # fold it into rows 2-3 for the second half of the output.
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        # XOR the 64 output bytes into the buffer's current contents.
        # xmm4-xmm7 are free to reuse as temporaries at this point.
        movdqu  xmm4, xmmword ptr [r9]
        movdqu  xmm5, xmmword ptr [r9+0x10]
        movdqu  xmm6, xmmword ptr [r9+0x20]
        movdqu  xmm7, xmmword ptr [r9+0x30]
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
|
|
# ---------------------------------------------------------------------------
# 128-bit lane-select masks for the SSE2 kernel.
#
# SSE2 has no pblendw, so the SSE2 message permutation ANDs each operand
# with one mask of a complementary pair and ORs the results.  Each label
# names the pblendw immediate whose 16-bit-word selection the mask encodes
# (one dword = two adjacent words):
#   0x33 -> dwords 0 and 2        0xCC -> dwords 1 and 3
#   0x3F -> dwords 0, 1 and 2     0xC0 -> dword 3
# The masks are used as `pand` memory operands and therefore must stay
# 16-byte aligned.
# ---------------------------------------------------------------------------
        .p2align 4
PBLENDW_0x33_MASK:
        .long   -1,  0, -1,  0
PBLENDW_0xCC_MASK:
        .long    0, -1,  0, -1
PBLENDW_0x3F_MASK:
        .long   -1, -1, -1,  0
PBLENDW_0xC0_MASK:
        .long    0,  0,  0, -1
|
|
# ---------------------------------------------------------------------------
# blake3_sse41_kernel_2d_1 -- SSE4.1 core: 7 rounds over one 4x4 dword state.
#
# Private helper (not exported); reached via `call` from the
# blake3_sse41_* entry points.  Uses pshufb with the ROT16/ROT8 tables for
# the 16- and 8-bit rotates, and pblendw for the message permutation.
#
# In:   xmm0..xmm3 = state rows 0..3 (four 32-bit lanes each)
#       xmm4..xmm7 = the four message vectors consumed by the first round
# Out:  xmm0 = row0 ^ row2, xmm1 = row1 ^ row3 (after the last round)
#       xmm2, xmm3 = rows 2 and 3 after the last round
# Clobbers: xmm4-xmm11, xmm14, xmm15.  Uses no stack and no general-purpose
#           registers.
#
# The seven rounds are emitted by the macros below; they expand to the same
# straight-line instruction sequence as a fully unrolled listing.
# ---------------------------------------------------------------------------

# Rotate every 32-bit lane of \reg right by \rbits (\lbits must be
# 32 - \rbits).  xmm11 is the scratch register.
.macro B3_SSE41_ROTR reg, lbits, rbits
        movdqa  xmm11, \reg
        pslld   \reg, \lbits
        psrld   xmm11, \rbits
        por     \reg, xmm11
.endm

# One mixing pass over all four lanes: add message vector \mfirst, rotate
# by 16 and 12, add message vector \msecond, rotate by 8 and 7.
# Expects xmm15 = ROT16 table and xmm14 = ROT8 table.
.macro B3_SSE41_G mfirst, msecond
        paddd   xmm0, \mfirst
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             # byte shuffle via the ROT16 table
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        B3_SSE41_ROTR xmm1, 20, 12
        paddd   xmm0, \msecond
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             # byte shuffle via the ROT8 table
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        B3_SSE41_ROTR xmm1, 25, 7
.endm

# One full round: mix with xmm4/xmm5, rotate rows 0/3/2 by 1/2/3 lanes
# (diagonalize), mix with xmm6/xmm7, rotate the rows back.
.macro B3_SSE41_ROUND
        B3_SSE41_G xmm4, xmm5
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        B3_SSE41_G xmm6, xmm7
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
.endm

# Re-order the message vectors xmm4..xmm7 for the next round.
# Scratch: xmm8, xmm9.
.macro B3_SSE41_PERMUTE
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
.endm

blake3_sse41_kernel_2d_1:
        # Byte-shuffle tables for the pshufb-based rotates.
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        # Rounds 1-6: each is followed by a message permutation.
.rept 6
        B3_SSE41_ROUND
        B3_SSE41_PERMUTE
.endr
        # Round 7: no permutation afterwards, the message is not used again.
        B3_SSE41_ROUND
        # Fold the lower rows into the upper rows.
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        ret

# The helper macros are private to this routine.
.purgem B3_SSE41_PERMUTE
.purgem B3_SSE41_ROUND
.purgem B3_SSE41_G
.purgem B3_SSE41_ROTR
|
|
# ---------------------------------------------------------------------------
# blake3_sse41_compress -- run the SSE4.1 kernel once and store the result
# in place.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read, then overwritten)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       NOTE(review): these presumably are BLAKE3's chaining value, block,
#       counter, block length and flags -- confirm against the C prototype.
# Out:  [rdi .. rdi+31] = xmm0:xmm1 as returned by the kernel
# Clobbers: rcx, r8, flags, xmm0-xmm11, xmm14, xmm15 (xmm9-xmm11, xmm14 and
#           xmm15 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse41_compress
blake3_sse41_compress:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE4.1 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse41_kernel_2d_1

        # Write the folded rows back over the caller's state.
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
|
|
# ---------------------------------------------------------------------------
# blake3_sse41_xof_stream_1 -- run the SSE4.1 kernel once and write all 64
# bytes of output to a separate buffer.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read only)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       r9  = pointer to a 64-byte output buffer (unaligned stores are used)
#       NOTE(review): rdi/rsi/rdx/ecx/r8 presumably are BLAKE3's chaining
#       value, block, counter, block length and flags -- confirm against
#       the C prototype.
# Out:  [r9 +  0] = xmm0 from the kernel
#       [r9 + 16] = xmm1 from the kernel
#       [r9 + 32] = kernel xmm2 ^ [rdi]
#       [r9 + 48] = kernel xmm3 ^ [rdi+16]
# Clobbers: rcx, r8, flags, xmm0-xmm11, xmm14, xmm15 (xmm9-xmm11, xmm14 and
#           xmm15 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse41_xof_stream_1
blake3_sse41_xof_stream_1:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE4.1 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse41_kernel_2d_1

        # The kernel overwrote xmm4/xmm5, so reload the input state and
        # fold it into rows 2-3 for the second half of the output.
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
|
|
# ---------------------------------------------------------------------------
# blake3_sse41_xof_xor_1 -- run the SSE4.1 kernel once and XOR all 64 bytes
# of output into an existing buffer.
#
# Register use follows the System V AMD64 integer-argument order.
# In:   rdi = pointer to 32 bytes of state (read only)
#       rsi = pointer to a 64-byte input block (unaligned loads are used)
#       rdx = 64-bit value loaded into lanes 0-1 of state row 3
#       ecx = 32-bit value loaded into lane 2 of row 3 (upper half of rcx
#             is discarded)
#       r8  = value whose low 32 bits are loaded into lane 3 of row 3
#       r9  = pointer to a 64-byte buffer that is read, XORed and rewritten
#       NOTE(review): rdi/rsi/rdx/ecx/r8 presumably are BLAKE3's chaining
#       value, block, counter, block length and flags -- confirm against
#       the C prototype.
# Out:  [r9 +  0] ^= xmm0 from the kernel
#       [r9 + 16] ^= xmm1 from the kernel
#       [r9 + 32] ^= kernel xmm2 ^ [rdi]
#       [r9 + 48] ^= kernel xmm3 ^ [rdi+16]
# Clobbers: rcx, r8, flags, xmm0-xmm11, xmm14, xmm15 (xmm9-xmm11, xmm14 and
#           xmm15 via the kernel)
#
# The kernel is a private leaf that touches no stack, so rsp is not
# re-aligned before the call.
# ---------------------------------------------------------------------------
        .global blake3_sse41_xof_xor_1
blake3_sse41_xof_xor_1:
        # Rows 0-1 come from the caller's state, row 2 from BLAKE3_IV.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        # Row 3 = { rdx (64 bits), ecx, low 32 bits of r8 }.
        shl     r8, 32
        mov     ecx, ecx                # zero-extend ecx into rcx
        or      rcx, r8
        # Legacy-SSE movq, not VEX vmovq: an SSE4.1 routine must remain
        # executable on CPUs/OSes without AVX (vmovq would raise #UD there).
        movq    xmm3, rdx
        movq    xmm4, rcx
        punpcklqdq xmm3, xmm4

        # De-interleave block bytes 0-31:
        #   xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        # Same for block bytes 32-63, then rotate each result by one lane
        # (pshufd 0x93): xmm6 = even dwords, xmm7 = odd dwords.
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        call    blake3_sse41_kernel_2d_1

        # The kernel overwrote xmm4/xmm5, so reload the input state and
        # fold it into rows 2-3 for the second half of the output.
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        # XOR the 64 output bytes into the buffer's current contents.
        # xmm4-xmm7 are free to reuse as temporaries at this point.
        movdqu  xmm4, xmmword ptr [r9]
        movdqu  xmm5, xmmword ptr [r9+0x10]
        movdqu  xmm6, xmmword ptr [r9+0x20]
        movdqu  xmm7, xmmword ptr [r9+0x30]
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
|
|
blake3_avx2_kernel_2d_2:
|
|
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
|
|
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vshufps ymm8, ymm4, ymm5, 214
|
|
vpshufd ymm9, ymm4, 0x0F
|
|
vpshufd ymm4, ymm8, 0x39
|
|
vshufps ymm8, ymm6, ymm7, 250
|
|
vpblendd ymm9, ymm9, ymm8, 0xAA
|
|
vpunpcklqdq ymm8, ymm7, ymm5
|
|
vpblendd ymm8, ymm8, ymm6, 0x88
|
|
vpshufd ymm8, ymm8, 0x78
|
|
vpunpckhdq ymm5, ymm5, ymm7
|
|
vpunpckldq ymm6, ymm6, ymm5
|
|
vpshufd ymm7, ymm6, 0x1E
|
|
vmovdqa ymm5, ymm9
|
|
vmovdqa ymm6, ymm8
|
|
vpaddd ymm0, ymm0, ymm4
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm5
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x93
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x39
|
|
vpaddd ymm0, ymm0, ymm6
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm14
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 12
|
|
vpslld ymm1, ymm1, 20
|
|
vpor ymm1, ymm1, ymm8
|
|
vpaddd ymm0, ymm0, ymm7
|
|
vpaddd ymm0, ymm0, ymm1
|
|
vpxord ymm3, ymm3, ymm0
|
|
vpshufb ymm3, ymm3, ymm15
|
|
vpaddd ymm2, ymm2, ymm3
|
|
vpxord ymm1, ymm1, ymm2
|
|
vpsrld ymm8, ymm1, 7
|
|
vpslld ymm1, ymm1, 25
|
|
vpor ymm1, ymm1, ymm8
|
|
vpshufd ymm0, ymm0, 0x39
|
|
vpshufd ymm3, ymm3, 0x4E
|
|
vpshufd ymm2, ymm2, 0x93
|
|
vpxord ymm0, ymm0, ymm2
|
|
vpxord ymm1, ymm1, ymm3
|
|
ret
|
|
# blake3_avx2_xof_stream_2
#
# Builds a two-lane state (one lane per 128-bit half of each ymm register),
# runs blake3_avx2_kernel_2d_2 on it, and stores two 64-byte results.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to 32 bytes of input state, read as two 16-byte halves
#   rsi  pointer to a 64-byte message block
#   rdx  64-bit base value; each lane gets rdx + a qword of INCREMENT_2D
#   ecx  32-bit value placed in dword 2 of state vector 3
#   r8   low 32 bits are placed in dword 3 of state vector 3
#   r9   pointer to 128 bytes of output (lane 0 first, then lane 1)
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype. INCREMENT_2D and BLAKE3_IV are defined elsewhere.
#
# Clobbers: rcx, r8, flags, ymm0-ymm9.
#
# vpbroadcastq with a general-purpose register source only exists as an EVEX
# (AVX-512F/VL) encoding, so scalars are moved into an xmm register first and
# broadcast from there, which is plain AVX/AVX2.
# NOTE(review): the kernel code preceding this routine appears to use vpxord,
# which is also EVEX-only; confirm whether the AVX2 kernel needs the same fix.
.global blake3_avx2_xof_stream_2
blake3_avx2_xof_stream_2:
        # State vectors 0 and 1: the 32 input bytes, duplicated into both lanes.
        vbroadcasti128  ymm0, xmmword ptr [rdi]
        vbroadcasti128  ymm1, xmmword ptr [rdi+0x10]
        vmovdqa         ymm4, ymmword ptr [INCREMENT_2D+rip]    # aligned load: needs 32-byte alignment
        # State vector 2: BLAKE3_IV duplicated into both lanes.
        vbroadcasti128  ymm2, xmmword ptr [BLAKE3_IV+rip]
        # Per-lane qwords = rdx + INCREMENT_2D.
        vmovq           xmm5, rdx
        vpbroadcastq    ymm5, xmm5
        vpaddq          ymm6, ymm4, ymm5
        # rcx = (r8 << 32) | zero-extended ecx
        shl             r8, 32
        mov             ecx, ecx
        or              rcx, r8
        vmovq           xmm7, rcx
        vpbroadcastq    ymm7, xmm7
        # State vector 3, per lane: low qword from ymm6, high qword = rcx.
        vpblendd        ymm3, ymm6, ymm7, 0xCC
        # Message block, duplicated into both lanes and split into even-index
        # and odd-index dwords.
        vbroadcasti128  ymm8, xmmword ptr [rsi]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x10]
        vshufps         ymm4, ymm8, ymm9, 136                   # dwords 0, 2, 4, 6
        vshufps         ymm5, ymm8, ymm9, 221                   # dwords 1, 3, 5, 7
        vbroadcasti128  ymm8, xmmword ptr [rsi+0x20]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x30]
        vshufps         ymm6, ymm8, ymm9, 136                   # dwords 8, 10, 12, 14
        vshufps         ymm7, ymm8, ymm9, 221                   # dwords 9, 11, 13, 15
        vpshufd         ymm6, ymm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         ymm7, ymm7, 0x93
        call            blake3_avx2_kernel_2d_2
        # Fold the input state into vectors 2 and 3.
        vbroadcasti128  ymm4, xmmword ptr [rdi]
        vpxor           ymm2, ymm2, ymm4
        vbroadcasti128  ymm5, xmmword ptr [rdi + 16]
        vpxor           ymm3, ymm3, ymm5
        # Lane 0 of vectors 0-3 forms the first 64 output bytes.
        vmovdqu         xmmword ptr [r9 + 0 * 16], xmm0
        vmovdqu         xmmword ptr [r9 + 1 * 16], xmm1
        vmovdqu         xmmword ptr [r9 + 2 * 16], xmm2
        vmovdqu         xmmword ptr [r9 + 3 * 16], xmm3
        # Lane 1 of vectors 0-3 forms the second 64 output bytes.
        vextracti128    xmmword ptr [r9 + 4 * 16], ymm0, 1
        vextracti128    xmmword ptr [r9 + 5 * 16], ymm1, 1
        vextracti128    xmmword ptr [r9 + 6 * 16], ymm2, 1
        vextracti128    xmmword ptr [r9 + 7 * 16], ymm3, 1
        # Clear the upper ymm halves so legacy-SSE code in the caller does not
        # pay AVX/SSE transition penalties.
        vzeroupper
        ret
|
|
# blake3_avx2_xof_xor_2
#
# Same computation as the two-lane stream routine, but the two 64-byte results
# are XORed into the 128 bytes already present at the output pointer.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to 32 bytes of input state, read as two 16-byte halves
#   rsi  pointer to a 64-byte message block
#   rdx  64-bit base value; each lane gets rdx + a qword of INCREMENT_2D
#   ecx  32-bit value placed in dword 2 of state vector 3
#   r8   low 32 bits are placed in dword 3 of state vector 3
#   r9   pointer to 128 bytes that are read, XORed and written back
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype. INCREMENT_2D and BLAKE3_IV are defined elsewhere.
#
# Clobbers: rcx, r8, flags, ymm0-ymm9.
#
# vpbroadcastq with a general-purpose register source only exists as an EVEX
# (AVX-512F/VL) encoding, so scalars are moved into an xmm register first and
# broadcast from there, which is plain AVX/AVX2.
# NOTE(review): the kernel called below may also contain EVEX-only
# instructions (vpxord); confirm whether it needs the same fix.
.global blake3_avx2_xof_xor_2
blake3_avx2_xof_xor_2:
        # State vectors 0 and 1: the 32 input bytes, duplicated into both lanes.
        vbroadcasti128  ymm0, xmmword ptr [rdi]
        vbroadcasti128  ymm1, xmmword ptr [rdi+0x10]
        vmovdqa         ymm4, ymmword ptr [INCREMENT_2D+rip]    # aligned load: needs 32-byte alignment
        # State vector 2: BLAKE3_IV duplicated into both lanes.
        vbroadcasti128  ymm2, xmmword ptr [BLAKE3_IV+rip]
        # Per-lane qwords = rdx + INCREMENT_2D.
        vmovq           xmm5, rdx
        vpbroadcastq    ymm5, xmm5
        vpaddq          ymm6, ymm4, ymm5
        # rcx = (r8 << 32) | zero-extended ecx
        shl             r8, 32
        mov             ecx, ecx
        or              rcx, r8
        vmovq           xmm7, rcx
        vpbroadcastq    ymm7, xmm7
        # State vector 3, per lane: low qword from ymm6, high qword = rcx.
        vpblendd        ymm3, ymm6, ymm7, 0xCC
        # Message block, duplicated into both lanes and split into even-index
        # and odd-index dwords.
        vbroadcasti128  ymm8, xmmword ptr [rsi]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x10]
        vshufps         ymm4, ymm8, ymm9, 136                   # dwords 0, 2, 4, 6
        vshufps         ymm5, ymm8, ymm9, 221                   # dwords 1, 3, 5, 7
        vbroadcasti128  ymm8, xmmword ptr [rsi+0x20]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x30]
        vshufps         ymm6, ymm8, ymm9, 136                   # dwords 8, 10, 12, 14
        vshufps         ymm7, ymm8, ymm9, 221                   # dwords 9, 11, 13, 15
        vpshufd         ymm6, ymm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         ymm7, ymm7, 0x93
        call            blake3_avx2_kernel_2d_2
        # Fold the input state into vectors 2 and 3.
        vbroadcasti128  ymm4, xmmword ptr [rdi]
        vpxor           ymm2, ymm2, ymm4
        vbroadcasti128  ymm5, xmmword ptr [rdi + 16]
        vpxor           ymm3, ymm3, ymm5
        # Regroup from one-vector-per-row to memory order. The integer-domain
        # lane permute is used because the data is integer.
        vperm2i128      ymm4, ymm0, ymm1, 32                    # lane 0 of vectors 0,1: output bytes 0..31
        vperm2i128      ymm5, ymm2, ymm3, 32                    # lane 0 of vectors 2,3: output bytes 32..63
        vperm2i128      ymm6, ymm0, ymm1, 49                    # lane 1 of vectors 0,1: output bytes 64..95
        vperm2i128      ymm7, ymm2, ymm3, 49                    # lane 1 of vectors 2,3: output bytes 96..127
        # XOR with the existing buffer contents, then write back.
        vpxor           ymm4, ymm4, ymmword ptr [r9 + 0 * 32]
        vpxor           ymm5, ymm5, ymmword ptr [r9 + 1 * 32]
        vpxor           ymm6, ymm6, ymmword ptr [r9 + 2 * 32]
        vpxor           ymm7, ymm7, ymmword ptr [r9 + 3 * 32]
        vmovdqu         ymmword ptr [r9 + 0 * 32], ymm4
        vmovdqu         ymmword ptr [r9 + 1 * 32], ymm5
        vmovdqu         ymmword ptr [r9 + 2 * 32], ymm6
        vmovdqu         ymmword ptr [r9 + 3 * 32], ymm7
        # Clear the upper ymm halves so legacy-SSE code in the caller does not
        # pay AVX/SSE transition penalties.
        vzeroupper
        ret
|
|
# blake3_avx512_kernel_2d_1
#
# Internal routine (no .global). Operands are passed in vector registers, not
# through the C calling convention:
#   In:   xmm0-xmm3  four state vectors (called a, b, c, d below)
#         xmm4-xmm7  four message vectors
#   Out:  xmm0 = a ^ c, xmm1 = b ^ d, xmm2 = c, xmm3 = d (after seven rounds)
#   Clobbers: xmm4-xmm7 (permuted in place), xmm8, xmm9
# Uses no stack, no general-purpose registers and no flags.
# vprord and vpxord on xmm registers are AVX-512F + AVX-512VL instructions.
#
# The body is seven identical rounds with the message vectors permuted between
# consecutive rounds (six permutations). The macros below expand to exactly
# that unrolled instruction sequence and are purged after the routine.

# One mixing step with message vector msg:
#   a += msg; a += b; d ^= a; d = ror(d, rot_d); c += d; b ^= c; b = ror(b, rot_b)
.macro B3_AVX512_X1_MIX msg, rot_d, rot_b
        vpaddd          xmm0, xmm0, \msg
        vpaddd          xmm0, xmm0, xmm1
        vpxord          xmm3, xmm3, xmm0
        vprord          xmm3, xmm3, \rot_d
        vpaddd          xmm2, xmm2, xmm3
        vpxord          xmm1, xmm1, xmm2
        vprord          xmm1, xmm1, \rot_b
.endm

# One round: two mixing steps, a dword rotation of a, d and c, two more mixing
# steps, then the inverse rotation.
.macro B3_AVX512_X1_ROUND
        B3_AVX512_X1_MIX xmm4, 16, 12
        B3_AVX512_X1_MIX xmm5, 8, 7
        vpshufd         xmm0, xmm0, 0x93
        vpshufd         xmm3, xmm3, 0x4E
        vpshufd         xmm2, xmm2, 0x39
        B3_AVX512_X1_MIX xmm6, 16, 12
        B3_AVX512_X1_MIX xmm7, 8, 7
        vpshufd         xmm0, xmm0, 0x39
        vpshufd         xmm3, xmm3, 0x4E
        vpshufd         xmm2, xmm2, 0x93
.endm

# Message permutation between rounds: rearranges the sixteen dwords held in
# xmm4-xmm7, using xmm8 and xmm9 as scratch.
.macro B3_AVX512_X1_PERMUTE
        vshufps         xmm8, xmm4, xmm5, 214
        vpshufd         xmm9, xmm4, 0x0F
        vpshufd         xmm4, xmm8, 0x39
        vshufps         xmm8, xmm6, xmm7, 250
        vpblendd        xmm9, xmm9, xmm8, 0xAA
        vpunpcklqdq     xmm8, xmm7, xmm5
        vpblendd        xmm8, xmm8, xmm6, 0x88
        vpshufd         xmm8, xmm8, 0x78
        vpunpckhdq      xmm5, xmm5, xmm7
        vpunpckldq      xmm6, xmm6, xmm5
        vpshufd         xmm7, xmm6, 0x1E
        vmovdqa         xmm5, xmm9
        vmovdqa         xmm6, xmm8
.endm

blake3_avx512_kernel_2d_1:
.rept 6
        B3_AVX512_X1_ROUND
        B3_AVX512_X1_PERMUTE
.endr
        B3_AVX512_X1_ROUND                                      # last round has no permutation after it
        vpxord          xmm0, xmm0, xmm2                        # a ^= c
        vpxord          xmm1, xmm1, xmm3                        # b ^= d
        ret

.purgem B3_AVX512_X1_MIX
.purgem B3_AVX512_X1_ROUND
.purgem B3_AVX512_X1_PERMUTE
|
|
# blake3_avx512_compress
#
# Loads a 32-byte state and a 64-byte message block, runs
# blake3_avx512_kernel_2d_1, and stores the first two result vectors back over
# the 32-byte state.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to the 32-byte state; read, then overwritten with the result
#   rsi  pointer to a 64-byte message block (unaligned loads)
#   rdx  64-bit value for the low qword of state vector 3
#   ecx  32-bit value for dword 2 of state vector 3
#   r8   low 32 bits become dword 3 of state vector 3
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype.
#
# Clobbers: rcx, r8, flags, xmm0-xmm9.
# BLAKE3_IV is loaded with vmovaps and so must be 16-byte aligned.
.global blake3_avx512_compress
blake3_avx512_compress:
        # rcx = (r8 << 32) | zero-extended ecx
        mov             ecx, ecx
        shl             r8, 32
        or              rcx, r8

        # State vectors: 0,1 from memory, 2 from BLAKE3_IV, 3 = rcx:rdx.
        vmovdqu         xmm0, xmmword ptr [rdi]
        vmovdqu         xmm1, xmmword ptr [rdi + 16]
        vmovaps         xmm2, xmmword ptr [BLAKE3_IV+rip]
        vmovq           xmm3, rdx
        vmovq           xmm4, rcx
        vpunpcklqdq     xmm3, xmm3, xmm4

        # Message vectors: even/odd dwords of each 32-byte half of the block.
        vmovups         xmm8, xmmword ptr [rsi]
        vmovups         xmm9, xmmword ptr [rsi + 16]
        vshufps         xmm4, xmm8, xmm9, 0x88                  # dwords 0, 2, 4, 6
        vshufps         xmm5, xmm8, xmm9, 0xDD                  # dwords 1, 3, 5, 7
        vmovups         xmm8, xmmword ptr [rsi + 32]
        vmovups         xmm9, xmmword ptr [rsi + 48]
        vshufps         xmm6, xmm8, xmm9, 0x88                  # dwords 8, 10, 12, 14
        vshufps         xmm7, xmm8, xmm9, 0xDD                  # dwords 9, 11, 13, 15
        vpshufd         xmm6, xmm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         xmm7, xmm7, 0x93

        call            blake3_avx512_kernel_2d_1

        # The kernel returns the combined vectors in xmm0 and xmm1.
        vmovdqu         xmmword ptr [rdi], xmm0
        vmovdqu         xmmword ptr [rdi + 16], xmm1
        ret
|
|
# blake3_avx512_xof_stream_1
#
# Loads a 32-byte state and a 64-byte message block, runs
# blake3_avx512_kernel_2d_1, and writes a 64-byte result to the output buffer.
# The input state is left unmodified.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to the 32-byte state (read only)
#   rsi  pointer to a 64-byte message block (unaligned loads)
#   rdx  64-bit value for the low qword of state vector 3
#   ecx  32-bit value for dword 2 of state vector 3
#   r8   low 32 bits become dword 3 of state vector 3
#   r9   pointer to 64 bytes of output
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype.
#
# Clobbers: rcx, r8, flags, xmm0-xmm9.
# BLAKE3_IV is loaded with vmovaps and so must be 16-byte aligned.
.global blake3_avx512_xof_stream_1
blake3_avx512_xof_stream_1:
        # rcx = (r8 << 32) | zero-extended ecx
        mov             ecx, ecx
        shl             r8, 32
        or              rcx, r8

        # State vectors: 0,1 from memory, 2 from BLAKE3_IV, 3 = rcx:rdx.
        vmovdqu         xmm0, xmmword ptr [rdi]
        vmovdqu         xmm1, xmmword ptr [rdi + 16]
        vmovaps         xmm2, xmmword ptr [BLAKE3_IV+rip]
        vmovq           xmm3, rdx
        vmovq           xmm4, rcx
        vpunpcklqdq     xmm3, xmm3, xmm4

        # Message vectors: even/odd dwords of each 32-byte half of the block.
        vmovups         xmm8, xmmword ptr [rsi]
        vmovups         xmm9, xmmword ptr [rsi + 16]
        vshufps         xmm4, xmm8, xmm9, 0x88                  # dwords 0, 2, 4, 6
        vshufps         xmm5, xmm8, xmm9, 0xDD                  # dwords 1, 3, 5, 7
        vmovups         xmm8, xmmword ptr [rsi + 32]
        vmovups         xmm9, xmmword ptr [rsi + 48]
        vshufps         xmm6, xmm8, xmm9, 0x88                  # dwords 8, 10, 12, 14
        vshufps         xmm7, xmm8, xmm9, 0xDD                  # dwords 9, 11, 13, 15
        vpshufd         xmm6, xmm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         xmm7, xmm7, 0x93

        call            blake3_avx512_kernel_2d_1

        # Second half of the output: vectors 2 and 3 XORed with the input state.
        vpxor           xmm2, xmm2, xmmword ptr [rdi]
        vpxor           xmm3, xmm3, xmmword ptr [rdi + 16]

        vmovdqu         xmmword ptr [r9], xmm0
        vmovdqu         xmmword ptr [r9 + 16], xmm1
        vmovdqu         xmmword ptr [r9 + 32], xmm2
        vmovdqu         xmmword ptr [r9 + 48], xmm3
        ret
|
|
# blake3_avx512_xof_xor_1
#
# Computes the same 64-byte result as the single-lane stream routine, but XORs
# it into the 64 bytes already present at the output pointer instead of
# overwriting them. The input state is left unmodified.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to the 32-byte state (read only)
#   rsi  pointer to a 64-byte message block (unaligned loads)
#   rdx  64-bit value for the low qword of state vector 3
#   ecx  32-bit value for dword 2 of state vector 3
#   r8   low 32 bits become dword 3 of state vector 3
#   r9   pointer to 64 bytes that are read, XORed and written back
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype.
#
# Clobbers: rcx, r8, flags, xmm0-xmm9.
# BLAKE3_IV is loaded with vmovaps and so must be 16-byte aligned.
.global blake3_avx512_xof_xor_1
blake3_avx512_xof_xor_1:
        # rcx = (r8 << 32) | zero-extended ecx
        mov             ecx, ecx
        shl             r8, 32
        or              rcx, r8

        # State vectors: 0,1 from memory, 2 from BLAKE3_IV, 3 = rcx:rdx.
        vmovdqu         xmm0, xmmword ptr [rdi]
        vmovdqu         xmm1, xmmword ptr [rdi + 16]
        vmovaps         xmm2, xmmword ptr [BLAKE3_IV+rip]
        vmovq           xmm3, rdx
        vmovq           xmm4, rcx
        vpunpcklqdq     xmm3, xmm3, xmm4

        # Message vectors: even/odd dwords of each 32-byte half of the block.
        vmovups         xmm8, xmmword ptr [rsi]
        vmovups         xmm9, xmmword ptr [rsi + 16]
        vshufps         xmm4, xmm8, xmm9, 0x88                  # dwords 0, 2, 4, 6
        vshufps         xmm5, xmm8, xmm9, 0xDD                  # dwords 1, 3, 5, 7
        vmovups         xmm8, xmmword ptr [rsi + 32]
        vmovups         xmm9, xmmword ptr [rsi + 48]
        vshufps         xmm6, xmm8, xmm9, 0x88                  # dwords 8, 10, 12, 14
        vshufps         xmm7, xmm8, xmm9, 0xDD                  # dwords 9, 11, 13, 15
        vpshufd         xmm6, xmm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         xmm7, xmm7, 0x93

        call            blake3_avx512_kernel_2d_1

        # Second half of the result: vectors 2 and 3 XORed with the input state.
        vpxor           xmm2, xmm2, xmmword ptr [rdi]
        vpxor           xmm3, xmm3, xmmword ptr [rdi + 16]

        # XOR the 64-byte result into the existing buffer contents.
        vpxor           xmm0, xmm0, xmmword ptr [r9]
        vpxor           xmm1, xmm1, xmmword ptr [r9 + 16]
        vpxor           xmm2, xmm2, xmmword ptr [r9 + 32]
        vpxor           xmm3, xmm3, xmmword ptr [r9 + 48]

        vmovdqu         xmmword ptr [r9], xmm0
        vmovdqu         xmmword ptr [r9 + 16], xmm1
        vmovdqu         xmmword ptr [r9 + 32], xmm2
        vmovdqu         xmmword ptr [r9 + 48], xmm3
        ret
|
|
# blake3_avx512_kernel_2d_2
#
# Internal routine (no .global). Two-lane form of the seven-round kernel: every
# shuffle used here works within 128-bit lanes, so the low and high halves of
# each ymm register are processed independently. Operands are passed in vector
# registers, not through the C calling convention:
#   In:   ymm0-ymm3  four state vectors (called a, b, c, d below)
#         ymm4-ymm7  four message vectors
#   Out:  ymm0 = a ^ c, ymm1 = b ^ d, ymm2 = c, ymm3 = d (after seven rounds)
#   Clobbers: ymm4-ymm7 (permuted in place), ymm8, ymm9
# Uses no stack, no general-purpose registers and no flags.
# vprord and vpxord on ymm registers are AVX-512F + AVX-512VL instructions.
#
# The body is seven identical rounds with the message vectors permuted between
# consecutive rounds (six permutations). The macros below expand to exactly
# that unrolled instruction sequence and are purged after the routine.

# One mixing step with message vector msg:
#   a += msg; a += b; d ^= a; d = ror(d, rot_d); c += d; b ^= c; b = ror(b, rot_b)
.macro B3_AVX512_X2_MIX msg, rot_d, rot_b
        vpaddd          ymm0, ymm0, \msg
        vpaddd          ymm0, ymm0, ymm1
        vpxord          ymm3, ymm3, ymm0
        vprord          ymm3, ymm3, \rot_d
        vpaddd          ymm2, ymm2, ymm3
        vpxord          ymm1, ymm1, ymm2
        vprord          ymm1, ymm1, \rot_b
.endm

# One round: two mixing steps, a dword rotation of a, d and c, two more mixing
# steps, then the inverse rotation.
.macro B3_AVX512_X2_ROUND
        B3_AVX512_X2_MIX ymm4, 16, 12
        B3_AVX512_X2_MIX ymm5, 8, 7
        vpshufd         ymm0, ymm0, 0x93
        vpshufd         ymm3, ymm3, 0x4E
        vpshufd         ymm2, ymm2, 0x39
        B3_AVX512_X2_MIX ymm6, 16, 12
        B3_AVX512_X2_MIX ymm7, 8, 7
        vpshufd         ymm0, ymm0, 0x39
        vpshufd         ymm3, ymm3, 0x4E
        vpshufd         ymm2, ymm2, 0x93
.endm

# Message permutation between rounds: rearranges the dwords held in ymm4-ymm7
# within each lane, using ymm8 and ymm9 as scratch.
.macro B3_AVX512_X2_PERMUTE
        vshufps         ymm8, ymm4, ymm5, 214
        vpshufd         ymm9, ymm4, 0x0F
        vpshufd         ymm4, ymm8, 0x39
        vshufps         ymm8, ymm6, ymm7, 250
        vpblendd        ymm9, ymm9, ymm8, 0xAA
        vpunpcklqdq     ymm8, ymm7, ymm5
        vpblendd        ymm8, ymm8, ymm6, 0x88
        vpshufd         ymm8, ymm8, 0x78
        vpunpckhdq      ymm5, ymm5, ymm7
        vpunpckldq      ymm6, ymm6, ymm5
        vpshufd         ymm7, ymm6, 0x1E
        vmovdqa         ymm5, ymm9
        vmovdqa         ymm6, ymm8
.endm

blake3_avx512_kernel_2d_2:
.rept 6
        B3_AVX512_X2_ROUND
        B3_AVX512_X2_PERMUTE
.endr
        B3_AVX512_X2_ROUND                                      # last round has no permutation after it
        vpxord          ymm0, ymm0, ymm2                        # a ^= c
        vpxord          ymm1, ymm1, ymm3                        # b ^= d
        ret

.purgem B3_AVX512_X2_MIX
.purgem B3_AVX512_X2_ROUND
.purgem B3_AVX512_X2_PERMUTE
|
|
# blake3_avx512_xof_stream_2
#
# Builds a two-lane state (one lane per 128-bit half of each ymm register),
# runs blake3_avx512_kernel_2d_2 on it, and stores two 64-byte results.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to 32 bytes of input state, read as two 16-byte halves
#   rsi  pointer to a 64-byte message block
#   rdx  64-bit base value; each lane gets rdx + a qword of INCREMENT_2D
#   ecx  32-bit value placed in dword 2 of state vector 3
#   r8   low 32 bits are placed in dword 3 of state vector 3
#   r9   pointer to 128 bytes of output (lane 0 first, then lane 1)
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype. INCREMENT_2D and BLAKE3_IV are defined elsewhere.
#
# Clobbers: rcx, r8, flags, ymm0-ymm9.
# Requires AVX-512F + AVX-512VL: vpbroadcastq from a general-purpose register
# is an EVEX-only form, and the kernel uses vprord/vpxord.
.global blake3_avx512_xof_stream_2
blake3_avx512_xof_stream_2:
        # State vectors 0 and 1: the 32 input bytes, duplicated into both lanes.
        vbroadcasti128  ymm0, xmmword ptr [rdi]
        vbroadcasti128  ymm1, xmmword ptr [rdi+0x10]
        vmovdqa         ymm4, ymmword ptr [INCREMENT_2D+rip]    # aligned load: needs 32-byte alignment
        # State vector 2: BLAKE3_IV duplicated into both lanes.
        vbroadcasti128  ymm2, xmmword ptr [BLAKE3_IV+rip]
        # Per-lane qwords = rdx + INCREMENT_2D.
        vpbroadcastq    ymm5, rdx
        vpaddq          ymm6, ymm4, ymm5
        # rcx = (r8 << 32) | zero-extended ecx
        shl             r8, 32
        mov             ecx, ecx
        or              rcx, r8
        vpbroadcastq    ymm7, rcx
        # State vector 3, per lane: low qword from ymm6, high qword = rcx.
        vpblendd        ymm3, ymm6, ymm7, 0xCC
        # Message block, duplicated into both lanes and split into even-index
        # and odd-index dwords.
        vbroadcasti128  ymm8, xmmword ptr [rsi]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x10]
        vshufps         ymm4, ymm8, ymm9, 136                   # dwords 0, 2, 4, 6
        vshufps         ymm5, ymm8, ymm9, 221                   # dwords 1, 3, 5, 7
        vbroadcasti128  ymm8, xmmword ptr [rsi+0x20]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x30]
        vshufps         ymm6, ymm8, ymm9, 136                   # dwords 8, 10, 12, 14
        vshufps         ymm7, ymm8, ymm9, 221                   # dwords 9, 11, 13, 15
        vpshufd         ymm6, ymm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         ymm7, ymm7, 0x93
        call            blake3_avx512_kernel_2d_2
        # Fold the input state into vectors 2 and 3.
        vbroadcasti128  ymm4, xmmword ptr [rdi]
        vpxor           ymm2, ymm2, ymm4
        vbroadcasti128  ymm5, xmmword ptr [rdi + 16]
        vpxor           ymm3, ymm3, ymm5
        # Lane 0 of vectors 0-3 forms the first 64 output bytes.
        vmovdqu         xmmword ptr [r9 + 0 * 16], xmm0
        vmovdqu         xmmword ptr [r9 + 1 * 16], xmm1
        vmovdqu         xmmword ptr [r9 + 2 * 16], xmm2
        vmovdqu         xmmword ptr [r9 + 3 * 16], xmm3
        # Lane 1 of vectors 0-3 forms the second 64 output bytes.
        vextracti128    xmmword ptr [r9+4*16], ymm0, 1
        vextracti128    xmmword ptr [r9+5*16], ymm1, 1
        vextracti128    xmmword ptr [r9+6*16], ymm2, 1
        vextracti128    xmmword ptr [r9+7*16], ymm3, 1
        # Clear the upper ymm halves so legacy-SSE code in the caller does not
        # pay AVX/SSE transition penalties.
        vzeroupper
        ret
|
|
# blake3_avx512_xof_xor_2
#
# Same computation as the two-lane AVX-512 stream routine, but the two 64-byte
# results are XORed into the 128 bytes already present at the output pointer.
#
# Arguments arrive in the System V AMD64 integer argument registers:
#   rdi  pointer to 32 bytes of input state, read as two 16-byte halves
#   rsi  pointer to a 64-byte message block
#   rdx  64-bit base value; each lane gets rdx + a qword of INCREMENT_2D
#   ecx  32-bit value placed in dword 2 of state vector 3
#   r8   low 32 bits are placed in dword 3 of state vector 3
#   r9   pointer to 128 bytes that are read, XORed and written back
# NOTE(review): rdx/ecx/r8 look like counter, block length and flags; confirm
# against the C prototype. INCREMENT_2D and BLAKE3_IV are defined elsewhere.
#
# Clobbers: rcx, r8, flags, ymm0-ymm9.
# Requires AVX-512F + AVX-512VL: vpbroadcastq from a general-purpose register
# is an EVEX-only form, and the kernel uses vprord/vpxord.
.global blake3_avx512_xof_xor_2
blake3_avx512_xof_xor_2:
        # State vectors 0 and 1: the 32 input bytes, duplicated into both lanes.
        vbroadcasti128  ymm0, xmmword ptr [rdi]
        vbroadcasti128  ymm1, xmmword ptr [rdi+0x10]
        vmovdqa         ymm4, ymmword ptr [INCREMENT_2D+rip]    # aligned load: needs 32-byte alignment
        # State vector 2: BLAKE3_IV duplicated into both lanes.
        vbroadcasti128  ymm2, xmmword ptr [BLAKE3_IV+rip]
        # Per-lane qwords = rdx + INCREMENT_2D.
        vpbroadcastq    ymm5, rdx
        vpaddq          ymm6, ymm4, ymm5
        # rcx = (r8 << 32) | zero-extended ecx
        shl             r8, 32
        mov             ecx, ecx
        or              rcx, r8
        vpbroadcastq    ymm7, rcx
        # State vector 3, per lane: low qword from ymm6, high qword = rcx.
        vpblendd        ymm3, ymm6, ymm7, 0xCC
        # Message block, duplicated into both lanes and split into even-index
        # and odd-index dwords.
        vbroadcasti128  ymm8, xmmword ptr [rsi]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x10]
        vshufps         ymm4, ymm8, ymm9, 136                   # dwords 0, 2, 4, 6
        vshufps         ymm5, ymm8, ymm9, 221                   # dwords 1, 3, 5, 7
        vbroadcasti128  ymm8, xmmword ptr [rsi+0x20]
        vbroadcasti128  ymm9, xmmword ptr [rsi+0x30]
        vshufps         ymm6, ymm8, ymm9, 136                   # dwords 8, 10, 12, 14
        vshufps         ymm7, ymm8, ymm9, 221                   # dwords 9, 11, 13, 15
        vpshufd         ymm6, ymm6, 0x93                        # rotate: last dword moves to position 0
        vpshufd         ymm7, ymm7, 0x93
        call            blake3_avx512_kernel_2d_2
        # Fold the input state into vectors 2 and 3.
        vbroadcasti128  ymm4, xmmword ptr [rdi]
        vpxor           ymm2, ymm2, ymm4
        vbroadcasti128  ymm5, xmmword ptr [rdi + 16]
        vpxor           ymm3, ymm3, ymm5
        # Regroup from one-vector-per-row to memory order. The integer-domain
        # lane permute is used because the data is integer.
        vperm2i128      ymm4, ymm0, ymm1, 32                    # lane 0 of vectors 0,1: output bytes 0..31
        vperm2i128      ymm5, ymm2, ymm3, 32                    # lane 0 of vectors 2,3: output bytes 32..63
        vperm2i128      ymm6, ymm0, ymm1, 49                    # lane 1 of vectors 0,1: output bytes 64..95
        vperm2i128      ymm7, ymm2, ymm3, 49                    # lane 1 of vectors 2,3: output bytes 96..127
        # XOR with the existing buffer contents, then write back.
        vpxor           ymm4, ymm4, ymmword ptr [r9 + 0 * 32]
        vpxor           ymm5, ymm5, ymmword ptr [r9 + 1 * 32]
        vpxor           ymm6, ymm6, ymmword ptr [r9 + 2 * 32]
        vpxor           ymm7, ymm7, ymmword ptr [r9 + 3 * 32]
        vmovdqu         ymmword ptr [r9 + 0 * 32], ymm4
        vmovdqu         ymmword ptr [r9 + 1 * 32], ymm5
        vmovdqu         ymmword ptr [r9 + 2 * 32], ymm6
        vmovdqu         ymmword ptr [r9 + 3 * 32], ymm7
        # Clear the upper ymm halves so legacy-SSE code in the caller does not
        # pay AVX/SSE transition penalties.
        vzeroupper
        ret
|
|
# ----------------------------------------------------------------------
# blake3_avx512_kernel_2d_4
#
# Internal helper (no .global): applies seven rounds of add / xor /
# rotate mixing to four 128-bit lanes in parallel, permuting the message
# vectors between rounds.
#
# In:   zmm0-zmm3  the four state vectors
#       zmm4-zmm7  the four message vectors, in the order round 1 uses them
# Out:  zmm0 = zmm0 ^ zmm2 and zmm1 = zmm1 ^ zmm3 (values after round 7);
#       zmm2 and zmm3 hold their round-7 values unchanged
# Clobbers: eax, k3, k4, zmm4-zmm9
#
# The three assembler macros below exist only for this routine and are
# purged again after it. They expand to the same fully unrolled
# instruction sequence that was previously written out by hand.
# ----------------------------------------------------------------------

# One mixing step on zmm0-zmm3, adding message vectors \mx and then \my.
.macro BLAKE3_AVX512_2D4_MIX mx, my
        vpaddd  zmm0, zmm0, \mx
        vpaddd  zmm0, zmm0, zmm1
        vpxord  zmm3, zmm3, zmm0
        vprord  zmm3, zmm3, 16
        vpaddd  zmm2, zmm2, zmm3
        vpxord  zmm1, zmm1, zmm2
        vprord  zmm1, zmm1, 12
        vpaddd  zmm0, zmm0, \my
        vpaddd  zmm0, zmm0, zmm1
        vpxord  zmm3, zmm3, zmm0
        vprord  zmm3, zmm3, 8
        vpaddd  zmm2, zmm2, zmm3
        vpxord  zmm1, zmm1, zmm2
        vprord  zmm1, zmm1, 7
.endm

# One full round: mix with zmm4/zmm5, rotate the dwords of zmm0, zmm3 and
# zmm2 within each lane, mix with zmm6/zmm7, then rotate them back.
.macro BLAKE3_AVX512_2D4_ROUND
        BLAKE3_AVX512_2D4_MIX zmm4, zmm5
        vpshufd zmm0, zmm0, 0x93
        vpshufd zmm3, zmm3, 0x4E
        vpshufd zmm2, zmm2, 0x39
        BLAKE3_AVX512_2D4_MIX zmm6, zmm7
        vpshufd zmm0, zmm0, 0x39
        vpshufd zmm3, zmm3, 0x4E
        vpshufd zmm2, zmm2, 0x93
.endm

# Reorders the message vectors zmm4-zmm7 for the next round.
# Scratch: zmm8, zmm9. Needs k3 = 0xAAAA and k4 = 0x8888.
.macro BLAKE3_AVX512_2D4_PERMUTE
        vshufps zmm8, zmm4, zmm5, 214
        vpshufd zmm9, zmm4, 0x0F
        vpshufd zmm4, zmm8, 0x39
        vshufps zmm8, zmm6, zmm7, 250
        vpblendmd zmm9 {k3}, zmm9, zmm8
        vpunpcklqdq zmm8, zmm7, zmm5
        vpblendmd zmm8 {k4}, zmm8, zmm6
        vpshufd zmm8, zmm8, 0x78
        vpunpckhdq zmm5, zmm5, zmm7
        vpunpckldq zmm6, zmm6, zmm5
        vpshufd zmm7, zmm6, 0x1E
        vmovdqa32 zmm5, zmm9
        vmovdqa32 zmm6, zmm8
.endm

blake3_avx512_kernel_2d_4:
        # k3 selects dwords 1 and 3 of every 128-bit lane,
        # k4 selects dword 3 of every 128-bit lane.
        mov     eax, 0xAAAA             # 43690
        kmovw   k3, eax
        mov     eax, 0x8888             # 34952
        kmovw   k4, eax
        # Rounds 1-6, each followed by the message permutation.
        .rept 6
        BLAKE3_AVX512_2D4_ROUND
        BLAKE3_AVX512_2D4_PERMUTE
        .endr
        # Round 7: no permutation afterwards, the message is not used again.
        BLAKE3_AVX512_2D4_ROUND
        # Fold the upper half of the state into the lower half.
        vpxord  zmm0, zmm0, zmm2
        vpxord  zmm1, zmm1, zmm3
        ret

.purgem BLAKE3_AVX512_2D4_PERMUTE
.purgem BLAKE3_AVX512_2D4_ROUND
.purgem BLAKE3_AVX512_2D4_MIX
|
|
# ----------------------------------------------------------------------
# blake3_avx512_xof_stream_4
#
# Runs blake3_avx512_kernel_2d_4 on four 128-bit lanes at once and
# stores the resulting 4 x 64 bytes at [r9].
#
# Register use, as read by the code below (argument registers match the
# System V AMD64 order):
#   rdi -> 32 bytes, replicated into state vectors 0 and 1 of all lanes
#          and XORed into vectors 2 and 3 after the kernel
#          (presumably the chaining value - confirm against the caller)
#   rsi -> 64 bytes of message words, shared by all lanes
#   rdx =  64-bit value added to the INCREMENT_2D table
#          (presumably the output block counter)
#   ecx =  low dword of the qword blended into the upper half of vector 3
#   r8  =  shifted left by 32 to form the high dword of that qword
#          (presumably block length and flags - confirm against the caller)
#   r9  -> 256-byte output buffer (16-byte unaligned stores are used)
#
# Clobbers: eax, rcx, r8, k2, zmm0-zmm9, and the kernel's k3/k4.
# rdi and r9 are read again after the call; the kernel does not touch
# them.
# ----------------------------------------------------------------------
.global blake3_avx512_xof_stream_4
blake3_avx512_xof_stream_4:
        # State vectors 0 and 1: the 32 bytes at [rdi], copied to all lanes.
        vbroadcasti32x4 zmm0, xmmword ptr [rdi]
        vbroadcasti32x4 zmm1, xmmword ptr [rdi+0x10]
        # Aligned load: INCREMENT_2D must be 64-byte aligned.
        vmovdqa32 zmm4, zmmword ptr [INCREMENT_2D+rip]
        # State vector 2: the 16 bytes at BLAKE3_IV, copied to all lanes.
        vbroadcasti32x4 zmm2, xmmword ptr [BLAKE3_IV+rip]
        # zmm6 = rdx + per-qword increments.
        vpbroadcastq zmm5, rdx
        vpaddq  zmm6, zmm4, zmm5
        # rcx = (r8 << 32) | zero-extended ecx.
        shl     r8, 32
        mov     ecx, ecx                # clears bits 63:32 of rcx
        or      rcx, r8
        vpbroadcastq zmm7, rcx
        # State vector 3: even qwords from zmm6, odd qwords (mask 0xAA)
        # from zmm7.
        mov     eax, 0xAA
        kmovw   k2, eax
        vpblendmq zmm3 {k2}, zmm6, zmm7
        # Message bytes 0..31: split into even-indexed dwords (zmm4) and
        # odd-indexed dwords (zmm5).
        vbroadcasti32x4 zmm8, xmmword ptr [rsi]
        vbroadcasti32x4 zmm9, xmmword ptr [rsi+0x10]
        vshufps zmm4, zmm8, zmm9, 136
        vshufps zmm5, zmm8, zmm9, 221
        # Message bytes 32..63: same even/odd split, then each lane's
        # dwords are rotated by one position (shuffle 0x93).
        vbroadcasti32x4 zmm8, xmmword ptr [rsi+0x20]
        vbroadcasti32x4 zmm9, xmmword ptr [rsi+0x30]
        vshufps zmm6, zmm8, zmm9, 136
        vshufps zmm7, zmm8, zmm9, 221
        vpshufd zmm6, zmm6, 0x93
        vpshufd zmm7, zmm7, 0x93
        call    blake3_avx512_kernel_2d_4
        # Feed the input bytes at [rdi] forward into vectors 2 and 3.
        vbroadcasti32x4 zmm4, xmmword ptr [rdi]
        vpxord  zmm2, zmm2, zmm4
        vbroadcasti32x4 zmm5, xmmword ptr [rdi + 16]
        vpxord  zmm3, zmm3, zmm5
        # Lane 0 -> output bytes 0..63.
        vmovdqu xmmword ptr [r9 + 0 * 16], xmm0
        vmovdqu xmmword ptr [r9 + 1 * 16], xmm1
        vmovdqu xmmword ptr [r9 + 2 * 16], xmm2
        vmovdqu xmmword ptr [r9 + 3 * 16], xmm3
        # Lane 1 -> output bytes 64..127.
        vextracti32x4 xmmword ptr [r9 + 4 * 16], zmm0, 1
        vextracti32x4 xmmword ptr [r9 + 5 * 16], zmm1, 1
        vextracti32x4 xmmword ptr [r9 + 6 * 16], zmm2, 1
        vextracti32x4 xmmword ptr [r9 + 7 * 16], zmm3, 1
        # Lane 2 -> output bytes 128..191.
        vextracti32x4 xmmword ptr [r9 + 8 * 16], zmm0, 2
        vextracti32x4 xmmword ptr [r9 + 9 * 16], zmm1, 2
        vextracti32x4 xmmword ptr [r9 + 10 * 16], zmm2, 2
        vextracti32x4 xmmword ptr [r9 + 11 * 16], zmm3, 2
        # Lane 3 -> output bytes 192..255.
        vextracti32x4 xmmword ptr [r9 + 12 * 16], zmm0, 3
        vextracti32x4 xmmword ptr [r9 + 13 * 16], zmm1, 3
        vextracti32x4 xmmword ptr [r9 + 14 * 16], zmm2, 3
        vextracti32x4 xmmword ptr [r9 + 15 * 16], zmm3, 3
        # All results are in memory. Clear the upper vector state so that
        # legacy-SSE code running after the return does not pay AVX/SSE
        # transition penalties.
        vzeroupper
        ret
|
|
# ----------------------------------------------------------------------
# blake3_avx512_xof_xor_4
#
# Runs blake3_avx512_kernel_2d_4 on four 128-bit lanes at once and XORs
# the resulting 4 x 64 bytes into the 256 bytes at [r9], in place.
#
# Register use, as read by the code below (argument registers match the
# System V AMD64 order):
#   rdi -> 32 bytes, replicated into state vectors 0 and 1 of all lanes
#          and XORed into vectors 2 and 3 after the kernel
#          (presumably the chaining value - confirm against the caller)
#   rsi -> 64 bytes of message words, shared by all lanes
#   rdx =  64-bit value added to the INCREMENT_2D table
#          (presumably the output block counter)
#   ecx =  low dword of the qword blended into the upper half of vector 3
#   r8  =  shifted left by 32 to form the high dword of that qword
#          (presumably block length and flags - confirm against the caller)
#   r9  -> 256-byte buffer that is read, XORed and written back
#
# Clobbers: eax, rcx, r8, k2, zmm0-zmm9, and the kernel's k3/k4.
# rdi and r9 are read again after the call; the kernel does not touch
# them.
# ----------------------------------------------------------------------
.global blake3_avx512_xof_xor_4
blake3_avx512_xof_xor_4:
        # State vectors 0 and 1: the 32 bytes at [rdi], copied to all lanes.
        vbroadcasti32x4 zmm0, xmmword ptr [rdi]
        vbroadcasti32x4 zmm1, xmmword ptr [rdi+0x10]
        # Aligned load: INCREMENT_2D must be 64-byte aligned.
        vmovdqa32 zmm4, zmmword ptr [INCREMENT_2D+rip]
        # State vector 2: the 16 bytes at BLAKE3_IV, copied to all lanes.
        vbroadcasti32x4 zmm2, xmmword ptr [BLAKE3_IV+rip]
        # zmm6 = rdx + per-qword increments.
        vpbroadcastq zmm5, rdx
        vpaddq  zmm6, zmm4, zmm5
        # rcx = (r8 << 32) | zero-extended ecx.
        shl     r8, 32
        mov     ecx, ecx                # clears bits 63:32 of rcx
        or      rcx, r8
        vpbroadcastq zmm7, rcx
        # State vector 3: even qwords from zmm6, odd qwords (mask 0xAA)
        # from zmm7.
        mov     eax, 0xAA
        kmovw   k2, eax
        vpblendmq zmm3 {k2}, zmm6, zmm7
        # Message bytes 0..31: split into even-indexed dwords (zmm4) and
        # odd-indexed dwords (zmm5).
        vbroadcasti32x4 zmm8, xmmword ptr [rsi]
        vbroadcasti32x4 zmm9, xmmword ptr [rsi+0x10]
        vshufps zmm4, zmm8, zmm9, 136
        vshufps zmm5, zmm8, zmm9, 221
        # Message bytes 32..63: same even/odd split, then each lane's
        # dwords are rotated by one position (shuffle 0x93).
        vbroadcasti32x4 zmm8, xmmword ptr [rsi+0x20]
        vbroadcasti32x4 zmm9, xmmword ptr [rsi+0x30]
        vshufps zmm6, zmm8, zmm9, 136
        vshufps zmm7, zmm8, zmm9, 221
        vpshufd zmm6, zmm6, 0x93
        vpshufd zmm7, zmm7, 0x93
        call    blake3_avx512_kernel_2d_4
        # Feed the input bytes at [rdi] forward into vectors 2 and 3.
        vbroadcasti32x4 zmm4, xmmword ptr [rdi]
        vpxord  zmm2, zmm2, zmm4
        vbroadcasti32x4 zmm5, xmmword ptr [rdi + 16]
        vpxord  zmm3, zmm3, zmm5
        # 4x4 transpose of 128-bit lanes, step 1: pair up lanes.
        #   zmm4 = zmm0 lanes 0,1 | zmm1 lanes 0,1
        #   zmm5 = zmm0 lanes 2,3 | zmm1 lanes 2,3
        #   zmm6 = zmm2 lanes 0,1 | zmm3 lanes 0,1
        #   zmm7 = zmm2 lanes 2,3 | zmm3 lanes 2,3
        vshufi32x4 zmm4, zmm0, zmm1, 68
        vshufi32x4 zmm5, zmm0, zmm1, 238
        vshufi32x4 zmm6, zmm2, zmm3, 68
        vshufi32x4 zmm7, zmm2, zmm3, 238
        # Step 2: zmmN now holds lane N of the old zmm0, zmm1, zmm2 and
        # zmm3, in that order, i.e. output bytes 64*N .. 64*N+63.
        vshufi32x4 zmm0, zmm4, zmm6, 136
        vshufi32x4 zmm1, zmm4, zmm6, 221
        vshufi32x4 zmm2, zmm5, zmm7, 136
        vshufi32x4 zmm3, zmm5, zmm7, 221
        # XOR with the bytes already in the buffer ...
        vpxord  zmm0, zmm0, zmmword ptr [r9 + 0*64]
        vpxord  zmm1, zmm1, zmmword ptr [r9 + 1*64]
        vpxord  zmm2, zmm2, zmmword ptr [r9 + 2*64]
        vpxord  zmm3, zmm3, zmmword ptr [r9 + 3*64]
        # ... and write the result back in place.
        vmovdqu32 zmmword ptr [r9 + 0*64], zmm0
        vmovdqu32 zmmword ptr [r9 + 1*64], zmm1
        vmovdqu32 zmmword ptr [r9 + 2*64], zmm2
        vmovdqu32 zmmword ptr [r9 + 3*64], zmm3
        # All results are in memory. Clear the upper vector state so that
        # legacy-SSE code running after the return does not pay AVX/SSE
        # transition penalties.
        vzeroupper
        ret
|
|
blake3_avx512_kernel_3d_16:
|
|
vpaddd zmm0, zmm0, zmm16
|
|
vpaddd zmm1, zmm1, zmm18
|
|
vpaddd zmm2, zmm2, zmm20
|
|
vpaddd zmm3, zmm3, zmm22
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm17
|
|
vpaddd zmm1, zmm1, zmm19
|
|
vpaddd zmm2, zmm2, zmm21
|
|
vpaddd zmm3, zmm3, zmm23
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm24
|
|
vpaddd zmm1, zmm1, zmm26
|
|
vpaddd zmm2, zmm2, zmm28
|
|
vpaddd zmm3, zmm3, zmm30
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm25
|
|
vpaddd zmm1, zmm1, zmm27
|
|
vpaddd zmm2, zmm2, zmm29
|
|
vpaddd zmm3, zmm3, zmm31
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm18
|
|
vpaddd zmm1, zmm1, zmm19
|
|
vpaddd zmm2, zmm2, zmm23
|
|
vpaddd zmm3, zmm3, zmm20
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm22
|
|
vpaddd zmm1, zmm1, zmm26
|
|
vpaddd zmm2, zmm2, zmm16
|
|
vpaddd zmm3, zmm3, zmm29
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm17
|
|
vpaddd zmm1, zmm1, zmm28
|
|
vpaddd zmm2, zmm2, zmm25
|
|
vpaddd zmm3, zmm3, zmm31
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm27
|
|
vpaddd zmm1, zmm1, zmm21
|
|
vpaddd zmm2, zmm2, zmm30
|
|
vpaddd zmm3, zmm3, zmm24
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm19
|
|
vpaddd zmm1, zmm1, zmm26
|
|
vpaddd zmm2, zmm2, zmm29
|
|
vpaddd zmm3, zmm3, zmm23
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm20
|
|
vpaddd zmm1, zmm1, zmm28
|
|
vpaddd zmm2, zmm2, zmm18
|
|
vpaddd zmm3, zmm3, zmm30
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm22
|
|
vpaddd zmm1, zmm1, zmm25
|
|
vpaddd zmm2, zmm2, zmm27
|
|
vpaddd zmm3, zmm3, zmm24
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm21
|
|
vpaddd zmm1, zmm1, zmm16
|
|
vpaddd zmm2, zmm2, zmm31
|
|
vpaddd zmm3, zmm3, zmm17
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm26
|
|
vpaddd zmm1, zmm1, zmm28
|
|
vpaddd zmm2, zmm2, zmm30
|
|
vpaddd zmm3, zmm3, zmm29
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm23
|
|
vpaddd zmm1, zmm1, zmm25
|
|
vpaddd zmm2, zmm2, zmm19
|
|
vpaddd zmm3, zmm3, zmm31
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm20
|
|
vpaddd zmm1, zmm1, zmm27
|
|
vpaddd zmm2, zmm2, zmm21
|
|
vpaddd zmm3, zmm3, zmm17
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm16
|
|
vpaddd zmm1, zmm1, zmm18
|
|
vpaddd zmm2, zmm2, zmm24
|
|
vpaddd zmm3, zmm3, zmm22
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm28
|
|
vpaddd zmm1, zmm1, zmm25
|
|
vpaddd zmm2, zmm2, zmm31
|
|
vpaddd zmm3, zmm3, zmm30
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm29
|
|
vpaddd zmm1, zmm1, zmm27
|
|
vpaddd zmm2, zmm2, zmm26
|
|
vpaddd zmm3, zmm3, zmm24
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm23
|
|
vpaddd zmm1, zmm1, zmm21
|
|
vpaddd zmm2, zmm2, zmm16
|
|
vpaddd zmm3, zmm3, zmm22
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm18
|
|
vpaddd zmm1, zmm1, zmm19
|
|
vpaddd zmm2, zmm2, zmm17
|
|
vpaddd zmm3, zmm3, zmm20
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm25
|
|
vpaddd zmm1, zmm1, zmm27
|
|
vpaddd zmm2, zmm2, zmm24
|
|
vpaddd zmm3, zmm3, zmm31
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm30
|
|
vpaddd zmm1, zmm1, zmm21
|
|
vpaddd zmm2, zmm2, zmm28
|
|
vpaddd zmm3, zmm3, zmm17
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm29
|
|
vpaddd zmm1, zmm1, zmm16
|
|
vpaddd zmm2, zmm2, zmm18
|
|
vpaddd zmm3, zmm3, zmm20
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm19
|
|
vpaddd zmm1, zmm1, zmm26
|
|
vpaddd zmm2, zmm2, zmm22
|
|
vpaddd zmm3, zmm3, zmm23
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpaddd zmm0, zmm0, zmm27
|
|
vpaddd zmm1, zmm1, zmm21
|
|
vpaddd zmm2, zmm2, zmm17
|
|
vpaddd zmm3, zmm3, zmm24
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vprord zmm15, zmm15, 16
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 12
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vpaddd zmm0, zmm0, zmm31
|
|
vpaddd zmm1, zmm1, zmm16
|
|
vpaddd zmm2, zmm2, zmm25
|
|
vpaddd zmm3, zmm3, zmm22
|
|
vpaddd zmm0, zmm0, zmm4
|
|
vpaddd zmm1, zmm1, zmm5
|
|
vpaddd zmm2, zmm2, zmm6
|
|
vpaddd zmm3, zmm3, zmm7
|
|
vpxord zmm12, zmm12, zmm0
|
|
vpxord zmm13, zmm13, zmm1
|
|
vpxord zmm14, zmm14, zmm2
|
|
vpxord zmm15, zmm15, zmm3
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vprord zmm15, zmm15, 8
|
|
vpaddd zmm8, zmm8, zmm12
|
|
vpaddd zmm9, zmm9, zmm13
|
|
vpaddd zmm10, zmm10, zmm14
|
|
vpaddd zmm11, zmm11, zmm15
|
|
vpxord zmm4, zmm4, zmm8
|
|
vpxord zmm5, zmm5, zmm9
|
|
vpxord zmm6, zmm6, zmm10
|
|
vpxord zmm7, zmm7, zmm11
|
|
vprord zmm4, zmm4, 7
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vpaddd zmm0, zmm0, zmm30
|
|
vpaddd zmm1, zmm1, zmm18
|
|
vpaddd zmm2, zmm2, zmm19
|
|
vpaddd zmm3, zmm3, zmm23
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 16
|
|
vprord zmm12, zmm12, 16
|
|
vprord zmm13, zmm13, 16
|
|
vprord zmm14, zmm14, 16
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 12
|
|
vprord zmm6, zmm6, 12
|
|
vprord zmm7, zmm7, 12
|
|
vprord zmm4, zmm4, 12
|
|
vpaddd zmm0, zmm0, zmm26
|
|
vpaddd zmm1, zmm1, zmm28
|
|
vpaddd zmm2, zmm2, zmm20
|
|
vpaddd zmm3, zmm3, zmm29
|
|
vpaddd zmm0, zmm0, zmm5
|
|
vpaddd zmm1, zmm1, zmm6
|
|
vpaddd zmm2, zmm2, zmm7
|
|
vpaddd zmm3, zmm3, zmm4
|
|
vpxord zmm15, zmm15, zmm0
|
|
vpxord zmm12, zmm12, zmm1
|
|
vpxord zmm13, zmm13, zmm2
|
|
vpxord zmm14, zmm14, zmm3
|
|
vprord zmm15, zmm15, 8
|
|
vprord zmm12, zmm12, 8
|
|
vprord zmm13, zmm13, 8
|
|
vprord zmm14, zmm14, 8
|
|
vpaddd zmm10, zmm10, zmm15
|
|
vpaddd zmm11, zmm11, zmm12
|
|
vpaddd zmm8, zmm8, zmm13
|
|
vpaddd zmm9, zmm9, zmm14
|
|
vpxord zmm5, zmm5, zmm10
|
|
vpxord zmm6, zmm6, zmm11
|
|
vpxord zmm7, zmm7, zmm8
|
|
vpxord zmm4, zmm4, zmm9
|
|
vprord zmm5, zmm5, 7
|
|
vprord zmm6, zmm6, 7
|
|
vprord zmm7, zmm7, 7
|
|
vprord zmm4, zmm4, 7
|
|
vpxord zmm0, zmm0, zmm8
|
|
vpxord zmm1, zmm1, zmm9
|
|
vpxord zmm2, zmm2, zmm10
|
|
vpxord zmm3, zmm3, zmm11
|
|
vpxord zmm4, zmm4, zmm12
|
|
vpxord zmm5, zmm5, zmm13
|
|
vpxord zmm6, zmm6, zmm14
|
|
vpxord zmm7, zmm7, zmm15
|
|
ret
|
|
# -----------------------------------------------------------------------
# blake3_avx512_xof_stream_16
#
# Runs the 16-way AVX-512 compression kernel on sixteen consecutive
# counter values and writes the sixteen resulting 64-byte blocks to the
# output buffer, block after block.
#
# Inputs, as read by the code below (matches System V AMD64 order):
#   rdi  pointer to 8 dwords; broadcast into state words 0-7 before the
#        kernel and XORed into state words 8-15 after it
#   rsi  pointer to 16 dwords; broadcast into zmm16-zmm31 (message words)
#   rdx  64-bit counter of the first block; lane i uses rdx + i
#   ecx  dword broadcast into state word 14
#        (presumably the block length; confirm against the caller)
#   r8d  dword broadcast into state word 15
#        (presumably the domain flags; confirm against the caller)
#   r9   output pointer; 16 * 64 = 1024 bytes are written with
#        unaligned stores
#
# Clobbers: rdx, zmm0-zmm31 (upper halves of zmm0-15 are zeroed on
# return), plus whatever blake3_avx512_kernel_3d_16 clobbers. The code
# relies on the kernel preserving rdi and r9.
#
# NOTE(review): this file is generated by asm.py; the vzeroupper added
# before ret should be mirrored in the generator.
# -----------------------------------------------------------------------
.global blake3_avx512_xof_stream_16
blake3_avx512_xof_stream_16:
        # Index vectors for the vpermi2d counter split further down.
        vmovdqa32 zmm12, zmmword ptr [rip + EVEN_INDEXES]
        vmovdqa32 zmm13, zmmword ptr [rip + ODD_INDEXES]

        # State words 0-7: each dword at rdi is replicated to all lanes.
        vpbroadcastd zmm0, dword ptr [rdi+0]
        vpbroadcastd zmm1, dword ptr [rdi+4]
        vpbroadcastd zmm2, dword ptr [rdi+8]
        vpbroadcastd zmm3, dword ptr [rdi+12]
        vpbroadcastd zmm4, dword ptr [rdi+16]
        vpbroadcastd zmm5, dword ptr [rdi+20]
        vpbroadcastd zmm6, dword ptr [rdi+24]
        vpbroadcastd zmm7, dword ptr [rdi+28]

        # Message words 0-15: every lane compresses the same block.
        vpbroadcastd zmm16, dword ptr [rsi+0]
        vpbroadcastd zmm17, dword ptr [rsi+4]
        vpbroadcastd zmm18, dword ptr [rsi+8]
        vpbroadcastd zmm19, dword ptr [rsi+12]
        vpbroadcastd zmm20, dword ptr [rsi+16]
        vpbroadcastd zmm21, dword ptr [rsi+20]
        vpbroadcastd zmm22, dword ptr [rsi+24]
        vpbroadcastd zmm23, dword ptr [rsi+28]
        vpbroadcastd zmm24, dword ptr [rsi+32]
        vpbroadcastd zmm25, dword ptr [rsi+36]
        vpbroadcastd zmm26, dword ptr [rsi+40]
        vpbroadcastd zmm27, dword ptr [rsi+44]
        vpbroadcastd zmm28, dword ptr [rsi+48]
        vpbroadcastd zmm29, dword ptr [rsi+52]
        vpbroadcastd zmm30, dword ptr [rsi+56]
        vpbroadcastd zmm31, dword ptr [rsi+60]

        # Per-lane counters: zmm9 = rdx + {0..7}, zmm10 = rdx + {8..15}
        # as 64-bit values. vpermi2d then gathers the low dwords into
        # zmm12 (state word 12) and the high dwords into zmm13 (word 13).
        vmovdqa64 zmm8, zmmword ptr [INCREMENT_3D+rip]
        vpbroadcastq zmm9, rdx
        vpaddq zmm9, zmm9, zmm8
        add rdx, 8
        vpbroadcastq zmm10, rdx
        vpaddq zmm10, zmm10, zmm8
        vpermi2d zmm12, zmm9, zmm10
        vpermi2d zmm13, zmm9, zmm10

        # State words 14 and 15 come straight from the scalar arguments.
        vpbroadcastd zmm14, ecx
        vpbroadcastd zmm15, r8d

        # State words 8-11 are the four BLAKE3_IV constants.
        vpbroadcastd zmm8, dword ptr [BLAKE3_IV+rip+0]
        vpbroadcastd zmm9, dword ptr [BLAKE3_IV+rip+4]
        vpbroadcastd zmm10, dword ptr [BLAKE3_IV+rip+8]
        vpbroadcastd zmm11, dword ptr [BLAKE3_IV+rip+12]

        call blake3_avx512_kernel_3d_16

        # Feed-forward for the upper half: state words 8-15 are XORed
        # with the input words at rdi. zmm16-23 are reused as scratch
        # because the message words are no longer needed.
        vpbroadcastd zmm16, dword ptr [rdi + 0 * 4]
        vpxord zmm8, zmm8, zmm16
        vpbroadcastd zmm17, dword ptr [rdi + 1 * 4]
        vpxord zmm9, zmm9, zmm17
        vpbroadcastd zmm18, dword ptr [rdi + 2 * 4]
        vpxord zmm10, zmm10, zmm18
        vpbroadcastd zmm19, dword ptr [rdi + 3 * 4]
        vpxord zmm11, zmm11, zmm19
        vpbroadcastd zmm20, dword ptr [rdi + 4 * 4]
        vpxord zmm12, zmm12, zmm20
        vpbroadcastd zmm21, dword ptr [rdi + 5 * 4]
        vpxord zmm13, zmm13, zmm21
        vpbroadcastd zmm22, dword ptr [rdi + 6 * 4]
        vpxord zmm14, zmm14, zmm22
        vpbroadcastd zmm23, dword ptr [rdi + 7 * 4]
        vpxord zmm15, zmm15, zmm23

        # Transpose and store. Registers hold one state word for all 16
        # blocks; the output wants 16 words per block. Each group of
        # four registers is transposed 4x4 inside every 128-bit lane, so
        # 128-bit lane n of the result for block b (b = 0..3) holds four
        # consecutive words of block b + 4n. Offsets are in 16-byte
        # units: block index * 4 + word group.

        # Words 0-3 of every block (word group 0).
        vpunpckldq zmm16, zmm0, zmm1
        vpunpckhdq zmm17, zmm0, zmm1
        vpunpckldq zmm18, zmm2, zmm3
        vpunpckhdq zmm19, zmm2, zmm3
        vpunpcklqdq zmm0, zmm16, zmm18
        vmovdqu32 xmmword ptr [r9 + 0 * 16], xmm0
        vextracti32x4 xmmword ptr [r9 + 16 * 16], zmm0, 1
        vextracti32x4 xmmword ptr [r9 + 32 * 16], zmm0, 2
        vextracti32x4 xmmword ptr [r9 + 48 * 16], zmm0, 3
        vpunpckhqdq zmm1, zmm16, zmm18
        vmovdqu32 xmmword ptr [r9 + 4 * 16], xmm1
        vextracti32x4 xmmword ptr [r9 + 20 * 16], zmm1, 1
        vextracti32x4 xmmword ptr [r9 + 36 * 16], zmm1, 2
        vextracti32x4 xmmword ptr [r9 + 52 * 16], zmm1, 3
        vpunpcklqdq zmm2, zmm17, zmm19
        vmovdqu32 xmmword ptr [r9 + 8 * 16], xmm2
        vextracti32x4 xmmword ptr [r9 + 24 * 16], zmm2, 1
        vextracti32x4 xmmword ptr [r9 + 40 * 16], zmm2, 2
        vextracti32x4 xmmword ptr [r9 + 56 * 16], zmm2, 3
        vpunpckhqdq zmm3, zmm17, zmm19
        vmovdqu32 xmmword ptr [r9 + 12 * 16], xmm3
        vextracti32x4 xmmword ptr [r9 + 28 * 16], zmm3, 1
        vextracti32x4 xmmword ptr [r9 + 44 * 16], zmm3, 2
        vextracti32x4 xmmword ptr [r9 + 60 * 16], zmm3, 3

        # Words 4-7 of every block (word group 1).
        vpunpckldq zmm20, zmm4, zmm5
        vpunpckhdq zmm21, zmm4, zmm5
        vpunpckldq zmm22, zmm6, zmm7
        vpunpckhdq zmm23, zmm6, zmm7
        vpunpcklqdq zmm4, zmm20, zmm22
        vmovdqu32 xmmword ptr [r9 + 1 * 16], xmm4
        vextracti32x4 xmmword ptr [r9 + 17 * 16], zmm4, 1
        vextracti32x4 xmmword ptr [r9 + 33 * 16], zmm4, 2
        vextracti32x4 xmmword ptr [r9 + 49 * 16], zmm4, 3
        vpunpckhqdq zmm5, zmm20, zmm22
        vmovdqu32 xmmword ptr [r9 + 5 * 16], xmm5
        vextracti32x4 xmmword ptr [r9 + 21 * 16], zmm5, 1
        vextracti32x4 xmmword ptr [r9 + 37 * 16], zmm5, 2
        vextracti32x4 xmmword ptr [r9 + 53 * 16], zmm5, 3
        vpunpcklqdq zmm6, zmm21, zmm23
        vmovdqu32 xmmword ptr [r9 + 9 * 16], xmm6
        vextracti32x4 xmmword ptr [r9 + 25 * 16], zmm6, 1
        vextracti32x4 xmmword ptr [r9 + 41 * 16], zmm6, 2
        vextracti32x4 xmmword ptr [r9 + 57 * 16], zmm6, 3
        vpunpckhqdq zmm7, zmm21, zmm23
        vmovdqu32 xmmword ptr [r9 + 13 * 16], xmm7
        vextracti32x4 xmmword ptr [r9 + 29 * 16], zmm7, 1
        vextracti32x4 xmmword ptr [r9 + 45 * 16], zmm7, 2
        vextracti32x4 xmmword ptr [r9 + 61 * 16], zmm7, 3

        # Words 8-11 of every block (word group 2).
        vpunpckldq zmm24, zmm8, zmm9
        vpunpckhdq zmm25, zmm8, zmm9
        vpunpckldq zmm26, zmm10, zmm11
        vpunpckhdq zmm27, zmm10, zmm11
        vpunpcklqdq zmm8, zmm24, zmm26
        vmovdqu32 xmmword ptr [r9 + 2 * 16], xmm8
        vextracti32x4 xmmword ptr [r9 + 18 * 16], zmm8, 1
        vextracti32x4 xmmword ptr [r9 + 34 * 16], zmm8, 2
        vextracti32x4 xmmword ptr [r9 + 50 * 16], zmm8, 3
        vpunpckhqdq zmm9, zmm24, zmm26
        vmovdqu32 xmmword ptr [r9 + 6 * 16], xmm9
        vextracti32x4 xmmword ptr [r9 + 22 * 16], zmm9, 1
        vextracti32x4 xmmword ptr [r9 + 38 * 16], zmm9, 2
        vextracti32x4 xmmword ptr [r9 + 54 * 16], zmm9, 3
        vpunpcklqdq zmm10, zmm25, zmm27
        vmovdqu32 xmmword ptr [r9 + 10 * 16], xmm10
        vextracti32x4 xmmword ptr [r9 + 26 * 16], zmm10, 1
        vextracti32x4 xmmword ptr [r9 + 42 * 16], zmm10, 2
        vextracti32x4 xmmword ptr [r9 + 58 * 16], zmm10, 3
        vpunpckhqdq zmm11, zmm25, zmm27
        vmovdqu32 xmmword ptr [r9 + 14 * 16], xmm11
        vextracti32x4 xmmword ptr [r9 + 30 * 16], zmm11, 1
        vextracti32x4 xmmword ptr [r9 + 46 * 16], zmm11, 2
        vextracti32x4 xmmword ptr [r9 + 62 * 16], zmm11, 3

        # Words 12-15 of every block (word group 3).
        vpunpckldq zmm28, zmm12, zmm13
        vpunpckhdq zmm29, zmm12, zmm13
        vpunpckldq zmm30, zmm14, zmm15
        vpunpckhdq zmm31, zmm14, zmm15
        vpunpcklqdq zmm12, zmm28, zmm30
        vmovdqu32 xmmword ptr [r9 + 3 * 16], xmm12
        vextracti32x4 xmmword ptr [r9 + 19 * 16], zmm12, 1
        vextracti32x4 xmmword ptr [r9 + 35 * 16], zmm12, 2
        vextracti32x4 xmmword ptr [r9 + 51 * 16], zmm12, 3
        vpunpckhqdq zmm13, zmm28, zmm30
        vmovdqu32 xmmword ptr [r9 + 7 * 16], xmm13
        vextracti32x4 xmmword ptr [r9 + 23 * 16], zmm13, 1
        vextracti32x4 xmmword ptr [r9 + 39 * 16], zmm13, 2
        vextracti32x4 xmmword ptr [r9 + 55 * 16], zmm13, 3
        vpunpcklqdq zmm14, zmm29, zmm31
        vmovdqu32 xmmword ptr [r9 + 11 * 16], xmm14
        vextracti32x4 xmmword ptr [r9 + 27 * 16], zmm14, 1
        vextracti32x4 xmmword ptr [r9 + 43 * 16], zmm14, 2
        vextracti32x4 xmmword ptr [r9 + 59 * 16], zmm14, 3
        vpunpckhqdq zmm15, zmm29, zmm31
        vmovdqu32 xmmword ptr [r9 + 15 * 16], xmm15
        vextracti32x4 xmmword ptr [r9 + 31 * 16], zmm15, 1
        vextracti32x4 xmmword ptr [r9 + 47 * 16], zmm15, 2
        vextracti32x4 xmmword ptr [r9 + 63 * 16], zmm15, 3

        # Clear the dirty upper halves of zmm0-15 so that legacy-SSE code
        # in the caller does not pay the AVX/SSE state transition cost.
        # Nothing is returned in vector registers, so this is safe.
        vzeroupper
        ret
|
|
# -----------------------------------------------------------------------
# blake3_avx512_xof_xor_16
#
# Runs the 16-way AVX-512 compression kernel on sixteen consecutive
# counter values and XORs the sixteen resulting 64-byte blocks into the
# buffer at r9 in place.
#
# Inputs, as read by the code below (matches System V AMD64 order):
#   rdi  pointer to 8 dwords; broadcast into state words 0-7 before the
#        kernel and XORed into state words 8-15 after it
#   rsi  pointer to 16 dwords; broadcast into zmm16-zmm31 (message words)
#   rdx  64-bit counter of the first block; lane i uses rdx + i
#   ecx  dword broadcast into state word 14
#        (presumably the block length; confirm against the caller)
#   r8d  dword broadcast into state word 15
#        (presumably the domain flags; confirm against the caller)
#   r9   in/out buffer; 16 * 64 = 1024 bytes are read, XORed with the
#        generated blocks and written back (unaligned accesses)
#
# Clobbers: rdx, zmm0-zmm31 (upper halves of zmm0-15 are zeroed on
# return), plus whatever blake3_avx512_kernel_3d_16 clobbers. The code
# relies on the kernel preserving rdi and r9.
#
# NOTE(review): this file is generated by asm.py; the vzeroupper added
# before ret should be mirrored in the generator.
# -----------------------------------------------------------------------
.global blake3_avx512_xof_xor_16
blake3_avx512_xof_xor_16:
        # Index vectors for the vpermi2d counter split further down.
        vmovdqa32 zmm12, zmmword ptr [rip + EVEN_INDEXES]
        vmovdqa32 zmm13, zmmword ptr [rip + ODD_INDEXES]

        # State words 0-7: each dword at rdi is replicated to all lanes.
        vpbroadcastd zmm0, dword ptr [rdi+0]
        vpbroadcastd zmm1, dword ptr [rdi+4]
        vpbroadcastd zmm2, dword ptr [rdi+8]
        vpbroadcastd zmm3, dword ptr [rdi+12]
        vpbroadcastd zmm4, dword ptr [rdi+16]
        vpbroadcastd zmm5, dword ptr [rdi+20]
        vpbroadcastd zmm6, dword ptr [rdi+24]
        vpbroadcastd zmm7, dword ptr [rdi+28]

        # Message words 0-15: every lane compresses the same block.
        vpbroadcastd zmm16, dword ptr [rsi+0]
        vpbroadcastd zmm17, dword ptr [rsi+4]
        vpbroadcastd zmm18, dword ptr [rsi+8]
        vpbroadcastd zmm19, dword ptr [rsi+12]
        vpbroadcastd zmm20, dword ptr [rsi+16]
        vpbroadcastd zmm21, dword ptr [rsi+20]
        vpbroadcastd zmm22, dword ptr [rsi+24]
        vpbroadcastd zmm23, dword ptr [rsi+28]
        vpbroadcastd zmm24, dword ptr [rsi+32]
        vpbroadcastd zmm25, dword ptr [rsi+36]
        vpbroadcastd zmm26, dword ptr [rsi+40]
        vpbroadcastd zmm27, dword ptr [rsi+44]
        vpbroadcastd zmm28, dword ptr [rsi+48]
        vpbroadcastd zmm29, dword ptr [rsi+52]
        vpbroadcastd zmm30, dword ptr [rsi+56]
        vpbroadcastd zmm31, dword ptr [rsi+60]

        # Per-lane counters: zmm9 = rdx + {0..7}, zmm10 = rdx + {8..15}
        # as 64-bit values. vpermi2d then gathers the low dwords into
        # zmm12 (state word 12) and the high dwords into zmm13 (word 13).
        vmovdqa64 zmm8, zmmword ptr [INCREMENT_3D+rip]
        vpbroadcastq zmm9, rdx
        vpaddq zmm9, zmm9, zmm8
        add rdx, 8
        vpbroadcastq zmm10, rdx
        vpaddq zmm10, zmm10, zmm8
        vpermi2d zmm12, zmm9, zmm10
        vpermi2d zmm13, zmm9, zmm10

        # State words 14 and 15 come straight from the scalar arguments.
        vpbroadcastd zmm14, ecx
        vpbroadcastd zmm15, r8d

        # State words 8-11 are the four BLAKE3_IV constants.
        vpbroadcastd zmm8, dword ptr [BLAKE3_IV+rip+0]
        vpbroadcastd zmm9, dword ptr [BLAKE3_IV+rip+4]
        vpbroadcastd zmm10, dword ptr [BLAKE3_IV+rip+8]
        vpbroadcastd zmm11, dword ptr [BLAKE3_IV+rip+12]

        call blake3_avx512_kernel_3d_16

        # Feed-forward for the upper half: state words 8-15 are XORed
        # with the input words at rdi. zmm16-23 are reused as scratch
        # because the message words are no longer needed.
        vpbroadcastd zmm16, dword ptr [rdi + 0 * 4]
        vpxord zmm8, zmm8, zmm16
        vpbroadcastd zmm17, dword ptr [rdi + 1 * 4]
        vpxord zmm9, zmm9, zmm17
        vpbroadcastd zmm18, dword ptr [rdi + 2 * 4]
        vpxord zmm10, zmm10, zmm18
        vpbroadcastd zmm19, dword ptr [rdi + 3 * 4]
        vpxord zmm11, zmm11, zmm19
        vpbroadcastd zmm20, dword ptr [rdi + 4 * 4]
        vpxord zmm12, zmm12, zmm20
        vpbroadcastd zmm21, dword ptr [rdi + 5 * 4]
        vpxord zmm13, zmm13, zmm21
        vpbroadcastd zmm22, dword ptr [rdi + 6 * 4]
        vpxord zmm14, zmm14, zmm22
        vpbroadcastd zmm23, dword ptr [rdi + 7 * 4]
        vpxord zmm15, zmm15, zmm23

        # Full 16x16 dword transpose, stage 1: interleave dwords of
        # neighbouring state words inside every 128-bit lane.
        vpunpckldq zmm16, zmm0, zmm1
        vpunpckhdq zmm17, zmm0, zmm1
        vpunpckldq zmm18, zmm2, zmm3
        vpunpckhdq zmm19, zmm2, zmm3
        vpunpckldq zmm20, zmm4, zmm5
        vpunpckhdq zmm21, zmm4, zmm5
        vpunpckldq zmm22, zmm6, zmm7
        vpunpckhdq zmm23, zmm6, zmm7
        vpunpckldq zmm24, zmm8, zmm9
        vpunpckhdq zmm25, zmm8, zmm9
        vpunpckldq zmm26, zmm10, zmm11
        vpunpckhdq zmm27, zmm10, zmm11
        vpunpckldq zmm28, zmm12, zmm13
        vpunpckhdq zmm29, zmm12, zmm13
        vpunpckldq zmm30, zmm14, zmm15
        vpunpckhdq zmm31, zmm14, zmm15

        # Stage 2: interleave qwords. Afterwards each 128-bit lane holds
        # four consecutive words of a single block.
        vpunpcklqdq zmm0, zmm16, zmm18
        vpunpckhqdq zmm1, zmm16, zmm18
        vpunpcklqdq zmm2, zmm17, zmm19
        vpunpckhqdq zmm3, zmm17, zmm19
        vpunpcklqdq zmm4, zmm20, zmm22
        vpunpckhqdq zmm5, zmm20, zmm22
        vpunpcklqdq zmm6, zmm21, zmm23
        vpunpckhqdq zmm7, zmm21, zmm23
        vpunpcklqdq zmm8, zmm24, zmm26
        vpunpckhqdq zmm9, zmm24, zmm26
        vpunpcklqdq zmm10, zmm25, zmm27
        vpunpckhqdq zmm11, zmm25, zmm27
        vpunpcklqdq zmm12, zmm28, zmm30
        vpunpckhqdq zmm13, zmm28, zmm30
        vpunpcklqdq zmm14, zmm29, zmm31
        vpunpckhqdq zmm15, zmm29, zmm31

        # Stage 3: shuffle whole 128-bit lanes. Immediate 0x88 picks
        # lanes 0 and 2 of each source, 0xdd picks lanes 1 and 3.
        vshufi32x4 zmm16, zmm0, zmm4, 0x88
        vshufi32x4 zmm17, zmm1, zmm5, 0x88
        vshufi32x4 zmm18, zmm2, zmm6, 0x88
        vshufi32x4 zmm19, zmm3, zmm7, 0x88
        vshufi32x4 zmm20, zmm0, zmm4, 0xdd
        vshufi32x4 zmm21, zmm1, zmm5, 0xdd
        vshufi32x4 zmm22, zmm2, zmm6, 0xdd
        vshufi32x4 zmm23, zmm3, zmm7, 0xdd
        vshufi32x4 zmm24, zmm8, zmm12, 0x88
        vshufi32x4 zmm25, zmm9, zmm13, 0x88
        vshufi32x4 zmm26, zmm10, zmm14, 0x88
        vshufi32x4 zmm27, zmm11, zmm15, 0x88
        vshufi32x4 zmm28, zmm8, zmm12, 0xdd
        vshufi32x4 zmm29, zmm9, zmm13, 0xdd
        vshufi32x4 zmm30, zmm10, zmm14, 0xdd
        vshufi32x4 zmm31, zmm11, zmm15, 0xdd

        # Stage 4: second lane shuffle. Register zmmK now holds the
        # complete 64-byte output block K.
        vshufi32x4 zmm0, zmm16, zmm24, 0x88
        vshufi32x4 zmm1, zmm17, zmm25, 0x88
        vshufi32x4 zmm2, zmm18, zmm26, 0x88
        vshufi32x4 zmm3, zmm19, zmm27, 0x88
        vshufi32x4 zmm4, zmm20, zmm28, 0x88
        vshufi32x4 zmm5, zmm21, zmm29, 0x88
        vshufi32x4 zmm6, zmm22, zmm30, 0x88
        vshufi32x4 zmm7, zmm23, zmm31, 0x88
        vshufi32x4 zmm8, zmm16, zmm24, 0xdd
        vshufi32x4 zmm9, zmm17, zmm25, 0xdd
        vshufi32x4 zmm10, zmm18, zmm26, 0xdd
        vshufi32x4 zmm11, zmm19, zmm27, 0xdd
        vshufi32x4 zmm12, zmm20, zmm28, 0xdd
        vshufi32x4 zmm13, zmm21, zmm29, 0xdd
        vshufi32x4 zmm14, zmm22, zmm30, 0xdd
        vshufi32x4 zmm15, zmm23, zmm31, 0xdd

        # Load each 64-byte chunk of the buffer, XOR it with the matching
        # block and write the result back to the same place.
        vmovdqu32 zmm16, zmmword ptr [r9 + 0 * 64]
        vpxord zmm0, zmm0, zmm16
        vmovdqu32 zmmword ptr [r9 + 0 * 64], zmm0
        vmovdqu32 zmm17, zmmword ptr [r9 + 1 * 64]
        vpxord zmm1, zmm1, zmm17
        vmovdqu32 zmmword ptr [r9 + 1 * 64], zmm1
        vmovdqu32 zmm18, zmmword ptr [r9 + 2 * 64]
        vpxord zmm2, zmm2, zmm18
        vmovdqu32 zmmword ptr [r9 + 2 * 64], zmm2
        vmovdqu32 zmm19, zmmword ptr [r9 + 3 * 64]
        vpxord zmm3, zmm3, zmm19
        vmovdqu32 zmmword ptr [r9 + 3 * 64], zmm3
        vmovdqu32 zmm20, zmmword ptr [r9 + 4 * 64]
        vpxord zmm4, zmm4, zmm20
        vmovdqu32 zmmword ptr [r9 + 4 * 64], zmm4
        vmovdqu32 zmm21, zmmword ptr [r9 + 5 * 64]
        vpxord zmm5, zmm5, zmm21
        vmovdqu32 zmmword ptr [r9 + 5 * 64], zmm5
        vmovdqu32 zmm22, zmmword ptr [r9 + 6 * 64]
        vpxord zmm6, zmm6, zmm22
        vmovdqu32 zmmword ptr [r9 + 6 * 64], zmm6
        vmovdqu32 zmm23, zmmword ptr [r9 + 7 * 64]
        vpxord zmm7, zmm7, zmm23
        vmovdqu32 zmmword ptr [r9 + 7 * 64], zmm7
        vmovdqu32 zmm24, zmmword ptr [r9 + 8 * 64]
        vpxord zmm8, zmm8, zmm24
        vmovdqu32 zmmword ptr [r9 + 8 * 64], zmm8
        vmovdqu32 zmm25, zmmword ptr [r9 + 9 * 64]
        vpxord zmm9, zmm9, zmm25
        vmovdqu32 zmmword ptr [r9 + 9 * 64], zmm9
        vmovdqu32 zmm26, zmmword ptr [r9 + 10 * 64]
        vpxord zmm10, zmm10, zmm26
        vmovdqu32 zmmword ptr [r9 + 10 * 64], zmm10
        vmovdqu32 zmm27, zmmword ptr [r9 + 11 * 64]
        vpxord zmm11, zmm11, zmm27
        vmovdqu32 zmmword ptr [r9 + 11 * 64], zmm11
        vmovdqu32 zmm28, zmmword ptr [r9 + 12 * 64]
        vpxord zmm12, zmm12, zmm28
        vmovdqu32 zmmword ptr [r9 + 12 * 64], zmm12
        vmovdqu32 zmm29, zmmword ptr [r9 + 13 * 64]
        vpxord zmm13, zmm13, zmm29
        vmovdqu32 zmmword ptr [r9 + 13 * 64], zmm13
        vmovdqu32 zmm30, zmmword ptr [r9 + 14 * 64]
        vpxord zmm14, zmm14, zmm30
        vmovdqu32 zmmword ptr [r9 + 14 * 64], zmm14
        vmovdqu32 zmm31, zmmword ptr [r9 + 15 * 64]
        vpxord zmm15, zmm15, zmm31
        vmovdqu32 zmmword ptr [r9 + 15 * 64], zmm15

        # Clear the dirty upper halves of zmm0-15 so that legacy-SSE code
        # in the caller does not pay the AVX/SSE state transition cost.
        # Nothing is returned in vector registers, so this is safe.
        vzeroupper
        ret
|
|
# -----------------------------------------------------------------------
# Constant tables shared by the kernels in this file.
# .p2align N aligns to 2^N bytes; .int emits 32-bit values on x86.
# -----------------------------------------------------------------------

# Four 32-bit IV words. The XOF entry points broadcast them one by one
# into state words 8-11; the _0.._3 labels address the individual words.
        .p2align 4
BLAKE3_IV:
BLAKE3_IV_0:
        .int 0x6A09E667
BLAKE3_IV_1:
        .int 0xBB67AE85
BLAKE3_IV_2:
        .int 0x3C6EF372
BLAKE3_IV_3:
        .int 0xA54FF53A

# Byte-index tables, one 16-byte vector each. As byte-shuffle masks they
# would rotate every 32-bit lane right by 16 and by 8 bits respectively
# (presumably consumed by pshufb in kernels outside this view).
        .p2align 4
ROT16:
        .byte 2, 3, 0, 1
        .byte 6, 7, 4, 5
        .byte 10, 11, 8, 9
        .byte 14, 15, 12, 13
ROT8:
        .byte 1, 2, 3, 0
        .byte 5, 6, 7, 4
        .byte 9, 10, 11, 8
        .byte 13, 14, 15, 12

# Counter increments as 64-bit lanes, 64 bytes per table. INCREMENT_3D
# is added to a broadcast 64-bit counter by the XOF entry points and is
# loaded with an aligned 64-byte move, so it must stay 64-byte aligned
# (it directly follows the 64-byte INCREMENT_2D table).
        .p2align 6
INCREMENT_2D:
        .quad 0, 0
        .quad 1, 0
        .quad 2, 0
        .quad 3, 0
INCREMENT_3D:
        .quad 0, 1, 2, 3
        .quad 4, 5, 6, 7

# Dword selectors for vpermi2d over a pair of zmm registers holding
# sixteen 64-bit counters: the even indexes pick the low halves and the
# odd indexes pick the high halves. Both are loaded with aligned
# 64-byte moves.
        .p2align 6
EVEN_INDEXES:
        .int 0, 2, 4, 6
        .int 8, 10, 12, 14
        .int 16, 18, 20, 22
        .int 24, 26, 28, 30
ODD_INDEXES:
        .int 1, 3, 5, 7
        .int 9, 11, 13, 15
        .int 17, 19, 21, 23
        .int 25, 27, 29, 31
|