mirror of https://github.com/BLAKE3-team/BLAKE3

Start SSE2 implementation based on SSE4.1 version

Wire up basic functions and features for SSE2 support using the SSE4.1 version
as a basis without implementing the SSE2 instructions yet.

 * Cargo.toml: add no_sse2 feature
 * benches/bench.rs: wire SSE2 benchmarks
 * build.rs: add SSE2 rust intrinsics and assembly builds
 * c/Makefile.testing: add SSE2 C and assembly targets
 * c/README.md: add SSE2 to C build instructions
 * c/blake3_c_rust_bindings/build.rs: add SSE2 C rust binding builds
 * c/blake3_c_rust_bindings/src/lib.rs: add SSE2 C rust bindings
 * c/blake3_dispatch.c: add SSE2 C dispatch
 * c/blake3_impl.h: add SSE2 C function prototypes
 * c/blake3_sse2.c: add SSE2 C intrinsic file starting with SSE4.1 version
 * c/blake3_sse2_x86-64_{unix.S,windows_gnu.S,windows_msvc.asm}: add SSE2
   assembly files starting with SSE4.1 version
 * src/ffi_sse2.rs: add rust implementation using SSE2 C rust bindings
 * src/lib.rs: add module configuration for the SSE2 rust intrinsics and the
   SSE2 C rust bindings
 * src/platform.rs: add SSE2 rust platform detection and dispatch
 * src/rust_sse2.rs: add SSE2 rust intrinsic file starting with SSE4.1 version
 * tools/instruction_set_support/src/main.rs: add SSE2 feature detection
Matthew Krupcale 2020-08-14 18:02:06 -04:00
parent adbf07d67a
commit d91f20dd29
18 changed files with 7841 additions and 14 deletions
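Since the new intrinsics file starts as a verbatim copy of the SSE4.1 version, it still uses `_mm_shuffle_epi8` (SSSE3) and `_mm_blend_epi16` (SSE4.1), which plain SSE2 does not provide; replacing them (for example, blends via and/andnot/or masking) is the follow-up work this commit defers. A minimal sketch of SSE2-only lane rotations, with illustrative helper names that are not part of this commit:

```c
#include <emmintrin.h> // SSE2 only

// Rotate each 32-bit lane right by a fixed amount using two shifts and an OR,
// which needs nothing beyond SSE2 (no pshufb from SSSE3).
static inline __m128i rot16_sse2(__m128i x) {
  return _mm_or_si128(_mm_srli_epi32(x, 16), _mm_slli_epi32(x, 32 - 16));
}

static inline __m128i rot8_sse2(__m128i x) {
  return _mm_or_si128(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}
```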

Cargo.toml

@ -61,6 +61,7 @@ prefer_intrinsics = []
# level turns out to be the right approach, then we can design a stable
# feature. Until then, we reserve the right to break these features in a patch
# release.
no_sse2 = []
no_sse41 = []
no_avx2 = []
no_avx512 = []

benches/bench.rs

@ -60,6 +60,14 @@ fn bench_single_compression_portable(b: &mut Bencher) {
bench_single_compression_fn(b, Platform::portable());
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_single_compression_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) {
@ -102,6 +110,14 @@ fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_many_chunks_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) {
@ -161,6 +177,14 @@ fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_many_parents_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) {

build.rs

@ -118,34 +118,40 @@ fn c_compiler_support() -> CCompilerSupport {
}
}
fn build_sse41_avx2_rust_intrinsics() {
// No C code to compile here. Set the cfg flags that enable the Rust SSE4.1
// and AVX2 intrinsics modules. The regular Cargo build will compile them.
fn build_sse2_sse41_avx2_rust_intrinsics() {
// No C code to compile here. Set the cfg flags that enable the Rust SSE2,
// SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile
// them.
println!("cargo:rustc-cfg=blake3_sse2_rust");
println!("cargo:rustc-cfg=blake3_sse41_rust");
println!("cargo:rustc-cfg=blake3_avx2_rust");
}
fn build_sse41_avx2_assembly() {
fn build_sse2_sse41_avx2_assembly() {
// Build the assembly implementations for SSE2, SSE4.1, and AVX2. This is
// preferred, but it only supports x86_64.
assert!(is_x86_64());
println!("cargo:rustc-cfg=blake3_sse2_ffi");
println!("cargo:rustc-cfg=blake3_sse41_ffi");
println!("cargo:rustc-cfg=blake3_avx2_ffi");
let mut build = new_build();
if is_windows_msvc() {
build.file("c/blake3_sse2_x86-64_windows_msvc.asm");
build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
build.file("c/blake3_avx2_x86-64_windows_msvc.asm");
} else if is_windows_gnu() {
build.file("c/blake3_sse2_x86-64_windows_gnu.S");
build.file("c/blake3_sse41_x86-64_windows_gnu.S");
build.file("c/blake3_avx2_x86-64_windows_gnu.S");
} else {
// All non-Windows implementations are assumed to support
// Linux-style assembly. These files do contain a small
// explicit workaround for macOS also.
build.file("c/blake3_sse2_x86-64_unix.S");
build.file("c/blake3_sse41_x86-64_unix.S");
build.file("c/blake3_avx2_x86-64_unix.S");
}
build.compile("blake3_sse41_avx2_assembly");
build.compile("blake3_sse2_sse41_avx2_assembly");
}
fn build_avx512_c_intrinsics() {
@ -215,11 +221,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
if is_x86_64() || is_x86_32() {
let support = c_compiler_support();
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
build_sse41_avx2_rust_intrinsics();
build_sse2_sse41_avx2_rust_intrinsics();
} else {
// We assume that all C compilers can assemble SSE2, SSE4.1, and AVX2. We
// don't explicitly check for support.
build_sse41_avx2_assembly();
build_sse2_sse41_avx2_assembly();
}
if is_pure() || support == NoCompiler || support == NoAVX512 {

c/Makefile.testing

@ -9,6 +9,13 @@ TARGETS=
ASM_TARGETS=
EXTRAFLAGS=-Wa,--noexecstack
ifdef BLAKE3_NO_SSE2
EXTRAFLAGS += -DBLAKE3_NO_SSE2
else
TARGETS += blake3_sse2.o
ASM_TARGETS += blake3_sse2_x86-64_unix.S
endif
ifdef BLAKE3_NO_SSE41
EXTRAFLAGS += -DBLAKE3_NO_SSE41
else
@ -38,6 +45,9 @@ endif
all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
blake3_sse2.o: blake3_sse2.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2
blake3_sse41.o: blake3_sse41.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1

c/README.md

@ -40,7 +40,8 @@ with a Unix-like OS, you can compile a working binary like this:
```bash
gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
# API
@ -144,8 +145,8 @@ by hand. Note that these steps may change in future versions.
Dynamic dispatch is enabled by default on x86. The implementation will
query the CPU at runtime to detect SIMD support, and it will use the
widest instruction set available. By default, `blake3_dispatch.c`
expects to be linked with code for four different instruction sets:
portable C, SSE4.1, AVX2, and AVX-512.
expects to be linked with code for five different instruction sets:
portable C, SSE2, SSE4.1, AVX2, and AVX-512.
For each of the x86 SIMD instruction sets, two versions are available,
one in assembly (with three flavors: Unix, Windows MSVC, and Windows
@ -160,7 +161,8 @@ the assembly implementations:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
When building the intrinsics-based implementations, you need to build
@ -169,11 +171,12 @@ explicitly enabled in the compiler. Here's the same shared library using
the intrinsics-based implementations:
```bash
gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_avx2.o blake3_avx512.o blake3_sse41.o
blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
```
Note above that building `blake3_avx512.c` requires both `-mavx512f` and
@ -187,8 +190,8 @@ each instruction set. Here's an example of building a shared library on
x86 with only portable code:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \
blake3.c blake3_dispatch.c blake3_portable.c
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
-DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
```
## ARM NEON

c/blake3_c_rust_bindings/build.rs

@ -60,12 +60,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// "prefer_intrinsics" feature is enabled.
if is_windows_msvc() {
let mut build = new_build();
build.file("../blake3_sse2_x86-64_windows_msvc.asm");
build.file("../blake3_sse41_x86-64_windows_msvc.asm");
build.file("../blake3_avx2_x86-64_windows_msvc.asm");
build.file("../blake3_avx512_x86-64_windows_msvc.asm");
build.compile("blake3_asm");
} else if is_windows_gnu() {
let mut build = new_build();
build.file("../blake3_sse2_x86-64_windows_gnu.S");
build.file("../blake3_sse41_x86-64_windows_gnu.S");
build.file("../blake3_avx2_x86-64_windows_gnu.S");
build.file("../blake3_avx512_x86-64_windows_gnu.S");
@ -75,6 +77,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// Linux-style assembly. These files do contain a small
// explicit workaround for macOS also.
let mut build = new_build();
build.file("../blake3_sse2_x86-64_unix.S");
build.file("../blake3_sse41_x86-64_unix.S");
build.file("../blake3_avx2_x86-64_unix.S");
build.file("../blake3_avx512_x86-64_unix.S");
@ -87,6 +90,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// compiled separately, with the corresponding instruction set
// extension explicitly enabled in the compiler.
let mut sse2_build = new_build();
sse2_build.file("../blake3_sse2.c");
if is_windows_msvc() {
// /arch:SSE2 is the default on x86 and undefined on x86_64:
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
// It also includes SSE4.1 intrinsics:
// https://stackoverflow.com/a/32183222/823869
} else {
sse2_build.flag("-msse2");
}
sse2_build.compile("blake3_sse2");
let mut sse41_build = new_build();
sse41_build.file("../blake3_sse41.c");
if is_windows_msvc() {

c/blake3_c_rust_bindings/src/lib.rs

@ -15,6 +15,11 @@ pub const OUT_LEN: usize = 32;
// Feature detection functions for tests and benchmarks. Note that the C code
// does its own feature detection in blake3_dispatch.c.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse2_detected() -> bool {
is_x86_feature_detected!("sse2")
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41_detected() -> bool {
is_x86_feature_detected!("sse4.1")
@ -153,6 +158,35 @@ pub mod ffi {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
extern "C" {
// SSE2 low level functions
pub fn blake3_compress_in_place_sse2(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse2(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// SSE4.1 low level functions
pub fn blake3_compress_in_place_sse41(
cv: *mut u32,

c/blake3_dispatch.c

@ -149,6 +149,12 @@ void blake3_compress_in_place(uint32_t cv[8],
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
@ -171,6 +177,12 @@ void blake3_compress_xof(const uint32_t cv[8],
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
@ -205,6 +217,14 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if defined(BLAKE3_USE_NEON)
@ -237,6 +257,11 @@ size_t blake3_simd_degree(void) {
return 4;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
return 4;
}
#endif
#endif
#if defined(BLAKE3_USE_NEON)
return 4;

c/blake3_impl.h

@ -182,6 +182,21 @@ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
uint8_t flags_end, uint8_t *out);
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],

c/blake3_sse2.c (new file, 559 lines)

@ -0,0 +1,559 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 4
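// The _mm_shuffle_ps2 helper below reuses the float shuffle (shufps) on
// integer data: the low two lanes of the result are selected from `a` and the
// high two lanes from `b`, according to the immediate `c`.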
#define _mm_shuffle_ps2(a, b, c) \
(_mm_castps_si128( \
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
INLINE __m128i loadu(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
INLINE void storeu(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}
INLINE __m128i rot16(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}
INLINE __m128i rot12(__m128i x) {
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}
INLINE __m128i rot8(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}
INLINE __m128i rot7(__m128i x) {
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot16(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot12(*row1);
}
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot8(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot7(*row1);
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu((uint8_t *)&cv[0]);
rows[1] = loadu((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
rows[3] = set4(counter_low(counter), counter_high(counter),
(uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
__m128i t0, t1, t2, t3, tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
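// For reference, that fixed permutation of the sixteen message words is
// 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 (the BLAKE3 message
// schedule), realized below by the shuffle/unpack/blend sequences.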
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), &out[0]);
storeu(xorv(rows[1], rows[3]), &out[16]);
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
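// For example, with vecs = {a, b, c, d} where a = [a0 a1 a2 a3] and so on,
// the result is {[a0 b0 c0 d0], [a1 b1 c1 d1], [a2 b2 c2 d2], [a3 b3 c3 d3]}.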
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m128i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
for (size_t i = 0; i < 4; ++i) {
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[4]);
transpose_vecs(&out[8]);
transpose_vecs(&out[12]);
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m128i *out_lo, __m128i *out_hi) {
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
const __m128i add1 = _mm_and_si128(mask, add0);
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
// SSE2 has no unsigned 32-bit compare, so flip the sign bits and use the
// signed compare to detect carry out of the low counter words.
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                _mm_xor_si128(l, _mm_set1_epi32(0x80000000)));
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m128i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m128i block_flags_vec = set1(block_flags);
__m128i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
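// Hash DEGREE (4) inputs at a time with the vectorized path, then fall back
// to the single-input compression loop for any remainder.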
while (num_inputs >= DEGREE) {
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}

c/blake3_sse2_x86-64_unix.S (new file, 2028 lines)

@ -0,0 +1,2028 @@
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif
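# The ELF note above marks the stack as non-executable, and the optional
# <cet.h> include defines _CET_ENDBR so each entry point can begin with an
# ENDBR instruction on CET-enabled builds; the fallback define expands to
# nothing elsewhere.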
.intel_syntax noprefix
.global blake3_hash_many_sse2
.global _blake3_hash_many_sse2
.global blake3_compress_in_place_sse2
.global _blake3_compress_in_place_sse2
.global blake3_compress_xof_sse2
.global _blake3_compress_xof_sse2
#ifdef __APPLE__
.text
#else
.section .text
#endif
.p2align 6
_blake3_hash_many_sse2:
blake3_hash_many_sse2:
_CET_ENDBR
push r15
push r14
push r13
push r12
push rbx
push rbp
mov rbp, rsp
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0+rip]
pand xmm0, xmmword ptr [ADD1+rip]
movdqa xmmword ptr [rsp+0x150], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 0x00
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+0x110], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 0x00
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
mov rbx, qword ptr [rbp+0x50]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+0x38]
movzx r12d, byte ptr [rbp+0x48]
cmp rsi, 4
jc 3f
2:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 0x00
pshufd xmm1, xmm3, 0x55
pshufd xmm2, xmm3, 0xAA
pshufd xmm3, xmm3, 0xFF
movdqu xmm7, xmmword ptr [rcx+0x10]
pshufd xmm4, xmm7, 0x00
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
9:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+0x10], xmm9
movdqa xmmword ptr [rsp+0x20], xmm12
movdqa xmmword ptr [rsp+0x30], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x40], xmm8
movdqa xmmword ptr [rsp+0x50], xmm9
movdqa xmmword ptr [rsp+0x60], xmm12
movdqa xmmword ptr [rsp+0x70], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x80], xmm8
movdqa xmmword ptr [rsp+0x90], xmm9
movdqa xmmword ptr [rsp+0xA0], xmm12
movdqa xmmword ptr [rsp+0xB0], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0xC0], xmm8
movdqa xmmword ptr [rsp+0xD0], xmm9
movdqa xmmword ptr [rsp+0xE0], xmm12
movdqa xmmword ptr [rsp+0xF0], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
movdqa xmm12, xmmword ptr [rsp+0x110]
movdqa xmm13, xmmword ptr [rsp+0x120]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
movd xmm15, eax
pshufd xmm15, xmm15, 0x00
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x80]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x70]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xB0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x50]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xC0]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xA0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0x60]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xF0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne 9b
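/* End of the 4-way chunk loop: xmm0-xmm7 hold the eight output words of all
   four lanes in word-major (transposed) order. Un-transpose them with the
   punpck sequence below and store four contiguous 32-byte chaining values
   at [rbx]. */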
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+0x20], xmm1
movdqu xmmword ptr [rbx+0x40], xmm9
movdqu xmmword ptr [rbx+0x60], xmm3
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
movdqu xmmword ptr [rbx+0x10], xmm4
movdqu xmmword ptr [rbx+0x30], xmm5
movdqu xmmword ptr [rbx+0x50], xmm9
movdqu xmmword ptr [rbx+0x70], xmm7
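/* Advance the per-lane 64-bit block counters: add the stride to the low
   words, then use the sign-flip + pcmpgtd unsigned-compare trick to add a
   carry to the high word of any lane whose low word wrapped. */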
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+0x150]
movdqa xmmword ptr [rsp+0x110], xmm1
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm0, xmm1
movdqa xmm1, xmmword ptr [rsp+0x120]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
add rdi, 32
sub rsi, 4
cmp rsi, 4
jnc 2b
test rsi, rsi
jnz 3f
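/* 4: all inputs hashed. Restore the stack pointer and the callee-saved
   registers, then return. */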
4:
mov rsp, rbp
pop rbp
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
.p2align 5
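/* Fewer than four inputs remain. Bit 1 of the remaining count selects the
   two-input path below; bit 0 selects the single-input path further down. */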
3:
test esi, 0x2
je 3f
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm10, xmm2
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 0x93
movups xmm12, xmmword ptr [r9+rdx-0x40]
movups xmm13, xmmword ptr [r9+rdx-0x30]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-0x20]
movups xmm15, xmmword ptr [r9+rdx-0x10]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
pinsrd xmm3, eax, 3
pinsrd xmm11, eax, 3
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+0x20], xmm4
movaps xmmword ptr [rsp+0x30], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm12, xmmword ptr [ROT16+rip]
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+0x40], xmm5
movaps xmmword ptr [rsp+0x50], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm13, xmmword ptr [ROT8+rip]
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x93
pshufd xmm8, xmm8, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x39
pshufd xmm10, xmm10, 0x39
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x39
pshufd xmm8, xmm8, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x93
pshufd xmm10, xmm10, 0x93
dec al
je 9f
movdqa xmm12, xmmword ptr [rsp+0x20]
movdqa xmm5, xmmword ptr [rsp+0x40]
pshufd xmm13, xmm12, 0x0F
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pblendw xmm13, xmm12, 0xCC
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
pblendw xmm12, xmm6, 0xC0
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmmword ptr [rsp+0x20], xmm13
movdqa xmmword ptr [rsp+0x40], xmm12
movdqa xmm5, xmmword ptr [rsp+0x30]
movdqa xmm13, xmmword ptr [rsp+0x50]
pshufd xmm6, xmm5, 0x0F
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pblendw xmm6, xmm5, 0xCC
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
pblendw xmm5, xmm14, 0xC0
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 0x1E
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+0x20]
movdqa xmm6, xmmword ptr [rsp+0x40]
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
movdqa xmm0, xmmword ptr [rsp+0x130]
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm2, xmmword ptr [rsp+0x120]
movdqu xmm3, xmmword ptr [rsp+0x118]
movdqu xmm4, xmmword ptr [rsp+0x128]
blendvps xmm1, xmm3, xmm0
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
add rdi, 16
add rbx, 64
sub rsi, 2
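/* One input left: hash it with a plain single-lane compression loop. */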
3:
test esi, 0x1
je 4b
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
pinsrd xmm3, eax, 3
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
jmp 4b
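/* blake3_compress_in_place_sse2(cv, block, block_len, counter, flags).
   System V arguments: rdi = 32-byte cv, rsi = 64-byte block, edx = block_len,
   rcx = counter, r8d = flags. The updated cv is written back to [rdi]. */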
.p2align 6
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
_CET_ENDBR
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
shl r8, 32
add rdx, r8
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rdi], xmm0
movups xmmword ptr [rdi+0x10], xmm1
ret
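/* blake3_compress_xof_sse2: same arguments as the in-place variant plus
   r9 = 64-byte output buffer, which receives the full extended output
   (second half XORed with the original cv). */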
.p2align 6
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
_CET_ENDBR
movups xmm0, xmmword ptr [rdi]
movups xmm1, xmmword ptr [rdi+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movzx eax, r8b
movzx edx, dl
shl rax, 32
add rdx, rax
movq xmm3, rcx
movq xmm4, rdx
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rsi]
movups xmm5, xmmword ptr [rsi+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rsi+0x20]
movups xmm7, xmmword ptr [rsi+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
movdqu xmm4, xmmword ptr [rdi]
movdqu xmm5, xmmword ptr [rdi+0x10]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r9], xmm0
movups xmmword ptr [r9+0x10], xmm1
movups xmmword ptr [r9+0x20], xmm2
movups xmmword ptr [r9+0x30], xmm3
ret
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
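/* Constants: BLAKE3_IV holds IV[0..3], the third row of the compression
   state; ROT16/ROT8 are pshufb masks rotating each 32-bit lane right by 16
   and 8 bits; ADD0/ADD1 are the per-lane counter offsets and per-iteration
   stride; BLAKE3_IV_0..3 broadcast single IV words across all four lanes;
   CMP_MSB_MASK flips sign bits so pcmpgtd can act as an unsigned compare
   for counter carry propagation. */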
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
ROT16:
.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
.long 0, 1, 2, 3
ADD1:
.long 4, 4, 4, 4
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000

View File

@ -0,0 +1,2069 @@
.intel_syntax noprefix
.global blake3_hash_many_sse2
.global _blake3_hash_many_sse2
.global blake3_compress_in_place_sse2
.global _blake3_compress_in_place_sse2
.global blake3_compress_xof_sse2
.global _blake3_compress_xof_sse2
.section .text
.p2align 6
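/* Windows x64 entry for blake3_hash_many_sse2: the prologue saves the
   non-volatile GPRs and xmm6-xmm15, then moves the Windows-convention
   arguments into the registers the Unix body expects, so the hashing code
   that follows matches the Unix version. */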
_blake3_hash_many_sse2:
blake3_hash_many_sse2:
push r15
push r14
push r13
push r12
push rsi
push rdi
push rbx
push rbp
mov rbp, rsp
sub rsp, 528
and rsp, 0xFFFFFFFFFFFFFFC0
movdqa xmmword ptr [rsp+0x170], xmm6
movdqa xmmword ptr [rsp+0x180], xmm7
movdqa xmmword ptr [rsp+0x190], xmm8
movdqa xmmword ptr [rsp+0x1A0], xmm9
movdqa xmmword ptr [rsp+0x1B0], xmm10
movdqa xmmword ptr [rsp+0x1C0], xmm11
movdqa xmmword ptr [rsp+0x1D0], xmm12
movdqa xmmword ptr [rsp+0x1E0], xmm13
movdqa xmmword ptr [rsp+0x1F0], xmm14
movdqa xmmword ptr [rsp+0x200], xmm15
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
mov r8, qword ptr [rbp+0x68]
movzx r9, byte ptr [rbp+0x70]
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0+rip]
pand xmm0, xmmword ptr [ADD1+rip]
movdqa xmmword ptr [rsp+0x150], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 0x00
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+0x110], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 0x00
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
mov rbx, qword ptr [rbp+0x90]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+0x78]
movzx r12d, byte ptr [rbp+0x88]
cmp rsi, 4
jc 3f
2:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 0x00
pshufd xmm1, xmm3, 0x55
pshufd xmm2, xmm3, 0xAA
pshufd xmm3, xmm3, 0xFF
movdqu xmm7, xmmword ptr [rcx+0x10]
pshufd xmm4, xmm7, 0x00
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
xor edx, edx
9:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+0x10], xmm9
movdqa xmmword ptr [rsp+0x20], xmm12
movdqa xmmword ptr [rsp+0x30], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x40], xmm8
movdqa xmmword ptr [rsp+0x50], xmm9
movdqa xmmword ptr [rsp+0x60], xmm12
movdqa xmmword ptr [rsp+0x70], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0x80], xmm8
movdqa xmmword ptr [rsp+0x90], xmm9
movdqa xmmword ptr [rsp+0xA0], xmm12
movdqa xmmword ptr [rsp+0xB0], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0xC0], xmm8
movdqa xmmword ptr [rsp+0xD0], xmm9
movdqa xmmword ptr [rsp+0xE0], xmm12
movdqa xmmword ptr [rsp+0xF0], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
movdqa xmm12, xmmword ptr [rsp+0x110]
movdqa xmm13, xmmword ptr [rsp+0x120]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
movd xmm15, eax
pshufd xmm15, xmm15, 0x00
prefetcht0 [r8+rdx+0x80]
prefetcht0 [r9+rdx+0x80]
prefetcht0 [r10+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x80]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x70]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x10]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0xD0]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x60]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xB0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x50]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0xE0]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x40]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x50]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xC0]
paddd xmm1, xmmword ptr [rsp+0x90]
paddd xmm2, xmmword ptr [rsp+0xF0]
paddd xmm3, xmmword ptr [rsp+0xE0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0xA0]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0x70]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x20]
paddd xmm1, xmmword ptr [rsp+0x30]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x90]
paddd xmm1, xmmword ptr [rsp+0xB0]
paddd xmm2, xmmword ptr [rsp+0x80]
paddd xmm3, xmmword ptr [rsp+0xF0]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0xC0]
paddd xmm3, xmmword ptr [rsp+0x10]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xD0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x20]
paddd xmm3, xmmword ptr [rsp+0x40]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0x30]
paddd xmm1, xmmword ptr [rsp+0xA0]
paddd xmm2, xmmword ptr [rsp+0x60]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xB0]
paddd xmm1, xmmword ptr [rsp+0x50]
paddd xmm2, xmmword ptr [rsp+0x10]
paddd xmm3, xmmword ptr [rsp+0x80]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xF0]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0x90]
paddd xmm3, xmmword ptr [rsp+0x60]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0xE0]
paddd xmm1, xmmword ptr [rsp+0x20]
paddd xmm2, xmmword ptr [rsp+0x30]
paddd xmm3, xmmword ptr [rsp+0x70]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+0x100], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0xA0]
paddd xmm1, xmmword ptr [rsp+0xC0]
paddd xmm2, xmmword ptr [rsp+0x40]
paddd xmm3, xmmword ptr [rsp+0xD0]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8+rip]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne 9b
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+0x20], xmm1
movdqu xmmword ptr [rbx+0x40], xmm9
movdqu xmmword ptr [rbx+0x60], xmm3
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
movdqu xmmword ptr [rbx+0x10], xmm4
movdqu xmmword ptr [rbx+0x30], xmm5
movdqu xmmword ptr [rbx+0x50], xmm9
movdqu xmmword ptr [rbx+0x70], xmm7
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+0x150]
movdqa xmmword ptr [rsp+0x110], xmm1
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
pcmpgtd xmm0, xmm1
movdqa xmm1, xmmword ptr [rsp+0x120]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
add rdi, 32
sub rsi, 4
cmp rsi, 4
jnc 2b
test rsi, rsi
jne 3f
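/* 4: all inputs hashed. Restore the non-volatile XMM registers and GPRs
   required by the Windows x64 ABI, then return. */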
4:
movdqa xmm6, xmmword ptr [rsp+0x170]
movdqa xmm7, xmmword ptr [rsp+0x180]
movdqa xmm8, xmmword ptr [rsp+0x190]
movdqa xmm9, xmmword ptr [rsp+0x1A0]
movdqa xmm10, xmmword ptr [rsp+0x1B0]
movdqa xmm11, xmmword ptr [rsp+0x1C0]
movdqa xmm12, xmmword ptr [rsp+0x1D0]
movdqa xmm13, xmmword ptr [rsp+0x1E0]
movdqa xmm14, xmmword ptr [rsp+0x1F0]
movdqa xmm15, xmmword ptr [rsp+0x200]
mov rsp, rbp
pop rbp
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
ret
.p2align 5
3:
test esi, 0x2
je 3f
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm10, xmm2
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 0x93
movups xmm12, xmmword ptr [r9+rdx-0x40]
movups xmm13, xmmword ptr [r9+rdx-0x30]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-0x20]
movups xmm15, xmmword ptr [r9+rdx-0x10]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
pinsrd xmm3, eax, 3
pinsrd xmm11, eax, 3
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+0x20], xmm4
movaps xmmword ptr [rsp+0x30], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm12, xmmword ptr [ROT16+rip]
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+0x40], xmm5
movaps xmmword ptr [rsp+0x50], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm13, xmmword ptr [ROT8+rip]
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x93
pshufd xmm8, xmm8, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x39
pshufd xmm10, xmm10, 0x39
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 0x39
pshufd xmm8, xmm8, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm11, xmm11, 0x4E
pshufd xmm2, xmm2, 0x93
pshufd xmm10, xmm10, 0x93
dec al
je 9f
movdqa xmm12, xmmword ptr [rsp+0x20]
movdqa xmm5, xmmword ptr [rsp+0x40]
pshufd xmm13, xmm12, 0x0F
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pblendw xmm13, xmm12, 0xCC
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
pblendw xmm12, xmm6, 0xC0
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmmword ptr [rsp+0x20], xmm13
movdqa xmmword ptr [rsp+0x40], xmm12
movdqa xmm5, xmmword ptr [rsp+0x30]
movdqa xmm13, xmmword ptr [rsp+0x50]
pshufd xmm6, xmm5, 0x0F
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pblendw xmm6, xmm5, 0xCC
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
pblendw xmm5, xmm14, 0xC0
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 0x1E
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+0x20]
movdqa xmm6, xmmword ptr [rsp+0x40]
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
movdqa xmm0, xmmword ptr [rsp+0x130]
movdqa xmm1, xmmword ptr [rsp+0x110]
movdqa xmm2, xmmword ptr [rsp+0x120]
movdqu xmm3, xmmword ptr [rsp+0x118]
movdqu xmm4, xmmword ptr [rsp+0x128]
blendvps xmm1, xmm3, xmm0
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
add rdi, 16
add rbx, 64
sub rsi, 2
3:
test esi, 0x1
je 4b
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
pinsrd xmm13, dword ptr [rsp+0x120], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
xor edx, edx
2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
pinsrd xmm3, eax, 3
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [r8+rdx-0x20]
movups xmm7, xmmword ptr [r8+rdx-0x10]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne 2b
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+0x10], xmm1
jmp 4b
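/* Windows x64 blake3_compress_in_place_sse2: rcx = cv, rdx = block,
   r8b = block_len, r9 = counter, flags read from the caller's stack; the
   non-volatile XMM registers it clobbers are spilled to a 120-byte frame. */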
.p2align 6
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+0x10], xmm7
movdqa xmmword ptr [rsp+0x20], xmm8
movdqa xmmword ptr [rsp+0x30], xmm9
movdqa xmmword ptr [rsp+0x40], xmm11
movdqa xmmword ptr [rsp+0x50], xmm14
movdqa xmmword ptr [rsp+0x60], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movzx eax, byte ptr [rsp+0xA0]
movzx r8d, r8b
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+0x20]
movups xmm7, xmmword ptr [rdx+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rcx], xmm0
movups xmmword ptr [rcx+0x10], xmm1
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+0x10]
movdqa xmm8, xmmword ptr [rsp+0x20]
movdqa xmm9, xmmword ptr [rsp+0x30]
movdqa xmm11, xmmword ptr [rsp+0x40]
movdqa xmm14, xmmword ptr [rsp+0x50]
movdqa xmm15, xmmword ptr [rsp+0x60]
add rsp, 120
ret
.p2align 6
_blake3_compress_xof_sse2:
blake3_compress_xof_sse2:
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+0x10], xmm7
movdqa xmmword ptr [rsp+0x20], xmm8
movdqa xmmword ptr [rsp+0x30], xmm9
movdqa xmmword ptr [rsp+0x40], xmm11
movdqa xmmword ptr [rsp+0x50], xmm14
movdqa xmmword ptr [rsp+0x60], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movzx eax, byte ptr [rsp+0xA0]
movzx r8d, r8b
mov r10, qword ptr [rsp+0xA8]
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+0x10]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+0x20]
movups xmm7, xmmword ptr [rdx+0x30]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x93
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x39
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 0x39
pshufd xmm3, xmm3, 0x4E
pshufd xmm2, xmm2, 0x93
dec al
jz 9f
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0x0F
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0xCC
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0xC0
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 0x1E
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp 9b
9:
movdqu xmm4, xmmword ptr [rcx]
movdqu xmm5, xmmword ptr [rcx+0x10]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r10], xmm0
movups xmmword ptr [r10+0x10], xmm1
movups xmmword ptr [r10+0x20], xmm2
movups xmmword ptr [r10+0x30], xmm3
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+0x10]
movdqa xmm8, xmmword ptr [rsp+0x20]
movdqa xmm9, xmmword ptr [rsp+0x30]
movdqa xmm11, xmmword ptr [rsp+0x40]
movdqa xmm14, xmmword ptr [rsp+0x50]
movdqa xmm15, xmmword ptr [rsp+0x60]
add rsp, 120
ret
.section .rodata
.p2align 6
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
ROT16:
.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
.long 0, 1, 2, 3
ADD1:
.long 4, 4, 4, 4
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
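For readers tracing the assembly above: each paddd/pxor/pshufb run followed by a pslld/psrld/por triple is one step of the BLAKE3 G mixing function, applied to four 32-bit state words per XMM register. ROT16 and ROT8 are done with byte shuffles, while the 12-bit and 7-bit rotations are built from shift pairs. A minimal portable Rust sketch of the scalar operation being vectorized (reference only, not part of this commit):

fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) {
    // One G step of BLAKE3, written with scalar u32 operations.
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx);
    state[d] = (state[d] ^ state[a]).rotate_right(16); // pshufb with ROT16
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(12); // pslld 20 / psrld 12 / por
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(my);
    state[d] = (state[d] ^ state[a]).rotate_right(8);  // pshufb with ROT8
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(7);  // pslld 25 / psrld 7 / por
}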

c/blake3_sse2_x86-64_windows_msvc.asm Normal file

@ -0,0 +1,2089 @@
public _blake3_hash_many_sse2
public blake3_hash_many_sse2
public blake3_compress_in_place_sse2
public _blake3_compress_in_place_sse2
public blake3_compress_xof_sse2
public _blake3_compress_xof_sse2
_TEXT SEGMENT ALIGN(16) 'CODE'
ALIGN 16
blake3_hash_many_sse2 PROC
_blake3_hash_many_sse2 PROC
push r15
push r14
push r13
push r12
push rsi
push rdi
push rbx
push rbp
mov rbp, rsp
sub rsp, 528
and rsp, 0FFFFFFFFFFFFFFC0H
movdqa xmmword ptr [rsp+170H], xmm6
movdqa xmmword ptr [rsp+180H], xmm7
movdqa xmmword ptr [rsp+190H], xmm8
movdqa xmmword ptr [rsp+1A0H], xmm9
movdqa xmmword ptr [rsp+1B0H], xmm10
movdqa xmmword ptr [rsp+1C0H], xmm11
movdqa xmmword ptr [rsp+1D0H], xmm12
movdqa xmmword ptr [rsp+1E0H], xmm13
movdqa xmmword ptr [rsp+1F0H], xmm14
movdqa xmmword ptr [rsp+200H], xmm15
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
mov r8, qword ptr [rbp+68H]
movzx r9, byte ptr [rbp+70H]
neg r9d
movd xmm0, r9d
pshufd xmm0, xmm0, 00H
movdqa xmmword ptr [rsp+130H], xmm0
movdqa xmm1, xmm0
pand xmm1, xmmword ptr [ADD0]
pand xmm0, xmmword ptr [ADD1]
movdqa xmmword ptr [rsp+150H], xmm0
movd xmm0, r8d
pshufd xmm0, xmm0, 00H
paddd xmm0, xmm1
movdqa xmmword ptr [rsp+110H], xmm0
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
pcmpgtd xmm1, xmm0
shr r8, 32
movd xmm2, r8d
pshufd xmm2, xmm2, 00H
psubd xmm2, xmm1
movdqa xmmword ptr [rsp+120H], xmm2
mov rbx, qword ptr [rbp+90H]
mov r15, rdx
shl r15, 6
movzx r13d, byte ptr [rbp+78H]
movzx r12d, byte ptr [rbp+88H]
cmp rsi, 4
jc final3blocks
outerloop4:
movdqu xmm3, xmmword ptr [rcx]
pshufd xmm0, xmm3, 00H
pshufd xmm1, xmm3, 55H
pshufd xmm2, xmm3, 0AAH
pshufd xmm3, xmm3, 0FFH
movdqu xmm7, xmmword ptr [rcx+10H]
pshufd xmm4, xmm7, 00H
pshufd xmm5, xmm7, 55H
pshufd xmm6, xmm7, 0AAH
pshufd xmm7, xmm7, 0FFH
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
mov r10, qword ptr [rdi+10H]
mov r11, qword ptr [rdi+18H]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop4:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movdqu xmm8, xmmword ptr [r8+rdx-40H]
movdqu xmm9, xmmword ptr [r9+rdx-40H]
movdqu xmm10, xmmword ptr [r10+rdx-40H]
movdqu xmm11, xmmword ptr [r11+rdx-40H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp], xmm8
movdqa xmmword ptr [rsp+10H], xmm9
movdqa xmmword ptr [rsp+20H], xmm12
movdqa xmmword ptr [rsp+30H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-30H]
movdqu xmm9, xmmword ptr [r9+rdx-30H]
movdqu xmm10, xmmword ptr [r10+rdx-30H]
movdqu xmm11, xmmword ptr [r11+rdx-30H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+40H], xmm8
movdqa xmmword ptr [rsp+50H], xmm9
movdqa xmmword ptr [rsp+60H], xmm12
movdqa xmmword ptr [rsp+70H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-20H]
movdqu xmm9, xmmword ptr [r9+rdx-20H]
movdqu xmm10, xmmword ptr [r10+rdx-20H]
movdqu xmm11, xmmword ptr [r11+rdx-20H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+80H], xmm8
movdqa xmmword ptr [rsp+90H], xmm9
movdqa xmmword ptr [rsp+0A0H], xmm12
movdqa xmmword ptr [rsp+0B0H], xmm13
movdqu xmm8, xmmword ptr [r8+rdx-10H]
movdqu xmm9, xmmword ptr [r9+rdx-10H]
movdqu xmm10, xmmword ptr [r10+rdx-10H]
movdqu xmm11, xmmword ptr [r11+rdx-10H]
movdqa xmm12, xmm8
punpckldq xmm8, xmm9
punpckhdq xmm12, xmm9
movdqa xmm14, xmm10
punpckldq xmm10, xmm11
punpckhdq xmm14, xmm11
movdqa xmm9, xmm8
punpcklqdq xmm8, xmm10
punpckhqdq xmm9, xmm10
movdqa xmm13, xmm12
punpcklqdq xmm12, xmm14
punpckhqdq xmm13, xmm14
movdqa xmmword ptr [rsp+0C0H], xmm8
movdqa xmmword ptr [rsp+0D0H], xmm9
movdqa xmmword ptr [rsp+0E0H], xmm12
movdqa xmmword ptr [rsp+0F0H], xmm13
movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
movdqa xmm12, xmmword ptr [rsp+110H]
movdqa xmm13, xmmword ptr [rsp+120H]
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
movd xmm15, eax
pshufd xmm15, xmm15, 00H
prefetcht0 byte ptr [r8+rdx+80H]
prefetcht0 byte ptr [r9+rdx+80H]
prefetcht0 byte ptr [r10+rdx+80H]
prefetcht0 byte ptr [r11+rdx+80H]
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+40H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+10H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+50H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+80H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+0C0H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+90H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+0D0H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+20H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+70H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+60H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+10H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+90H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0B0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+0E0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+30H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+0D0H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+40H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+20H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+60H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+0B0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+50H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+0F0H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0A0H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+0E0H]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+70H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+30H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+40H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+50H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+80H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0C0H]
paddd xmm1, xmmword ptr [rsp+90H]
paddd xmm2, xmmword ptr [rsp+0F0H]
paddd xmm3, xmmword ptr [rsp+0E0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0D0H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+0A0H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+70H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+20H]
paddd xmm1, xmmword ptr [rsp+30H]
paddd xmm2, xmmword ptr [rsp+10H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+90H]
paddd xmm1, xmmword ptr [rsp+0B0H]
paddd xmm2, xmmword ptr [rsp+80H]
paddd xmm3, xmmword ptr [rsp+0F0H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0E0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+0C0H]
paddd xmm3, xmmword ptr [rsp+10H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0D0H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+20H]
paddd xmm3, xmmword ptr [rsp+40H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+30H]
paddd xmm1, xmmword ptr [rsp+0A0H]
paddd xmm2, xmmword ptr [rsp+60H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0B0H]
paddd xmm1, xmmword ptr [rsp+50H]
paddd xmm2, xmmword ptr [rsp+10H]
paddd xmm3, xmmword ptr [rsp+80H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0F0H]
paddd xmm1, xmmword ptr [rsp]
paddd xmm2, xmmword ptr [rsp+90H]
paddd xmm3, xmmword ptr [rsp+60H]
paddd xmm0, xmm4
paddd xmm1, xmm5
paddd xmm2, xmm6
paddd xmm3, xmm7
pxor xmm12, xmm0
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
pshufb xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
paddd xmm10, xmm14
paddd xmm11, xmm15
pxor xmm4, xmm8
pxor xmm5, xmm9
pxor xmm6, xmm10
pxor xmm7, xmm11
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
paddd xmm0, xmmword ptr [rsp+0E0H]
paddd xmm1, xmmword ptr [rsp+20H]
paddd xmm2, xmmword ptr [rsp+30H]
paddd xmm3, xmmword ptr [rsp+70H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT16]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
movdqa xmmword ptr [rsp+100H], xmm8
movdqa xmm8, xmm5
psrld xmm8, 12
pslld xmm5, 20
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 12
pslld xmm6, 20
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 12
pslld xmm7, 20
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 12
pslld xmm4, 20
por xmm4, xmm8
paddd xmm0, xmmword ptr [rsp+0A0H]
paddd xmm1, xmmword ptr [rsp+0C0H]
paddd xmm2, xmmword ptr [rsp+40H]
paddd xmm3, xmmword ptr [rsp+0D0H]
paddd xmm0, xmm5
paddd xmm1, xmm6
paddd xmm2, xmm7
paddd xmm3, xmm4
pxor xmm15, xmm0
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
movdqa xmm8, xmmword ptr [ROT8]
pshufb xmm15, xmm8
pshufb xmm12, xmm8
pshufb xmm13, xmm8
pshufb xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm13
paddd xmm9, xmm14
pxor xmm5, xmm10
pxor xmm6, xmm11
pxor xmm7, xmm8
pxor xmm4, xmm9
pxor xmm0, xmm8
pxor xmm1, xmm9
pxor xmm2, xmm10
pxor xmm3, xmm11
movdqa xmm8, xmm5
psrld xmm8, 7
pslld xmm5, 25
por xmm5, xmm8
movdqa xmm8, xmm6
psrld xmm8, 7
pslld xmm6, 25
por xmm6, xmm8
movdqa xmm8, xmm7
psrld xmm8, 7
pslld xmm7, 25
por xmm7, xmm8
movdqa xmm8, xmm4
psrld xmm8, 7
pslld xmm4, 25
por xmm4, xmm8
pxor xmm4, xmm12
pxor xmm5, xmm13
pxor xmm6, xmm14
pxor xmm7, xmm15
mov eax, r13d
jne innerloop4
movdqa xmm9, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm9, xmm1
movdqa xmm11, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm11, xmm3
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2
punpckhqdq xmm1, xmm2
movdqa xmm3, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm3, xmm11
movdqu xmmword ptr [rbx], xmm0
movdqu xmmword ptr [rbx+20H], xmm1
movdqu xmmword ptr [rbx+40H], xmm9
movdqu xmmword ptr [rbx+60H], xmm3
movdqa xmm9, xmm4
punpckldq xmm4, xmm5
punpckhdq xmm9, xmm5
movdqa xmm11, xmm6
punpckldq xmm6, xmm7
punpckhdq xmm11, xmm7
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm7, xmm9
punpcklqdq xmm9, xmm11
punpckhqdq xmm7, xmm11
movdqu xmmword ptr [rbx+10H], xmm4
movdqu xmmword ptr [rbx+30H], xmm5
movdqu xmmword ptr [rbx+50H], xmm9
movdqu xmmword ptr [rbx+70H], xmm7
movdqa xmm1, xmmword ptr [rsp+110H]
movdqa xmm0, xmm1
paddd xmm1, xmmword ptr [rsp+150H]
movdqa xmmword ptr [rsp+110H], xmm1
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
pcmpgtd xmm0, xmm1
movdqa xmm1, xmmword ptr [rsp+120H]
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+120H], xmm1
add rbx, 128
add rdi, 32
sub rsi, 4
cmp rsi, 4
jnc outerloop4
test rsi, rsi
jne final3blocks
unwind:
movdqa xmm6, xmmword ptr [rsp+170H]
movdqa xmm7, xmmword ptr [rsp+180H]
movdqa xmm8, xmmword ptr [rsp+190H]
movdqa xmm9, xmmword ptr [rsp+1A0H]
movdqa xmm10, xmmword ptr [rsp+1B0H]
movdqa xmm11, xmmword ptr [rsp+1C0H]
movdqa xmm12, xmmword ptr [rsp+1D0H]
movdqa xmm13, xmmword ptr [rsp+1E0H]
movdqa xmm14, xmmword ptr [rsp+1F0H]
movdqa xmm15, xmmword ptr [rsp+200H]
mov rsp, rbp
pop rbp
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
ret
ALIGN 16
final3blocks:
test esi, 2H
je final1block
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+110H]
pinsrd xmm13, dword ptr [rsp+120H], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+114H]
pinsrd xmm14, dword ptr [rsp+124H], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
movaps xmmword ptr [rsp+10H], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop2:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
movaps xmm10, xmm2
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm3, xmm4
shufps xmm4, xmm5, 136
shufps xmm3, xmm5, 221
movaps xmm5, xmm3
movups xmm6, xmmword ptr [r8+rdx-20H]
movups xmm7, xmmword ptr [r8+rdx-10H]
movaps xmm3, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm3, xmm7, 221
pshufd xmm7, xmm3, 93H
movups xmm12, xmmword ptr [r9+rdx-40H]
movups xmm13, xmmword ptr [r9+rdx-30H]
movaps xmm11, xmm12
shufps xmm12, xmm13, 136
shufps xmm11, xmm13, 221
movaps xmm13, xmm11
movups xmm14, xmmword ptr [r9+rdx-20H]
movups xmm15, xmmword ptr [r9+rdx-10H]
movaps xmm11, xmm14
shufps xmm14, xmm15, 136
pshufd xmm14, xmm14, 93H
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 93H
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+10H]
pinsrd xmm3, eax, 3
pinsrd xmm11, eax, 3
mov al, 7
roundloop2:
paddd xmm0, xmm4
paddd xmm8, xmm12
movaps xmmword ptr [rsp+20H], xmm4
movaps xmmword ptr [rsp+30H], xmm12
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm12, xmmword ptr [ROT16]
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm5
paddd xmm8, xmm13
movaps xmmword ptr [rsp+40H], xmm5
movaps xmmword ptr [rsp+50H], xmm13
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
movaps xmm13, xmmword ptr [ROT8]
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 93H
pshufd xmm8, xmm8, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm11, xmm11, 4EH
pshufd xmm2, xmm2, 39H
pshufd xmm10, xmm10, 39H
paddd xmm0, xmm6
paddd xmm8, xmm14
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm12
pshufb xmm11, xmm12
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 20
psrld xmm4, 12
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 20
psrld xmm4, 12
por xmm9, xmm4
paddd xmm0, xmm7
paddd xmm8, xmm15
paddd xmm0, xmm1
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
pshufb xmm3, xmm13
pshufb xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
pxor xmm9, xmm10
movdqa xmm4, xmm1
pslld xmm1, 25
psrld xmm4, 7
por xmm1, xmm4
movdqa xmm4, xmm9
pslld xmm9, 25
psrld xmm4, 7
por xmm9, xmm4
pshufd xmm0, xmm0, 39H
pshufd xmm8, xmm8, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm11, xmm11, 4EH
pshufd xmm2, xmm2, 93H
pshufd xmm10, xmm10, 93H
dec al
je endroundloop2
movdqa xmm12, xmmword ptr [rsp+20H]
movdqa xmm5, xmmword ptr [rsp+40H]
pshufd xmm13, xmm12, 0FH
shufps xmm12, xmm5, 214
pshufd xmm4, xmm12, 39H
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
pblendw xmm13, xmm12, 0CCH
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
pblendw xmm12, xmm6, 0C0H
pshufd xmm12, xmm12, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmmword ptr [rsp+20H], xmm13
movdqa xmmword ptr [rsp+40H], xmm12
movdqa xmm5, xmmword ptr [rsp+30H]
movdqa xmm13, xmmword ptr [rsp+50H]
pshufd xmm6, xmm5, 0FH
shufps xmm5, xmm13, 214
pshufd xmm12, xmm5, 39H
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
pblendw xmm6, xmm5, 0CCH
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
pblendw xmm5, xmm14, 0C0H
pshufd xmm5, xmm5, 78H
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
pshufd xmm15, xmm14, 1EH
movdqa xmm13, xmm6
movdqa xmm14, xmm5
movdqa xmm5, xmmword ptr [rsp+20H]
movdqa xmm6, xmmword ptr [rsp+40H]
jmp roundloop2
endroundloop2:
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm8, xmm10
pxor xmm9, xmm11
mov eax, r13d
cmp rdx, r15
jne innerloop2
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+10H], xmm1
movups xmmword ptr [rbx+20H], xmm8
movups xmmword ptr [rbx+30H], xmm9
movdqa xmm0, xmmword ptr [rsp+130H]
movdqa xmm1, xmmword ptr [rsp+110H]
movdqa xmm2, xmmword ptr [rsp+120H]
movdqu xmm3, xmmword ptr [rsp+118H]
movdqu xmm4, xmmword ptr [rsp+128H]
blendvps xmm1, xmm3, xmm0
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+110H], xmm1
movdqa xmmword ptr [rsp+120H], xmm2
add rdi, 16
add rbx, 64
sub rsi, 2
final1block:
test esi, 1H
je unwind
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movd xmm13, dword ptr [rsp+110H]
pinsrd xmm13, dword ptr [rsp+120H], 1
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
xor edx, edx
innerloop1:
mov r14d, eax
or eax, r12d
add rdx, 64
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
movaps xmm3, xmm13
pinsrd xmm3, eax, 3
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [r8+rdx-20H]
movups xmm7, xmmword ptr [r8+rdx-10H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
mov al, 7
roundloop1:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz endroundloop1
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0CCH
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0C0H
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp roundloop1
endroundloop1:
pxor xmm0, xmm2
pxor xmm1, xmm3
mov eax, r13d
cmp rdx, r15
jne innerloop1
movups xmmword ptr [rbx], xmm0
movups xmmword ptr [rbx+10H], xmm1
jmp unwind
_blake3_hash_many_sse2 ENDP
blake3_hash_many_sse2 ENDP
blake3_compress_in_place_sse2 PROC
_blake3_compress_in_place_sse2 PROC
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0CCH
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0C0H
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rcx], xmm0
movups xmmword ptr [rcx+10H], xmm1
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_in_place_sse2 ENDP
blake3_compress_in_place_sse2 ENDP
ALIGN 16
blake3_compress_xof_sse2 PROC
_blake3_compress_xof_sse2 PROC
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
mov r10, qword ptr [rsp+0A8H]
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0CCH
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0C0H
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
movdqu xmm4, xmmword ptr [rcx]
movdqu xmm5, xmmword ptr [rcx+10H]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r10], xmm0
movups xmmword ptr [r10+10H], xmm1
movups xmmword ptr [r10+20H], xmm2
movups xmmword ptr [r10+30H], xmm3
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_xof_sse2 ENDP
blake3_compress_xof_sse2 ENDP
_TEXT ENDS
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN 64
BLAKE3_IV:
dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
ADD0:
dd 0, 1, 2, 3
ADD1:
dd 4 dup (4)
BLAKE3_IV_0:
dd 4 dup (6A09E667H)
BLAKE3_IV_1:
dd 4 dup (0BB67AE85H)
BLAKE3_IV_2:
dd 4 dup (3C6EF372H)
BLAKE3_IV_3:
dd 4 dup (0A54FF53AH)
BLAKE3_BLOCK_LEN:
dd 4 dup (64)
ROT16:
db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
CMP_MSB_MASK:
dd 8 dup(80000000H)
_RDATA ENDS
END

src/ffi_sse2.rs Normal file

@ -0,0 +1,114 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
}
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let mut out = [0u8; 64];
ffi::blake3_compress_xof_sse2(
cv.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
out.as_mut_ptr(),
);
out
}
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_sse2(
inputs.as_ptr() as *const *const u8,
inputs.len(),
A::CAPACITY / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
pub mod ffi {
extern "C" {
pub fn blake3_compress_in_place_sse2(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse2(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_compress() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}
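
The test module above illustrates the guard discipline these bindings rely on: every call into the unsafe SSE2 functions is preceded by a runtime sse2_detected() check. A minimal sketch of the same pattern around the single-block entry point, using the crate-internal names shown above (illustrative only, not part of the diff):

fn compress_in_place_checked(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8) {
    if crate::platform::sse2_detected() {
        // Sound only because the detection check above confirmed CPU support.
        unsafe { compress_in_place(cv, block, block_len, counter, flags) }
    } else {
        // Otherwise fall back to the portable implementation.
        crate::portable::compress_in_place(cv, block, block_len, counter, flags)
    }
}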

src/lib.rs

@ -94,6 +94,12 @@ mod sse41;
#[cfg(blake3_sse41_ffi)]
#[path = "ffi_sse41.rs"]
mod sse41;
#[cfg(blake3_sse2_rust)]
#[path = "rust_sse2.rs"]
mod sse2;
#[cfg(blake3_sse2_ffi)]
#[path = "ffi_sse2.rs"]
mod sse2;
pub mod traits;

src/platform.rs

@ -41,6 +41,8 @@ cfg_if::cfg_if! {
pub enum Platform {
Portable,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE2,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2,
@ -68,6 +70,9 @@ impl Platform {
if sse41_detected() {
return Platform::SSE41;
}
if sse2_detected() {
return Platform::SSE2;
}
}
// We don't use dynamic feature detection for NEON. If the "neon"
// feature is on, NEON is assumed to be supported.
@ -82,6 +87,8 @@ impl Platform {
let degree = match self {
Platform::Portable => 1,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8,
@ -107,6 +114,11 @@ impl Platform {
Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
},
@ -134,6 +146,11 @@ impl Platform {
Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_xof(cv, block, block_len, counter, flags)
},
@ -183,6 +200,20 @@ impl Platform {
),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => unsafe {
crate::sse41::hash_many(
inputs,
@ -247,6 +278,15 @@ impl Platform {
Self::Portable
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse2() -> Option<Self> {
if sse2_detected() {
Some(Self::SSE2)
} else {
None
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41() -> Option<Self> {
if sse41_detected() {
@ -351,6 +391,28 @@ pub fn sse41_detected() -> bool {
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
pub fn sse2_detected() -> bool {
// A testing-only short-circuit.
if cfg!(feature = "no_sse2") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "sse2")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("sse2") {
return true;
}
}
false
}
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
let mut out = [0; 8];
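
With the detection and dispatch above in place, Platform::detect() still prefers the widest available instruction set (AVX-512, then AVX2, then SSE4.1) and only falls back to SSE2 before the portable code. Benches and tests that want to exercise the SSE2 path directly can use the new constructor; a small illustrative sketch:

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn sse2_or_portable() -> Platform {
    // Platform::sse2() returns None when SSE2 is unavailable or the no_sse2 feature is enabled.
    Platform::sse2().unwrap_or(Platform::portable())
}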

src/rust_sse2.rs Normal file

@ -0,0 +1,766 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::{
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
pub const DEGREE: usize = 4;
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
// This is an unaligned load, so the pointer cast is allowed.
_mm_loadu_si128(src as *const __m128i)
}
#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
// This is an unaligned store, so the pointer cast is allowed.
_mm_storeu_si128(dest as *mut __m128i, src)
}
#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
_mm_add_epi32(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
_mm_xor_si128(a, b)
}
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
_mm_set1_epi32(x as i32)
}
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
_mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}
// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}
#[inline(always)]
unsafe fn g1(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot16(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot12(*row1);
}
#[inline(always)]
unsafe fn g2(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot8(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot7(*row1);
}
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
macro_rules! shuffle2 {
($a:expr, $b:expr, $c:expr) => {
_mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps($a),
_mm_castsi128_ps($b),
$c,
))
};
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
#[inline(always)]
unsafe fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [__m128i; 4] {
let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row3 = &mut set4(
counter_low(counter),
counter_high(counter),
block_len as u32,
flags as u32,
);
let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
let mut t0;
let mut t1;
let mut t2;
let mut t3;
let mut tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
g1(row0, row1, row2, row3, t2);
t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
[*row0, *row1, *row2, *row3]
}
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre(cv, block, block_len, counter, flags);
row0 = xor(row0, row2);
row1 = xor(row1, row3);
row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
core::mem::transmute([row0, row1, row2, row3])
}
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
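// Transposed layout: v[i] holds state word i for all DEGREE inputs, so each
// scalar G step below mixes four hashes in parallel.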
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
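// Viewed as a 4x4 matrix of u32 words, one row per vector, the unpack
// sequence below is a full transpose.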
let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
let mut vecs = [
loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
];
for i in 0..DEGREE {
_mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
}
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
transpose_vecs(squares.2);
transpose_vecs(squares.3);
vecs
}
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
let mask = if increment_counter.yes() { !0 } else { 0 };
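// `mask` is all ones when incrementing, so lane i receives counter + i;
// otherwise every lane carries the same counter value.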
(
set4(
counter_low(counter + (mask & 0)),
counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)),
counter_low(counter + (mask & 3)),
),
set4(
counter_high(counter + (mask & 0)),
counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)),
counter_high(counter + (mask & 3)),
),
)
}
#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
let mut h_vecs = [
set1(key[0]),
set1(key[1]),
set1(key[2]),
set1(key[3]),
set1(key[4]),
set1(key[5]),
set1(key[6]),
set1(key[7]),
];
let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
if block + 1 == blocks {
block_flags |= flags_end;
}
let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
let block_flags_vec = set1(block_flags as u32);
let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
// The transposed compression function. Note that inlining this
// manually here improves compile times by a lot, compared to factoring
// it out into its own function and making it #[inline(always)]. Just
// guessing, it might have something to do with loop unrolling.
let mut v = [
h_vecs[0],
h_vecs[1],
h_vecs[2],
h_vecs[3],
h_vecs[4],
h_vecs[5],
h_vecs[6],
h_vecs[7],
set1(IV[0]),
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
counter_low_vec,
counter_high_vec,
block_len_vec,
block_flags_vec,
];
round(&mut v, &msg_vecs, 0);
round(&mut v, &msg_vecs, 1);
round(&mut v, &msg_vecs, 2);
round(&mut v, &msg_vecs, 3);
round(&mut v, &msg_vecs, 4);
round(&mut v, &msg_vecs, 5);
round(&mut v, &msg_vecs, 6);
h_vecs[0] = xor(v[0], v[8]);
h_vecs[1] = xor(v[1], v[9]);
h_vecs[2] = xor(v[2], v[10]);
h_vecs[3] = xor(v[3], v[11]);
h_vecs[4] = xor(v[4], v[12]);
h_vecs[5] = xor(v[5], v[13]);
h_vecs[6] = xor(v[6], v[14]);
h_vecs[7] = xor(v[7], v[15]);
block_flags = flags;
}
let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
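// Interleaving the stores below writes each of the DEGREE outputs as one
// contiguous 32-byte chunk of `out`.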
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
#[target_feature(enable = "sse2")]
unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
key: &CVWords,
counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut CVBytes,
) {
debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
let mut block_flags = flags | flags_start;
let mut slice = input.as_slice();
while slice.len() >= BLOCK_LEN {
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
compress_in_place(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
counter,
block_flags,
);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
*out = core::mem::transmute(cv); // x86 is little-endian
}
#[target_feature(enable = "sse2")]
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
mut inputs: &[&A],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
mut out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
// Safe because the layout of arrays is guaranteed, and because the
// `blocks` count is determined statically from the argument type.
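// (Each `&A` is a thin pointer with the same size and layout as `*const u8`
// here, since `A` is sized, so reading DEGREE raw pointers out of the slice
// is sound.)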
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
let blocks = A::CAPACITY / BLOCK_LEN;
hash4(
input_ptrs,
blocks,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
if increment_counter.yes() {
counter += DEGREE as u64;
}
inputs = &inputs[DEGREE..];
out = &mut out[DEGREE * OUT_LEN..];
}
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
if increment_counter.yes() {
counter += 1;
}
}
}
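// Usage sketch (hypothetical caller, not part of this file): the platform
// dispatch layer would hand this a slice of full chunks and a per-chunk
// counter, e.g. `hash_many(&chunks, key, counter, IncrementCounter::Yes,
// flags, CHUNK_START, CHUNK_END, &mut out)`, with `out` providing OUT_LEN
// bytes per input.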
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_transpose() {
if !crate::platform::sse2_detected() {
return;
}
#[target_feature(enable = "sse2")]
unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
transpose_vecs(vecs);
}
let mut matrix = [[0 as u32; DEGREE]; DEGREE];
for i in 0..DEGREE {
for j in 0..DEGREE {
matrix[i][j] = (i * DEGREE + j) as u32;
}
}
unsafe {
let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
transpose_wrapper(&mut vecs);
matrix = core::mem::transmute(vecs);
}
for i in 0..DEGREE {
for j in 0..DEGREE {
// Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
}
}
}
#[test]
fn test_compress() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

View File

@ -1,6 +1,7 @@
fn main() {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
dbg!(is_x86_feature_detected!("sse2"));
dbg!(is_x86_feature_detected!("sse4.1"));
dbg!(is_x86_feature_detected!("avx2"));
dbg!(is_x86_feature_detected!("avx512f"));