mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-04-19 20:44:11 +02:00
Start SSE2 implementation based on SSE4.1 version
Wire up basic functions and features for SSE2 support using the SSE4.1 version as a basis without implementing the SSE2 instructions yet. * Cargo.toml: add no_sse2 feature * benches/bench.rs: wire SSE2 benchmarks * build.rs: add SSE2 rust intrinsics and assembly builds * c/Makefile.testing: add SSE2 C and assembly targets * c/README.md: add SSE2 to C build instructions * c/blake3_c_rust_bindings/build.rs: add SSE2 C rust binding builds * c/blake3_c_rust_bindings/src/lib.rs: add SSE2 C rust bindings * c/blake3_dispatch.c: add SSE2 C dispatch * c/blake3_impl.h: add SSE2 C function prototypes * c/blake3_sse2.c: add SSE2 C intrinsic file starting with SSE4.1 version * c/blake3_sse2_x86-64_{unix.S,windows_gnu.S,windows_msvc.asm}: add SSE2 assembly files starting with SSE4.1 version * src/ffi_sse2.rs: add rust implementation using SSE2 C rust bindings * src/lib.rs: add SSE2 rust intrinsics and SSE2 C rust binding rust SSE2 module configurations * src/platform.rs: add SSE2 rust platform detection and dispatch * src/rust_sse2.rs: add SSE2 rust intrinsic file starting with SSE4.1 version * tools/instruction_set_support/src/main.rs: add SSE2 feature detection
This commit is contained in:
parent
adbf07d67a
commit
d91f20dd29
|
@ -61,6 +61,7 @@ prefer_intrinsics = []
|
|||
# level turns out to be the right approach, then we can design a stable
|
||||
# feature. Until then, we reserve the right to break these features in a patch
|
||||
# release.
|
||||
no_sse2 = []
|
||||
no_sse41 = []
|
||||
no_avx2 = []
|
||||
no_avx512 = []
|
||||
|
|
|
@ -60,6 +60,14 @@ fn bench_single_compression_portable(b: &mut Bencher) {
|
|||
bench_single_compression_fn(b, Platform::portable());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_single_compression_sse2(b: &mut Bencher) {
|
||||
if let Some(platform) = Platform::sse2() {
|
||||
bench_single_compression_fn(b, platform);
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_single_compression_sse41(b: &mut Bencher) {
|
||||
|
@ -102,6 +110,14 @@ fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
|
|||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_many_chunks_sse2(b: &mut Bencher) {
|
||||
if let Some(platform) = Platform::sse2() {
|
||||
bench_many_chunks_fn(b, platform);
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_many_chunks_sse41(b: &mut Bencher) {
|
||||
|
@ -161,6 +177,14 @@ fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
|
|||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_many_parents_sse2(b: &mut Bencher) {
|
||||
if let Some(platform) = Platform::sse2() {
|
||||
bench_many_parents_fn(b, platform);
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn bench_many_parents_sse41(b: &mut Bencher) {
|
||||
|
|
20
build.rs
20
build.rs
|
@ -118,34 +118,40 @@ fn c_compiler_support() -> CCompilerSupport {
|
|||
}
|
||||
}
|
||||
|
||||
fn build_sse41_avx2_rust_intrinsics() {
|
||||
// No C code to compile here. Set the cfg flags that enable the Rust SSE4.1
|
||||
// and AVX2 intrinsics modules. The regular Cargo build will compile them.
|
||||
fn build_sse2_sse41_avx2_rust_intrinsics() {
|
||||
// No C code to compile here. Set the cfg flags that enable the Rust SSE2,
|
||||
// SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile
|
||||
// them.
|
||||
println!("cargo:rustc-cfg=blake3_sse2_rust");
|
||||
println!("cargo:rustc-cfg=blake3_sse41_rust");
|
||||
println!("cargo:rustc-cfg=blake3_avx2_rust");
|
||||
}
|
||||
|
||||
fn build_sse41_avx2_assembly() {
|
||||
fn build_sse2_sse41_avx2_assembly() {
|
||||
// Build the assembly implementations for SSE4.1 and AVX2. This is
|
||||
// preferred, but it only supports x86_64.
|
||||
assert!(is_x86_64());
|
||||
println!("cargo:rustc-cfg=blake3_sse2_ffi");
|
||||
println!("cargo:rustc-cfg=blake3_sse41_ffi");
|
||||
println!("cargo:rustc-cfg=blake3_avx2_ffi");
|
||||
let mut build = new_build();
|
||||
if is_windows_msvc() {
|
||||
build.file("c/blake3_sse2_x86-64_windows_msvc.asm");
|
||||
build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
|
||||
build.file("c/blake3_avx2_x86-64_windows_msvc.asm");
|
||||
} else if is_windows_gnu() {
|
||||
build.file("c/blake3_sse2_x86-64_windows_gnu.S");
|
||||
build.file("c/blake3_sse41_x86-64_windows_gnu.S");
|
||||
build.file("c/blake3_avx2_x86-64_windows_gnu.S");
|
||||
} else {
|
||||
// All non-Windows implementations are assumed to support
|
||||
// Linux-style assembly. These files do contain a small
|
||||
// explicit workaround for macOS also.
|
||||
build.file("c/blake3_sse2_x86-64_unix.S");
|
||||
build.file("c/blake3_sse41_x86-64_unix.S");
|
||||
build.file("c/blake3_avx2_x86-64_unix.S");
|
||||
}
|
||||
build.compile("blake3_sse41_avx2_assembly");
|
||||
build.compile("blake3_sse2_sse41_avx2_assembly");
|
||||
}
|
||||
|
||||
fn build_avx512_c_intrinsics() {
|
||||
|
@ -215,11 +221,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
if is_x86_64() || is_x86_32() {
|
||||
let support = c_compiler_support();
|
||||
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
|
||||
build_sse41_avx2_rust_intrinsics();
|
||||
build_sse2_sse41_avx2_rust_intrinsics();
|
||||
} else {
|
||||
// We assume that all C compilers can assemble SSE4.1 and AVX2. We
|
||||
// don't explicitly check for support.
|
||||
build_sse41_avx2_assembly();
|
||||
build_sse2_sse41_avx2_assembly();
|
||||
}
|
||||
|
||||
if is_pure() || support == NoCompiler || support == NoAVX512 {
|
||||
|
|
|
@ -9,6 +9,13 @@ TARGETS=
|
|||
ASM_TARGETS=
|
||||
EXTRAFLAGS=-Wa,--noexecstack
|
||||
|
||||
ifdef BLAKE3_NO_SSE2
|
||||
EXTRAFLAGS += -DBLAKE3_NO_SSE2
|
||||
else
|
||||
TARGETS += blake3_sse2.o
|
||||
ASM_TARGETS += blake3_sse2_x86-64_unix.S
|
||||
endif
|
||||
|
||||
ifdef BLAKE3_NO_SSE41
|
||||
EXTRAFLAGS += -DBLAKE3_NO_SSE41
|
||||
else
|
||||
|
@ -38,6 +45,9 @@ endif
|
|||
all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
|
||||
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
|
||||
|
||||
blake3_sse2.o: blake3_sse2.c
|
||||
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2
|
||||
|
||||
blake3_sse41.o: blake3_sse41.c
|
||||
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1
|
||||
|
||||
|
|
17
c/README.md
17
c/README.md
|
@ -40,7 +40,8 @@ with a Unix-like OS, you can compile a working binary like this:
|
|||
|
||||
```bash
|
||||
gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
|
||||
blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
|
||||
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
|
||||
blake3_avx512_x86-64_unix.S
|
||||
```
|
||||
|
||||
# API
|
||||
|
@ -144,8 +145,8 @@ by hand. Note that these steps may change in future versions.
|
|||
Dynamic dispatch is enabled by default on x86. The implementation will
|
||||
query the CPU at runtime to detect SIMD support, and it will use the
|
||||
widest instruction set available. By default, `blake3_dispatch.c`
|
||||
expects to be linked with code for four different instruction sets:
|
||||
portable C, SSE4.1, AVX2, and AVX-512.
|
||||
expects to be linked with code for five different instruction sets:
|
||||
portable C, SSE2, SSE4.1, AVX2, and AVX-512.
|
||||
|
||||
For each of the x86 SIMD instruction sets, two versions are available,
|
||||
one in assembly (with three flavors: Unix, Windows MSVC, and Windows
|
||||
|
@ -160,7 +161,8 @@ the assembly implementations:
|
|||
|
||||
```bash
|
||||
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
|
||||
blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
|
||||
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
|
||||
blake3_avx512_x86-64_unix.S
|
||||
```
|
||||
|
||||
When building the intrinsics-based implementations, you need to build
|
||||
|
@ -169,11 +171,12 @@ explicitly enabled in the compiler. Here's the same shared library using
|
|||
the intrinsics-based implementations:
|
||||
|
||||
```bash
|
||||
gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
|
||||
gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
|
||||
gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
|
||||
gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
|
||||
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
|
||||
blake3_avx2.o blake3_avx512.o blake3_sse41.o
|
||||
blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
|
||||
```
|
||||
|
||||
Note above that building `blake3_avx512.c` requires both `-mavx512f` and
|
||||
|
@ -187,8 +190,8 @@ each instruction set. Here's an example of building a shared library on
|
|||
x86 with only portable code:
|
||||
|
||||
```bash
|
||||
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \
|
||||
blake3.c blake3_dispatch.c blake3_portable.c
|
||||
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
|
||||
-DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
|
||||
```
|
||||
|
||||
## ARM NEON
|
||||
|
|
|
@ -60,12 +60,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
// "prefer_intrinsics" feature is enabled.
|
||||
if is_windows_msvc() {
|
||||
let mut build = new_build();
|
||||
build.file("../blake3_sse2_x86-64_windows_msvc.asm");
|
||||
build.file("../blake3_sse41_x86-64_windows_msvc.asm");
|
||||
build.file("../blake3_avx2_x86-64_windows_msvc.asm");
|
||||
build.file("../blake3_avx512_x86-64_windows_msvc.asm");
|
||||
build.compile("blake3_asm");
|
||||
} else if is_windows_gnu() {
|
||||
let mut build = new_build();
|
||||
build.file("../blake3_sse2_x86-64_windows_gnu.S");
|
||||
build.file("../blake3_sse41_x86-64_windows_gnu.S");
|
||||
build.file("../blake3_avx2_x86-64_windows_gnu.S");
|
||||
build.file("../blake3_avx512_x86-64_windows_gnu.S");
|
||||
|
@ -75,6 +77,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
// Linux-style assembly. These files do contain a small
|
||||
// explicit workaround for macOS also.
|
||||
let mut build = new_build();
|
||||
build.file("../blake3_sse2_x86-64_unix.S");
|
||||
build.file("../blake3_sse41_x86-64_unix.S");
|
||||
build.file("../blake3_avx2_x86-64_unix.S");
|
||||
build.file("../blake3_avx512_x86-64_unix.S");
|
||||
|
@ -87,6 +90,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
// compiled separately, with the corresponding instruction set
|
||||
// extension explicitly enabled in the compiler.
|
||||
|
||||
let mut sse2_build = new_build();
|
||||
sse2_build.file("../blake3_sse2.c");
|
||||
if is_windows_msvc() {
|
||||
// /arch:SSE2 is the default on x86 and undefined on x86_64:
|
||||
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
|
||||
// It also includes SSE4.1 intrisincs:
|
||||
// https://stackoverflow.com/a/32183222/823869
|
||||
} else {
|
||||
sse2_build.flag("-msse2");
|
||||
}
|
||||
sse2_build.compile("blake3_sse2");
|
||||
|
||||
let mut sse41_build = new_build();
|
||||
sse41_build.file("../blake3_sse41.c");
|
||||
if is_windows_msvc() {
|
||||
|
|
|
@ -15,6 +15,11 @@ pub const OUT_LEN: usize = 32;
|
|||
|
||||
// Feature detection functions for tests and benchmarks. Note that the C code
|
||||
// does its own feature detection in blake3_dispatch.c.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub fn sse2_detected() -> bool {
|
||||
is_x86_feature_detected!("sse2")
|
||||
}
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub fn sse41_detected() -> bool {
|
||||
is_x86_feature_detected!("sse4.1")
|
||||
|
@ -153,6 +158,35 @@ pub mod ffi {
|
|||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub mod x86 {
|
||||
extern "C" {
|
||||
// SSE2 low level functions
|
||||
pub fn blake3_compress_in_place_sse2(
|
||||
cv: *mut u32,
|
||||
block: *const u8,
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
);
|
||||
pub fn blake3_compress_xof_sse2(
|
||||
cv: *const u32,
|
||||
block: *const u8,
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
out: *mut u8,
|
||||
);
|
||||
pub fn blake3_hash_many_sse2(
|
||||
inputs: *const *const u8,
|
||||
num_inputs: usize,
|
||||
blocks: usize,
|
||||
key: *const u32,
|
||||
counter: u64,
|
||||
increment_counter: bool,
|
||||
flags: u8,
|
||||
flags_start: u8,
|
||||
flags_end: u8,
|
||||
out: *mut u8,
|
||||
);
|
||||
|
||||
// SSE4.1 low level functions
|
||||
pub fn blake3_compress_in_place_sse41(
|
||||
cv: *mut u32,
|
||||
|
|
|
@ -149,6 +149,12 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
#if !defined(BLAKE3_NO_SSE2)
|
||||
if (features & SSE2) {
|
||||
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
|
||||
}
|
||||
|
@ -171,6 +177,12 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
#if !defined(BLAKE3_NO_SSE2)
|
||||
if (features & SSE2) {
|
||||
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
|
||||
}
|
||||
|
@ -205,6 +217,14 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
#if !defined(BLAKE3_NO_SSE2)
|
||||
if (features & SSE2) {
|
||||
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
|
||||
increment_counter, flags, flags_start, flags_end,
|
||||
out);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(BLAKE3_USE_NEON)
|
||||
|
@ -237,6 +257,11 @@ size_t blake3_simd_degree(void) {
|
|||
return 4;
|
||||
}
|
||||
#endif
|
||||
#if !defined(BLAKE3_NO_SSE2)
|
||||
if (features & SSE2) {
|
||||
return 4;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#if defined(BLAKE3_USE_NEON)
|
||||
return 4;
|
||||
|
|
|
@ -182,6 +182,21 @@ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
|||
uint8_t flags_end, uint8_t *out);
|
||||
|
||||
#if defined(IS_X86)
|
||||
#if !defined(BLAKE3_NO_SSE2)
|
||||
void blake3_compress_in_place_sse2(uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags);
|
||||
void blake3_compress_xof_sse2(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags, uint8_t out[64]);
|
||||
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start,
|
||||
uint8_t flags_end, uint8_t *out);
|
||||
#endif
|
||||
#if !defined(BLAKE3_NO_SSE41)
|
||||
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
|
|
|
@ -0,0 +1,559 @@
|
|||
#include "blake3_impl.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define DEGREE 4
|
||||
|
||||
#define _mm_shuffle_ps2(a, b, c) \
|
||||
(_mm_castps_si128( \
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
|
||||
|
||||
INLINE __m128i loadu(const uint8_t src[16]) {
|
||||
return _mm_loadu_si128((const __m128i *)src);
|
||||
}
|
||||
|
||||
INLINE void storeu(__m128i src, uint8_t dest[16]) {
|
||||
_mm_storeu_si128((__m128i *)dest, src);
|
||||
}
|
||||
|
||||
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
|
||||
|
||||
// Note that clang-format doesn't like the name "xor" for some reason.
|
||||
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
|
||||
|
||||
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
|
||||
|
||||
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
|
||||
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
|
||||
}
|
||||
|
||||
INLINE __m128i rot16(__m128i x) {
|
||||
return _mm_shuffle_epi8(
|
||||
x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
|
||||
}
|
||||
|
||||
INLINE __m128i rot12(__m128i x) {
|
||||
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
|
||||
}
|
||||
|
||||
INLINE __m128i rot8(__m128i x) {
|
||||
return _mm_shuffle_epi8(
|
||||
x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
|
||||
}
|
||||
|
||||
INLINE __m128i rot7(__m128i x) {
|
||||
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
|
||||
}
|
||||
|
||||
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
|
||||
__m128i m) {
|
||||
*row0 = addv(addv(*row0, m), *row1);
|
||||
*row3 = xorv(*row3, *row0);
|
||||
*row3 = rot16(*row3);
|
||||
*row2 = addv(*row2, *row3);
|
||||
*row1 = xorv(*row1, *row2);
|
||||
*row1 = rot12(*row1);
|
||||
}
|
||||
|
||||
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
|
||||
__m128i m) {
|
||||
*row0 = addv(addv(*row0, m), *row1);
|
||||
*row3 = xorv(*row3, *row0);
|
||||
*row3 = rot8(*row3);
|
||||
*row2 = addv(*row2, *row3);
|
||||
*row1 = xorv(*row1, *row2);
|
||||
*row1 = rot7(*row1);
|
||||
}
|
||||
|
||||
// Note the optimization here of leaving row1 as the unrotated row, rather than
|
||||
// row0. All the message loads below are adjusted to compensate for this. See
|
||||
// discussion at https://github.com/sneves/blake2-avx2/pull/4
|
||||
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
||||
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
|
||||
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
}
|
||||
|
||||
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
||||
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
|
||||
}
|
||||
|
||||
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags) {
|
||||
rows[0] = loadu((uint8_t *)&cv[0]);
|
||||
rows[1] = loadu((uint8_t *)&cv[4]);
|
||||
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
|
||||
rows[3] = set4(counter_low(counter), counter_high(counter),
|
||||
(uint32_t)block_len, (uint32_t)flags);
|
||||
|
||||
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
|
||||
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
|
||||
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
|
||||
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
|
||||
|
||||
__m128i t0, t1, t2, t3, tt;
|
||||
|
||||
// Round 1. The first round permutes the message words from the original
|
||||
// input order, into the groups that get mixed in parallel.
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
|
||||
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
|
||||
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 2. This round and all following rounds apply a fixed permutation
|
||||
// to the message words from the round before.
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 3
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 4
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 5
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 6
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 7
|
||||
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
|
||||
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
|
||||
diagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
|
||||
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
|
||||
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
|
||||
undiagonalize(&rows[0], &rows[2], &rows[3]);
|
||||
}
|
||||
|
||||
void blake3_compress_in_place_sse2(uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags) {
|
||||
__m128i rows[4];
|
||||
compress_pre(rows, cv, block, block_len, counter, flags);
|
||||
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
|
||||
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
|
||||
}
|
||||
|
||||
void blake3_compress_xof_sse2(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter,
|
||||
uint8_t flags, uint8_t out[64]) {
|
||||
__m128i rows[4];
|
||||
compress_pre(rows, cv, block, block_len, counter, flags);
|
||||
storeu(xorv(rows[0], rows[2]), &out[0]);
|
||||
storeu(xorv(rows[1], rows[3]), &out[16]);
|
||||
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
|
||||
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
|
||||
}
|
||||
|
||||
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
|
||||
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
|
||||
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
|
||||
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
|
||||
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
|
||||
v[0] = addv(v[0], v[4]);
|
||||
v[1] = addv(v[1], v[5]);
|
||||
v[2] = addv(v[2], v[6]);
|
||||
v[3] = addv(v[3], v[7]);
|
||||
v[12] = xorv(v[12], v[0]);
|
||||
v[13] = xorv(v[13], v[1]);
|
||||
v[14] = xorv(v[14], v[2]);
|
||||
v[15] = xorv(v[15], v[3]);
|
||||
v[12] = rot16(v[12]);
|
||||
v[13] = rot16(v[13]);
|
||||
v[14] = rot16(v[14]);
|
||||
v[15] = rot16(v[15]);
|
||||
v[8] = addv(v[8], v[12]);
|
||||
v[9] = addv(v[9], v[13]);
|
||||
v[10] = addv(v[10], v[14]);
|
||||
v[11] = addv(v[11], v[15]);
|
||||
v[4] = xorv(v[4], v[8]);
|
||||
v[5] = xorv(v[5], v[9]);
|
||||
v[6] = xorv(v[6], v[10]);
|
||||
v[7] = xorv(v[7], v[11]);
|
||||
v[4] = rot12(v[4]);
|
||||
v[5] = rot12(v[5]);
|
||||
v[6] = rot12(v[6]);
|
||||
v[7] = rot12(v[7]);
|
||||
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
|
||||
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
|
||||
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
|
||||
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
|
||||
v[0] = addv(v[0], v[4]);
|
||||
v[1] = addv(v[1], v[5]);
|
||||
v[2] = addv(v[2], v[6]);
|
||||
v[3] = addv(v[3], v[7]);
|
||||
v[12] = xorv(v[12], v[0]);
|
||||
v[13] = xorv(v[13], v[1]);
|
||||
v[14] = xorv(v[14], v[2]);
|
||||
v[15] = xorv(v[15], v[3]);
|
||||
v[12] = rot8(v[12]);
|
||||
v[13] = rot8(v[13]);
|
||||
v[14] = rot8(v[14]);
|
||||
v[15] = rot8(v[15]);
|
||||
v[8] = addv(v[8], v[12]);
|
||||
v[9] = addv(v[9], v[13]);
|
||||
v[10] = addv(v[10], v[14]);
|
||||
v[11] = addv(v[11], v[15]);
|
||||
v[4] = xorv(v[4], v[8]);
|
||||
v[5] = xorv(v[5], v[9]);
|
||||
v[6] = xorv(v[6], v[10]);
|
||||
v[7] = xorv(v[7], v[11]);
|
||||
v[4] = rot7(v[4]);
|
||||
v[5] = rot7(v[5]);
|
||||
v[6] = rot7(v[6]);
|
||||
v[7] = rot7(v[7]);
|
||||
|
||||
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
|
||||
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
|
||||
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
|
||||
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
|
||||
v[0] = addv(v[0], v[5]);
|
||||
v[1] = addv(v[1], v[6]);
|
||||
v[2] = addv(v[2], v[7]);
|
||||
v[3] = addv(v[3], v[4]);
|
||||
v[15] = xorv(v[15], v[0]);
|
||||
v[12] = xorv(v[12], v[1]);
|
||||
v[13] = xorv(v[13], v[2]);
|
||||
v[14] = xorv(v[14], v[3]);
|
||||
v[15] = rot16(v[15]);
|
||||
v[12] = rot16(v[12]);
|
||||
v[13] = rot16(v[13]);
|
||||
v[14] = rot16(v[14]);
|
||||
v[10] = addv(v[10], v[15]);
|
||||
v[11] = addv(v[11], v[12]);
|
||||
v[8] = addv(v[8], v[13]);
|
||||
v[9] = addv(v[9], v[14]);
|
||||
v[5] = xorv(v[5], v[10]);
|
||||
v[6] = xorv(v[6], v[11]);
|
||||
v[7] = xorv(v[7], v[8]);
|
||||
v[4] = xorv(v[4], v[9]);
|
||||
v[5] = rot12(v[5]);
|
||||
v[6] = rot12(v[6]);
|
||||
v[7] = rot12(v[7]);
|
||||
v[4] = rot12(v[4]);
|
||||
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
|
||||
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
|
||||
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
|
||||
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
|
||||
v[0] = addv(v[0], v[5]);
|
||||
v[1] = addv(v[1], v[6]);
|
||||
v[2] = addv(v[2], v[7]);
|
||||
v[3] = addv(v[3], v[4]);
|
||||
v[15] = xorv(v[15], v[0]);
|
||||
v[12] = xorv(v[12], v[1]);
|
||||
v[13] = xorv(v[13], v[2]);
|
||||
v[14] = xorv(v[14], v[3]);
|
||||
v[15] = rot8(v[15]);
|
||||
v[12] = rot8(v[12]);
|
||||
v[13] = rot8(v[13]);
|
||||
v[14] = rot8(v[14]);
|
||||
v[10] = addv(v[10], v[15]);
|
||||
v[11] = addv(v[11], v[12]);
|
||||
v[8] = addv(v[8], v[13]);
|
||||
v[9] = addv(v[9], v[14]);
|
||||
v[5] = xorv(v[5], v[10]);
|
||||
v[6] = xorv(v[6], v[11]);
|
||||
v[7] = xorv(v[7], v[8]);
|
||||
v[4] = xorv(v[4], v[9]);
|
||||
v[5] = rot7(v[5]);
|
||||
v[6] = rot7(v[6]);
|
||||
v[7] = rot7(v[7]);
|
||||
v[4] = rot7(v[4]);
|
||||
}
|
||||
|
||||
// 4x4 transpose of 32-bit words across four vectors, in place. After this,
// word i of input vector j lives in word j of output vector i.
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}
|
||||
|
||||
// Load one 64-byte message block from each of the four inputs at
// `block_offset` and transpose them so that out[w] holds message word w of
// all four inputs (the layout the vectorized compression rounds consume).
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  // Gather four vectors per input: out[4k+j] = vector k of input j.
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  // Prefetch 256 bytes ahead in each input to hide load latency for the
  // upcoming blocks.
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  // Transpose each group of four vectors so every output vector holds the
  // same word index from all four inputs.
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}
|
||||
|
||||
// Split the 64-bit block counter into per-lane low/high 32-bit vectors.
// When increment_counter is set, lane i gets counter + i; otherwise every
// lane gets the same counter value.
INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  // mask is all-ones when incrementing, all-zeros otherwise, so add1 is
  // either {0,1,2,3} or {0,0,0,0} — no branch needed.
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
  // Detect unsigned overflow of the low 32 bits (l < add1) using a signed
  // compare: XOR-ing both sides with 0x80000000 biases them so that signed
  // pcmpgtd implements an unsigned comparison (SSE2 has no unsigned compare).
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
  // carry lanes are -1 (all ones) where overflow occurred, so subtracting
  // them adds 1 to the high word in exactly those lanes.
  __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
  *out_lo = l;
  *out_hi = h;
}
|
||||
|
||||
// Hash four inputs in parallel, each `blocks` 64-byte blocks long, writing
// the four 32-byte chaining values contiguously to `out`. State is kept
// transposed: h_vecs[w] holds word w of all four chaining values.
// flags_start is OR-ed into the first block's flags, flags_end into the
// last block's; when increment_counter is set, input i uses counter + i.
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  // Broadcast each key word across all four lanes.
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    // Initialize the 16-word compression state: chaining value, IV
    // constants, counter halves, block length, and flags.
    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    // Seven rounds of the compression function.
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    // Feed-forward: the new chaining value is the XOR of the two state
    // halves (the truncated compression output).
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    // Only the first block carries flags_start.
    block_flags = flags;
  }

  // Un-transpose the chaining values back into per-input order.
  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
|
||||
|
||||
INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
|
||||
const uint32_t key[8], uint64_t counter,
|
||||
uint8_t flags, uint8_t flags_start,
|
||||
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
|
||||
uint32_t cv[8];
|
||||
memcpy(cv, key, BLAKE3_KEY_LEN);
|
||||
uint8_t block_flags = flags | flags_start;
|
||||
while (blocks > 0) {
|
||||
if (blocks == 1) {
|
||||
block_flags |= flags_end;
|
||||
}
|
||||
blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
|
||||
block_flags);
|
||||
input = &input[BLAKE3_BLOCK_LEN];
|
||||
blocks -= 1;
|
||||
block_flags = flags;
|
||||
}
|
||||
memcpy(out, cv, BLAKE3_OUT_LEN);
|
||||
}
|
||||
|
||||
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8],
|
||||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start,
|
||||
uint8_t flags_end, uint8_t *out) {
|
||||
while (num_inputs >= DEGREE) {
|
||||
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
|
||||
flags_start, flags_end, out);
|
||||
if (increment_counter) {
|
||||
counter += DEGREE;
|
||||
}
|
||||
inputs += DEGREE;
|
||||
num_inputs -= DEGREE;
|
||||
out = &out[DEGREE * BLAKE3_OUT_LEN];
|
||||
}
|
||||
while (num_inputs > 0) {
|
||||
hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
|
||||
flags_end, out);
|
||||
if (increment_counter) {
|
||||
counter += 1;
|
||||
}
|
||||
inputs += 1;
|
||||
num_inputs -= 1;
|
||||
out = &out[BLAKE3_OUT_LEN];
|
||||
}
|
||||
}
|
|
@ -0,0 +1,2028 @@
|
|||
/* Mark the stack non-executable on ELF/Linux so the linker does not demand
   an executable stack for this object. */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
#endif

/* Pull in Intel CET endbranch markers when the toolchain provides them. */
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif

/* Define _CET_ENDBR as a no-op when CET support is unavailable. */
#if !defined(_CET_ENDBR)
#define _CET_ENDBR
#endif

.intel_syntax noprefix
/* Each entry point is exported under both its plain and underscore-prefixed
   name; the latter is the symbol-naming convention used by some targets
   (note both labels are defined below). */
.global blake3_hash_many_sse2
.global _blake3_hash_many_sse2
.global blake3_compress_in_place_sse2
.global _blake3_compress_in_place_sse2
.global blake3_compress_xof_sse2
.global _blake3_compress_xof_sse2
#ifdef __APPLE__
.text
#else
.section .text
#endif
.p2align 6
|
||||
_blake3_hash_many_sse2:
|
||||
blake3_hash_many_sse2:
|
||||
_CET_ENDBR
|
||||
push r15
|
||||
push r14
|
||||
push r13
|
||||
push r12
|
||||
push rbx
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
sub rsp, 360
|
||||
and rsp, 0xFFFFFFFFFFFFFFC0
|
||||
neg r9d
|
||||
movd xmm0, r9d
|
||||
pshufd xmm0, xmm0, 0x00
|
||||
movdqa xmmword ptr [rsp+0x130], xmm0
|
||||
movdqa xmm1, xmm0
|
||||
pand xmm1, xmmword ptr [ADD0+rip]
|
||||
pand xmm0, xmmword ptr [ADD1+rip]
|
||||
movdqa xmmword ptr [rsp+0x150], xmm0
|
||||
movd xmm0, r8d
|
||||
pshufd xmm0, xmm0, 0x00
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmmword ptr [rsp+0x110], xmm0
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pcmpgtd xmm1, xmm0
|
||||
shr r8, 32
|
||||
movd xmm2, r8d
|
||||
pshufd xmm2, xmm2, 0x00
|
||||
psubd xmm2, xmm1
|
||||
movdqa xmmword ptr [rsp+0x120], xmm2
|
||||
mov rbx, qword ptr [rbp+0x50]
|
||||
mov r15, rdx
|
||||
shl r15, 6
|
||||
movzx r13d, byte ptr [rbp+0x38]
|
||||
movzx r12d, byte ptr [rbp+0x48]
|
||||
cmp rsi, 4
|
||||
jc 3f
|
||||
2:
|
||||
movdqu xmm3, xmmword ptr [rcx]
|
||||
pshufd xmm0, xmm3, 0x00
|
||||
pshufd xmm1, xmm3, 0x55
|
||||
pshufd xmm2, xmm3, 0xAA
|
||||
pshufd xmm3, xmm3, 0xFF
|
||||
movdqu xmm7, xmmword ptr [rcx+0x10]
|
||||
pshufd xmm4, xmm7, 0x00
|
||||
pshufd xmm5, xmm7, 0x55
|
||||
pshufd xmm6, xmm7, 0xAA
|
||||
pshufd xmm7, xmm7, 0xFF
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
mov r10, qword ptr [rdi+0x10]
|
||||
mov r11, qword ptr [rdi+0x18]
|
||||
movzx eax, byte ptr [rbp+0x40]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
9:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp], xmm8
|
||||
movdqa xmmword ptr [rsp+0x10], xmm9
|
||||
movdqa xmmword ptr [rsp+0x20], xmm12
|
||||
movdqa xmmword ptr [rsp+0x30], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0x40], xmm8
|
||||
movdqa xmmword ptr [rsp+0x50], xmm9
|
||||
movdqa xmmword ptr [rsp+0x60], xmm12
|
||||
movdqa xmmword ptr [rsp+0x70], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0x80], xmm8
|
||||
movdqa xmmword ptr [rsp+0x90], xmm9
|
||||
movdqa xmmword ptr [rsp+0xA0], xmm12
|
||||
movdqa xmmword ptr [rsp+0xB0], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0xC0], xmm8
|
||||
movdqa xmmword ptr [rsp+0xD0], xmm9
|
||||
movdqa xmmword ptr [rsp+0xE0], xmm12
|
||||
movdqa xmmword ptr [rsp+0xF0], xmm13
|
||||
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
|
||||
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
|
||||
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
|
||||
movdqa xmm12, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm13, xmmword ptr [rsp+0x120]
|
||||
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
movd xmm15, eax
|
||||
pshufd xmm15, xmm15, 0x00
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
prefetcht0 [r9+rdx+0x80]
|
||||
prefetcht0 [r10+rdx+0x80]
|
||||
prefetcht0 [r11+rdx+0x80]
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x40]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x10]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x50]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x80]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x90]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x20]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x70]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x60]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x10]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x90]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x30]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x40]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x20]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x60]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x50]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x70]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0x30]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x40]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x50]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x80]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x70]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x20]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x10]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x90]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x80]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0x20]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x30]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x60]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0x10]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0x90]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x30]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x40]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
pxor xmm0, xmm8
|
||||
pxor xmm1, xmm9
|
||||
pxor xmm2, xmm10
|
||||
pxor xmm3, xmm11
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
pxor xmm4, xmm12
|
||||
pxor xmm5, xmm13
|
||||
pxor xmm6, xmm14
|
||||
pxor xmm7, xmm15
|
||||
mov eax, r13d
|
||||
jne 9b
|
||||
movdqa xmm9, xmm0
|
||||
punpckldq xmm0, xmm1
|
||||
punpckhdq xmm9, xmm1
|
||||
movdqa xmm11, xmm2
|
||||
punpckldq xmm2, xmm3
|
||||
punpckhdq xmm11, xmm3
|
||||
movdqa xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpckhqdq xmm1, xmm2
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm3, xmm11
|
||||
movdqu xmmword ptr [rbx], xmm0
|
||||
movdqu xmmword ptr [rbx+0x20], xmm1
|
||||
movdqu xmmword ptr [rbx+0x40], xmm9
|
||||
movdqu xmmword ptr [rbx+0x60], xmm3
|
||||
movdqa xmm9, xmm4
|
||||
punpckldq xmm4, xmm5
|
||||
punpckhdq xmm9, xmm5
|
||||
movdqa xmm11, xmm6
|
||||
punpckldq xmm6, xmm7
|
||||
punpckhdq xmm11, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
punpcklqdq xmm4, xmm6
|
||||
punpckhqdq xmm5, xmm6
|
||||
movdqa xmm7, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm7, xmm11
|
||||
movdqu xmmword ptr [rbx+0x10], xmm4
|
||||
movdqu xmmword ptr [rbx+0x30], xmm5
|
||||
movdqu xmmword ptr [rbx+0x50], xmm9
|
||||
movdqu xmmword ptr [rbx+0x70], xmm7
|
||||
movdqa xmm1, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm0, xmm1
|
||||
paddd xmm1, xmmword ptr [rsp+0x150]
|
||||
movdqa xmmword ptr [rsp+0x110], xmm1
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pcmpgtd xmm0, xmm1
|
||||
movdqa xmm1, xmmword ptr [rsp+0x120]
|
||||
psubd xmm1, xmm0
|
||||
movdqa xmmword ptr [rsp+0x120], xmm1
|
||||
add rbx, 128
|
||||
add rdi, 32
|
||||
sub rsi, 4
|
||||
cmp rsi, 4
|
||||
jnc 2b
|
||||
test rsi, rsi
|
||||
jnz 3f
|
||||
4:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
pop rbx
|
||||
pop r12
|
||||
pop r13
|
||||
pop r14
|
||||
pop r15
|
||||
ret
|
||||
.p2align 5
|
||||
3:
|
||||
test esi, 0x2
|
||||
je 3f
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
pinsrd xmm13, dword ptr [rsp+0x120], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+0x114]
|
||||
pinsrd xmm14, dword ptr [rsp+0x124], 1
|
||||
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmmword ptr [rsp+0x10], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
movzx eax, byte ptr [rbp+0x40]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
2:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm10, xmm2
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm3, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm3, xmm5, 221
|
||||
movaps xmm5, xmm3
|
||||
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
||||
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
||||
movaps xmm3, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 0x93
|
||||
shufps xmm3, xmm7, 221
|
||||
pshufd xmm7, xmm3, 0x93
|
||||
movups xmm12, xmmword ptr [r9+rdx-0x40]
|
||||
movups xmm13, xmmword ptr [r9+rdx-0x30]
|
||||
movaps xmm11, xmm12
|
||||
shufps xmm12, xmm13, 136
|
||||
shufps xmm11, xmm13, 221
|
||||
movaps xmm13, xmm11
|
||||
movups xmm14, xmmword ptr [r9+rdx-0x20]
|
||||
movups xmm15, xmmword ptr [r9+rdx-0x10]
|
||||
movaps xmm11, xmm14
|
||||
shufps xmm14, xmm15, 136
|
||||
pshufd xmm14, xmm14, 0x93
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 0x93
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+0x10]
|
||||
pinsrd xmm3, eax, 3
|
||||
pinsrd xmm11, eax, 3
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm8, xmm12
|
||||
movaps xmmword ptr [rsp+0x20], xmm4
|
||||
movaps xmmword ptr [rsp+0x30], xmm12
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm12, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm8, xmm13
|
||||
movaps xmmword ptr [rsp+0x40], xmm5
|
||||
movaps xmmword ptr [rsp+0x50], xmm13
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm13, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 0x93
|
||||
pshufd xmm8, xmm8, 0x93
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm11, xmm11, 0x4E
|
||||
pshufd xmm2, xmm2, 0x39
|
||||
pshufd xmm10, xmm10, 0x39
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm8, xmm14
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm8, xmm15
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 0x39
|
||||
pshufd xmm8, xmm8, 0x39
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm11, xmm11, 0x4E
|
||||
pshufd xmm2, xmm2, 0x93
|
||||
pshufd xmm10, xmm10, 0x93
|
||||
dec al
|
||||
je 9f
|
||||
movdqa xmm12, xmmword ptr [rsp+0x20]
|
||||
movdqa xmm5, xmmword ptr [rsp+0x40]
|
||||
pshufd xmm13, xmm12, 0x0F
|
||||
shufps xmm12, xmm5, 214
|
||||
pshufd xmm4, xmm12, 0x39
|
||||
movdqa xmm12, xmm6
|
||||
shufps xmm12, xmm7, 250
|
||||
pblendw xmm13, xmm12, 0xCC
|
||||
movdqa xmm12, xmm7
|
||||
punpcklqdq xmm12, xmm5
|
||||
pblendw xmm12, xmm6, 0xC0
|
||||
pshufd xmm12, xmm12, 0x78
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 0x1E
|
||||
movdqa xmmword ptr [rsp+0x20], xmm13
|
||||
movdqa xmmword ptr [rsp+0x40], xmm12
|
||||
movdqa xmm5, xmmword ptr [rsp+0x30]
|
||||
movdqa xmm13, xmmword ptr [rsp+0x50]
|
||||
pshufd xmm6, xmm5, 0x0F
|
||||
shufps xmm5, xmm13, 214
|
||||
pshufd xmm12, xmm5, 0x39
|
||||
movdqa xmm5, xmm14
|
||||
shufps xmm5, xmm15, 250
|
||||
pblendw xmm6, xmm5, 0xCC
|
||||
movdqa xmm5, xmm15
|
||||
punpcklqdq xmm5, xmm13
|
||||
pblendw xmm5, xmm14, 0xC0
|
||||
pshufd xmm5, xmm5, 0x78
|
||||
punpckhdq xmm13, xmm15
|
||||
punpckldq xmm14, xmm13
|
||||
pshufd xmm15, xmm14, 0x1E
|
||||
movdqa xmm13, xmm6
|
||||
movdqa xmm14, xmm5
|
||||
movdqa xmm5, xmmword ptr [rsp+0x20]
|
||||
movdqa xmm6, xmmword ptr [rsp+0x40]
|
||||
jmp 9b
|
||||
9:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
pxor xmm8, xmm10
|
||||
pxor xmm9, xmm11
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne 2b
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+0x10], xmm1
|
||||
movups xmmword ptr [rbx+0x20], xmm8
|
||||
movups xmmword ptr [rbx+0x30], xmm9
|
||||
movdqa xmm0, xmmword ptr [rsp+0x130]
|
||||
movdqa xmm1, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm2, xmmword ptr [rsp+0x120]
|
||||
movdqu xmm3, xmmword ptr [rsp+0x118]
|
||||
movdqu xmm4, xmmword ptr [rsp+0x128]
|
||||
blendvps xmm1, xmm3, xmm0
|
||||
blendvps xmm2, xmm4, xmm0
|
||||
movdqa xmmword ptr [rsp+0x110], xmm1
|
||||
movdqa xmmword ptr [rsp+0x120], xmm2
|
||||
add rdi, 16
|
||||
add rbx, 64
|
||||
sub rsi, 2
|
||||
3:
|
||||
test esi, 0x1
|
||||
je 4b
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
pinsrd xmm13, dword ptr [rsp+0x120], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmm14, xmmword ptr [ROT8+rip]
|
||||
movaps xmm15, xmmword ptr [ROT16+rip]
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+0x40]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
2:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm3, xmm13
|
||||
pinsrd xmm3, eax, 3
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm8, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm8, xmm5, 221
|
||||
movaps xmm5, xmm8
|
||||
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
||||
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
||||
movaps xmm8, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 0x93
|
||||
shufps xmm8, xmm7, 221
|
||||
pshufd xmm7, xmm8, 0x93
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 0x93
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm2, xmm2, 0x39
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 0x39
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm2, xmm2, 0x93
|
||||
dec al
|
||||
jz 9f
|
||||
movdqa xmm8, xmm4
|
||||
shufps xmm8, xmm5, 214
|
||||
pshufd xmm9, xmm4, 0x0F
|
||||
pshufd xmm4, xmm8, 0x39
|
||||
movdqa xmm8, xmm6
|
||||
shufps xmm8, xmm7, 250
|
||||
pblendw xmm9, xmm8, 0xCC
|
||||
movdqa xmm8, xmm7
|
||||
punpcklqdq xmm8, xmm5
|
||||
pblendw xmm8, xmm6, 0xC0
|
||||
pshufd xmm8, xmm8, 0x78
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 0x1E
|
||||
movdqa xmm5, xmm9
|
||||
movdqa xmm6, xmm8
|
||||
jmp 9b
|
||||
9:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne 2b
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+0x10], xmm1
|
||||
jmp 4b
|
||||
|
||||
.p2align 6
# blake3_compress_in_place_sse2(cv, block, block_len, counter, flags)
# SysV AMD64 args: rdi = cv (8 x u32, overwritten with the new chaining
# value), rsi = 64-byte message block, edx = block_len, rcx = 64-bit
# counter, r8d = domain flags.
# NOTE(review): per the commit message this body is copied verbatim from
# the SSE4.1 version; pshufb is SSSE3 and pblendw is SSE4.1, so this
# routine does not yet run on SSE2-only hardware -- to be rewritten.
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        # State rows 0-1 = chaining value, row 2 = first four IV words.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        # Row 3 = [counter_lo, counter_hi, block_len, flags]: fold flags
        # into the high half of rdx, then pack with the 64-bit counter.
        shl     r8, 32
        add     rdx, r8
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        # De-interleave the 16 message words into the xmm4-xmm7 layout the
        # vectorized G function expects (even/odd split via shufps, plus a
        # lane rotation of the second pair).
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        # Byte-shuffle masks implementing the 8- and 16-bit lane rotations.
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        # Seven rounds; al is the round counter.
        mov     al, 7
9:
        # Column step: G applied to all four columns at once.
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15        # rotate row 3 lanes right by 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1        # rotate row 1 lanes right by 12
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14        # rotate row 3 lanes right by 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1        # rotate row 1 lanes right by 7
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Diagonalize rows 0, 2, 3 so the next step works on diagonals.
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        # Diagonal step.
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Un-diagonalize back to column order.
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f
        # Apply the message permutation to xmm4-xmm7 for the next round.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC   # NOTE(review): pblendw is SSE4.1, not SSE2
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0   # NOTE(review): pblendw is SSE4.1, not SSE2
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        # Feed-forward: new CV = (rows 0,1) ^ (rows 2,3), stored in place.
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
|
||||
|
||||
.p2align 6
# blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out)
# SysV AMD64 args: rdi = cv (8 x u32, read only), rsi = 64-byte message
# block, dl = block_len, rcx = 64-bit counter, r8b = domain flags,
# r9 = 64-byte output buffer (full extended-output block).
# NOTE(review): per the commit message this body is copied verbatim from
# the SSE4.1 version; pshufb is SSSE3 and pblendw is SSE4.1, so this
# routine does not yet run on SSE2-only hardware -- to be rewritten.
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        # State rows 0-1 = chaining value, row 2 = first four IV words.
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        # Row 3 = [counter_lo, counter_hi, block_len, flags]: zero-extend
        # the byte-sized args, fold flags into the high half of rdx, then
        # pack with the 64-bit counter.
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        # De-interleave the 16 message words into the xmm4-xmm7 layout the
        # vectorized G function expects.
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        # Byte-shuffle masks implementing the 8- and 16-bit lane rotations.
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        # Seven rounds; al is the round counter.
        mov     al, 7
9:
        # Column step: G applied to all four columns at once.
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15        # rotate row 3 lanes right by 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1        # rotate row 1 lanes right by 12
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14        # rotate row 3 lanes right by 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1        # rotate row 1 lanes right by 7
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Diagonalize rows 0, 2, 3.
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        # Diagonal step.
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Un-diagonalize back to column order.
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f
        # Apply the message permutation to xmm4-xmm7 for the next round.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC   # NOTE(review): pblendw is SSE4.1, not SSE2
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0   # NOTE(review): pblendw is SSE4.1, not SSE2
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        # XOF output is 64 bytes: low half is the usual feed-forward
        # (rows 0,1 ^ rows 2,3); high half is rows 2,3 XORed with the
        # original chaining value reloaded from [rdi].
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
|
||||
|
||||
|
||||
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
# First four words of the BLAKE3 IV; loaded as row 2 of the compression
# state in the one-block kernels above.
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
# pshufb masks rotating each 32-bit lane right by 16 / by 8 bits.
# NOTE(review): pshufb is SSSE3; a true SSE2 port will need a different
# rotation scheme and may drop these masks.
ROT16:
.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
# Per-lane counter deltas used when setting up the 4-way hash_many
# counter vector (masked by the increment_counter flag before adding).
ADD0:
.long 0, 1, 2, 3
ADD1:
.long 4, 4, 4, 4
# IV words broadcast across all four lanes for the multi-input kernel.
BLAKE3_IV_0:
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
# Block length (64) broadcast across all four lanes.
BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
# Sign-bit mask: XORed into both operands before pcmpgtd to emulate an
# unsigned 32-bit compare (counter-overflow carry detection).
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
@ -0,0 +1,2069 @@
|
|||
.intel_syntax noprefix
|
||||
.global blake3_hash_many_sse2
|
||||
.global _blake3_hash_many_sse2
|
||||
.global blake3_compress_in_place_sse2
|
||||
.global _blake3_compress_in_place_sse2
|
||||
.global blake3_compress_xof_sse2
|
||||
.global _blake3_compress_xof_sse2
|
||||
.section .text
|
||||
.p2align 6
|
||||
_blake3_hash_many_sse2:
|
||||
blake3_hash_many_sse2:
|
||||
push r15
|
||||
push r14
|
||||
push r13
|
||||
push r12
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
sub rsp, 528
|
||||
and rsp, 0xFFFFFFFFFFFFFFC0
|
||||
movdqa xmmword ptr [rsp+0x170], xmm6
|
||||
movdqa xmmword ptr [rsp+0x180], xmm7
|
||||
movdqa xmmword ptr [rsp+0x190], xmm8
|
||||
movdqa xmmword ptr [rsp+0x1A0], xmm9
|
||||
movdqa xmmword ptr [rsp+0x1B0], xmm10
|
||||
movdqa xmmword ptr [rsp+0x1C0], xmm11
|
||||
movdqa xmmword ptr [rsp+0x1D0], xmm12
|
||||
movdqa xmmword ptr [rsp+0x1E0], xmm13
|
||||
movdqa xmmword ptr [rsp+0x1F0], xmm14
|
||||
movdqa xmmword ptr [rsp+0x200], xmm15
|
||||
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
mov r8, qword ptr [rbp+0x68]
|
||||
movzx r9, byte ptr [rbp+0x70]
|
||||
neg r9d
|
||||
movd xmm0, r9d
|
||||
pshufd xmm0, xmm0, 0x00
|
||||
movdqa xmmword ptr [rsp+0x130], xmm0
|
||||
movdqa xmm1, xmm0
|
||||
pand xmm1, xmmword ptr [ADD0+rip]
|
||||
pand xmm0, xmmword ptr [ADD1+rip]
|
||||
movdqa xmmword ptr [rsp+0x150], xmm0
|
||||
movd xmm0, r8d
|
||||
pshufd xmm0, xmm0, 0x00
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmmword ptr [rsp+0x110], xmm0
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pcmpgtd xmm1, xmm0
|
||||
shr r8, 32
|
||||
movd xmm2, r8d
|
||||
pshufd xmm2, xmm2, 0x00
|
||||
psubd xmm2, xmm1
|
||||
movdqa xmmword ptr [rsp+0x120], xmm2
|
||||
mov rbx, qword ptr [rbp+0x90]
|
||||
mov r15, rdx
|
||||
shl r15, 6
|
||||
movzx r13d, byte ptr [rbp+0x78]
|
||||
movzx r12d, byte ptr [rbp+0x88]
|
||||
cmp rsi, 4
|
||||
jc 3f
|
||||
2:
|
||||
movdqu xmm3, xmmword ptr [rcx]
|
||||
pshufd xmm0, xmm3, 0x00
|
||||
pshufd xmm1, xmm3, 0x55
|
||||
pshufd xmm2, xmm3, 0xAA
|
||||
pshufd xmm3, xmm3, 0xFF
|
||||
movdqu xmm7, xmmword ptr [rcx+0x10]
|
||||
pshufd xmm4, xmm7, 0x00
|
||||
pshufd xmm5, xmm7, 0x55
|
||||
pshufd xmm6, xmm7, 0xAA
|
||||
pshufd xmm7, xmm7, 0xFF
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
mov r10, qword ptr [rdi+0x10]
|
||||
mov r11, qword ptr [rdi+0x18]
|
||||
movzx eax, byte ptr [rbp+0x80]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
9:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp], xmm8
|
||||
movdqa xmmword ptr [rsp+0x10], xmm9
|
||||
movdqa xmmword ptr [rsp+0x20], xmm12
|
||||
movdqa xmmword ptr [rsp+0x30], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0x40], xmm8
|
||||
movdqa xmmword ptr [rsp+0x50], xmm9
|
||||
movdqa xmmword ptr [rsp+0x60], xmm12
|
||||
movdqa xmmword ptr [rsp+0x70], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0x80], xmm8
|
||||
movdqa xmmword ptr [rsp+0x90], xmm9
|
||||
movdqa xmmword ptr [rsp+0xA0], xmm12
|
||||
movdqa xmmword ptr [rsp+0xB0], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0xC0], xmm8
|
||||
movdqa xmmword ptr [rsp+0xD0], xmm9
|
||||
movdqa xmmword ptr [rsp+0xE0], xmm12
|
||||
movdqa xmmword ptr [rsp+0xF0], xmm13
|
||||
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
|
||||
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
|
||||
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
|
||||
movdqa xmm12, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm13, xmmword ptr [rsp+0x120]
|
||||
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
|
||||
movd xmm15, eax
|
||||
pshufd xmm15, xmm15, 0x00
|
||||
prefetcht0 [r8+rdx+0x80]
|
||||
prefetcht0 [r9+rdx+0x80]
|
||||
prefetcht0 [r10+rdx+0x80]
|
||||
prefetcht0 [r11+rdx+0x80]
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x40]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x10]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x50]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x80]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x90]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x20]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x70]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x60]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x10]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x90]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x30]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x40]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x20]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x60]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x50]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x70]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0x30]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x40]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x50]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x80]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x90]
|
||||
paddd xmm2, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm3, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x70]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x20]
|
||||
paddd xmm1, xmmword ptr [rsp+0x30]
|
||||
paddd xmm2, xmmword ptr [rsp+0x10]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x90]
|
||||
paddd xmm1, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x80]
|
||||
paddd xmm3, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm3, xmmword ptr [rsp+0x10]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0x20]
|
||||
paddd xmm3, xmmword ptr [rsp+0x40]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0x30]
|
||||
paddd xmm1, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x60]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xB0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x50]
|
||||
paddd xmm2, xmmword ptr [rsp+0x10]
|
||||
paddd xmm3, xmmword ptr [rsp+0x80]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xF0]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0x90]
|
||||
paddd xmm3, xmmword ptr [rsp+0x60]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xE0]
|
||||
paddd xmm1, xmmword ptr [rsp+0x20]
|
||||
paddd xmm2, xmmword ptr [rsp+0x30]
|
||||
paddd xmm3, xmmword ptr [rsp+0x70]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+0x100], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0xA0]
|
||||
paddd xmm1, xmmword ptr [rsp+0xC0]
|
||||
paddd xmm2, xmmword ptr [rsp+0x40]
|
||||
paddd xmm3, xmmword ptr [rsp+0xD0]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+0x100]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
pxor xmm0, xmm8
|
||||
pxor xmm1, xmm9
|
||||
pxor xmm2, xmm10
|
||||
pxor xmm3, xmm11
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
pxor xmm4, xmm12
|
||||
pxor xmm5, xmm13
|
||||
pxor xmm6, xmm14
|
||||
pxor xmm7, xmm15
|
||||
mov eax, r13d
|
||||
jne 9b
|
||||
movdqa xmm9, xmm0
|
||||
punpckldq xmm0, xmm1
|
||||
punpckhdq xmm9, xmm1
|
||||
movdqa xmm11, xmm2
|
||||
punpckldq xmm2, xmm3
|
||||
punpckhdq xmm11, xmm3
|
||||
movdqa xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpckhqdq xmm1, xmm2
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm3, xmm11
|
||||
movdqu xmmword ptr [rbx], xmm0
|
||||
movdqu xmmword ptr [rbx+0x20], xmm1
|
||||
movdqu xmmword ptr [rbx+0x40], xmm9
|
||||
movdqu xmmword ptr [rbx+0x60], xmm3
|
||||
movdqa xmm9, xmm4
|
||||
punpckldq xmm4, xmm5
|
||||
punpckhdq xmm9, xmm5
|
||||
movdqa xmm11, xmm6
|
||||
punpckldq xmm6, xmm7
|
||||
punpckhdq xmm11, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
punpcklqdq xmm4, xmm6
|
||||
punpckhqdq xmm5, xmm6
|
||||
movdqa xmm7, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm7, xmm11
|
||||
movdqu xmmword ptr [rbx+0x10], xmm4
|
||||
movdqu xmmword ptr [rbx+0x30], xmm5
|
||||
movdqu xmmword ptr [rbx+0x50], xmm9
|
||||
movdqu xmmword ptr [rbx+0x70], xmm7
|
||||
movdqa xmm1, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm0, xmm1
|
||||
paddd xmm1, xmmword ptr [rsp+0x150]
|
||||
movdqa xmmword ptr [rsp+0x110], xmm1
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
||||
pcmpgtd xmm0, xmm1
|
||||
movdqa xmm1, xmmword ptr [rsp+0x120]
|
||||
psubd xmm1, xmm0
|
||||
movdqa xmmword ptr [rsp+0x120], xmm1
|
||||
add rbx, 128
|
||||
add rdi, 32
|
||||
sub rsi, 4
|
||||
cmp rsi, 4
|
||||
jnc 2b
|
||||
test rsi, rsi
|
||||
jne 3f
|
||||
4:
|
||||
movdqa xmm6, xmmword ptr [rsp+0x170]
|
||||
movdqa xmm7, xmmword ptr [rsp+0x180]
|
||||
movdqa xmm8, xmmword ptr [rsp+0x190]
|
||||
movdqa xmm9, xmmword ptr [rsp+0x1A0]
|
||||
movdqa xmm10, xmmword ptr [rsp+0x1B0]
|
||||
movdqa xmm11, xmmword ptr [rsp+0x1C0]
|
||||
movdqa xmm12, xmmword ptr [rsp+0x1D0]
|
||||
movdqa xmm13, xmmword ptr [rsp+0x1E0]
|
||||
movdqa xmm14, xmmword ptr [rsp+0x1F0]
|
||||
movdqa xmm15, xmmword ptr [rsp+0x200]
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop r12
|
||||
pop r13
|
||||
pop r14
|
||||
pop r15
|
||||
ret
|
||||
.p2align 5
|
||||
3:
|
||||
test esi, 0x2
|
||||
je 3f
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
pinsrd xmm13, dword ptr [rsp+0x120], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+0x114]
|
||||
pinsrd xmm14, dword ptr [rsp+0x124], 1
|
||||
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmmword ptr [rsp+0x10], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+0x8]
|
||||
movzx eax, byte ptr [rbp+0x80]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
2:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm10, xmm2
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm3, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm3, xmm5, 221
|
||||
movaps xmm5, xmm3
|
||||
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
||||
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
||||
movaps xmm3, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 0x93
|
||||
shufps xmm3, xmm7, 221
|
||||
pshufd xmm7, xmm3, 0x93
|
||||
movups xmm12, xmmword ptr [r9+rdx-0x40]
|
||||
movups xmm13, xmmword ptr [r9+rdx-0x30]
|
||||
movaps xmm11, xmm12
|
||||
shufps xmm12, xmm13, 136
|
||||
shufps xmm11, xmm13, 221
|
||||
movaps xmm13, xmm11
|
||||
movups xmm14, xmmword ptr [r9+rdx-0x20]
|
||||
movups xmm15, xmmword ptr [r9+rdx-0x10]
|
||||
movaps xmm11, xmm14
|
||||
shufps xmm14, xmm15, 136
|
||||
pshufd xmm14, xmm14, 0x93
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 0x93
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+0x10]
|
||||
pinsrd xmm3, eax, 3
|
||||
pinsrd xmm11, eax, 3
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm8, xmm12
|
||||
movaps xmmword ptr [rsp+0x20], xmm4
|
||||
movaps xmmword ptr [rsp+0x30], xmm12
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm12, xmmword ptr [ROT16+rip]
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm8, xmm13
|
||||
movaps xmmword ptr [rsp+0x40], xmm5
|
||||
movaps xmmword ptr [rsp+0x50], xmm13
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm13, xmmword ptr [ROT8+rip]
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 0x93
|
||||
pshufd xmm8, xmm8, 0x93
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm11, xmm11, 0x4E
|
||||
pshufd xmm2, xmm2, 0x39
|
||||
pshufd xmm10, xmm10, 0x39
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm8, xmm14
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm8, xmm15
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 0x39
|
||||
pshufd xmm8, xmm8, 0x39
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm11, xmm11, 0x4E
|
||||
pshufd xmm2, xmm2, 0x93
|
||||
pshufd xmm10, xmm10, 0x93
|
||||
dec al
|
||||
je 9f
|
||||
movdqa xmm12, xmmword ptr [rsp+0x20]
|
||||
movdqa xmm5, xmmword ptr [rsp+0x40]
|
||||
pshufd xmm13, xmm12, 0x0F
|
||||
shufps xmm12, xmm5, 214
|
||||
pshufd xmm4, xmm12, 0x39
|
||||
movdqa xmm12, xmm6
|
||||
shufps xmm12, xmm7, 250
|
||||
pblendw xmm13, xmm12, 0xCC
|
||||
movdqa xmm12, xmm7
|
||||
punpcklqdq xmm12, xmm5
|
||||
pblendw xmm12, xmm6, 0xC0
|
||||
pshufd xmm12, xmm12, 0x78
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 0x1E
|
||||
movdqa xmmword ptr [rsp+0x20], xmm13
|
||||
movdqa xmmword ptr [rsp+0x40], xmm12
|
||||
movdqa xmm5, xmmword ptr [rsp+0x30]
|
||||
movdqa xmm13, xmmword ptr [rsp+0x50]
|
||||
pshufd xmm6, xmm5, 0x0F
|
||||
shufps xmm5, xmm13, 214
|
||||
pshufd xmm12, xmm5, 0x39
|
||||
movdqa xmm5, xmm14
|
||||
shufps xmm5, xmm15, 250
|
||||
pblendw xmm6, xmm5, 0xCC
|
||||
movdqa xmm5, xmm15
|
||||
punpcklqdq xmm5, xmm13
|
||||
pblendw xmm5, xmm14, 0xC0
|
||||
pshufd xmm5, xmm5, 0x78
|
||||
punpckhdq xmm13, xmm15
|
||||
punpckldq xmm14, xmm13
|
||||
pshufd xmm15, xmm14, 0x1E
|
||||
movdqa xmm13, xmm6
|
||||
movdqa xmm14, xmm5
|
||||
movdqa xmm5, xmmword ptr [rsp+0x20]
|
||||
movdqa xmm6, xmmword ptr [rsp+0x40]
|
||||
jmp 9b
|
||||
9:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
pxor xmm8, xmm10
|
||||
pxor xmm9, xmm11
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne 2b
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+0x10], xmm1
|
||||
movups xmmword ptr [rbx+0x20], xmm8
|
||||
movups xmmword ptr [rbx+0x30], xmm9
|
||||
movdqa xmm0, xmmword ptr [rsp+0x130]
|
||||
movdqa xmm1, xmmword ptr [rsp+0x110]
|
||||
movdqa xmm2, xmmword ptr [rsp+0x120]
|
||||
movdqu xmm3, xmmword ptr [rsp+0x118]
|
||||
movdqu xmm4, xmmword ptr [rsp+0x128]
|
||||
blendvps xmm1, xmm3, xmm0
|
||||
blendvps xmm2, xmm4, xmm0
|
||||
movdqa xmmword ptr [rsp+0x110], xmm1
|
||||
movdqa xmmword ptr [rsp+0x120], xmm2
|
||||
add rdi, 16
|
||||
add rbx, 64
|
||||
sub rsi, 2
|
||||
3:
|
||||
test esi, 0x1
|
||||
je 4b
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+0x10]
|
||||
movd xmm13, dword ptr [rsp+0x110]
|
||||
pinsrd xmm13, dword ptr [rsp+0x120], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
|
||||
movaps xmm14, xmmword ptr [ROT8+rip]
|
||||
movaps xmm15, xmmword ptr [ROT16+rip]
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+0x80]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
2:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
||||
movaps xmm3, xmm13
|
||||
pinsrd xmm3, eax, 3
|
||||
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
||||
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
||||
movaps xmm8, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm8, xmm5, 221
|
||||
movaps xmm5, xmm8
|
||||
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
||||
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
||||
movaps xmm8, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 0x93
|
||||
shufps xmm8, xmm7, 221
|
||||
pshufd xmm7, xmm8, 0x93
|
||||
mov al, 7
|
||||
9:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 0x93
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm2, xmm2, 0x39
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 0x39
|
||||
pshufd xmm3, xmm3, 0x4E
|
||||
pshufd xmm2, xmm2, 0x93
|
||||
dec al
|
||||
jz 9f
|
||||
movdqa xmm8, xmm4
|
||||
shufps xmm8, xmm5, 214
|
||||
pshufd xmm9, xmm4, 0x0F
|
||||
pshufd xmm4, xmm8, 0x39
|
||||
movdqa xmm8, xmm6
|
||||
shufps xmm8, xmm7, 250
|
||||
pblendw xmm9, xmm8, 0xCC
|
||||
movdqa xmm8, xmm7
|
||||
punpcklqdq xmm8, xmm5
|
||||
pblendw xmm8, xmm6, 0xC0
|
||||
pshufd xmm8, xmm8, 0x78
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 0x1E
|
||||
movdqa xmm5, xmm9
|
||||
movdqa xmm6, xmm8
|
||||
jmp 9b
|
||||
9:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne 2b
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+0x10], xmm1
|
||||
jmp 4b
|
||||
|
||||
.p2align 6
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        # blake3_compress_in_place_sse2(cv, block, block_len, counter, flags)
        #
        # Windows x64 ABI:
        #   rcx = uint32_t cv[8]           chaining value, overwritten in place
        #   rdx = const uint8_t block[64]  message block
        #   r8b = uint8_t block_len
        #   r9  = uint64_t counter
        #   5th argument (flags) is passed on the stack.
        #
        # NOTE(review): this routine was seeded from the SSE4.1 version and
        # still uses pshufb (SSSE3) and pblendw (SSE4.1), which are not SSE2
        # instructions. Confirm this is intentional staging before dispatching
        # it on SSE2-only CPUs.
        sub rsp, 120
        # Save only the nonvolatile xmm registers actually clobbered below.
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+0x10], xmm7
        movdqa xmmword ptr [rsp+0x20], xmm8
        movdqa xmmword ptr [rsp+0x30], xmm9
        movdqa xmmword ptr [rsp+0x40], xmm11
        movdqa xmmword ptr [rsp+0x50], xmm14
        movdqa xmmword ptr [rsp+0x60], xmm15
        # State rows: xmm0/xmm1 = chaining value, xmm2 = IV words 0..3.
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+0x10]
        movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        # Row 3 = [counter_lo, counter_hi, block_len, flags].
        movzx eax, byte ptr [rsp+0xA0]        # flags: 5th stack arg, offset by our 120-byte frame
        movzx r8d, r8b
        shl rax, 32
        add r8, rax                           # r8 = block_len | (flags << 32)
        movq xmm3, r9
        movq xmm4, r8
        punpcklqdq xmm3, xmm4
        # Load the 64-byte block and de-interleave it into the round-0
        # message vectors xmm4..xmm7 (even words, odd words, rotated halves).
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+0x10]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        movups xmm6, xmmword ptr [rdx+0x20]
        movups xmm7, xmmword ptr [rdx+0x30]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 0x93
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 0x93
        # Byte-shuffle masks implementing the 8- and 16-bit word rotations.
        movaps xmm14, xmmword ptr [ROT8+rip]
        movaps xmm15, xmmword ptr [ROT16+rip]
        mov al, 7                             # round counter
9:
        # Column step of the G function:
        #   a += m; a += b; d = (d ^ a) >>> 16; c += d; b = (b ^ c) >>> 12; ...
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15                    # rotate each lane right by 16
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11                       # rotate right by 12
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14                    # rotate right by 8
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11                       # rotate right by 7
        # Diagonalize the rows for the diagonal step.
        pshufd xmm0, xmm0, 0x93
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x39
        # Diagonal step (same G sequence on the rotated rows).
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        # Un-diagonalize.
        pshufd xmm0, xmm0, 0x39
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x93
        dec al
        jz 9f
        # Apply the BLAKE3 message permutation to xmm4..xmm7 for the next round.
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0x0F
        pshufd xmm4, xmm8, 0x39
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 0x1E
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp 9b
9:
        # Feed-forward: new cv = first half XOR second half, stored in place.
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        movups xmmword ptr [rcx], xmm0
        movups xmmword ptr [rcx+0x10], xmm1
        # Restore the saved nonvolatile xmm registers and return.
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+0x10]
        movdqa xmm8, xmmword ptr [rsp+0x20]
        movdqa xmm9, xmmword ptr [rsp+0x30]
        movdqa xmm11, xmmword ptr [rsp+0x40]
        movdqa xmm14, xmmword ptr [rsp+0x50]
        movdqa xmm15, xmmword ptr [rsp+0x60]
        add rsp, 120
        ret
|
||||
|
||||
|
||||
.p2align 6
_blake3_compress_xof_sse2:
blake3_compress_xof_sse2:
        # blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out)
        #
        # Windows x64 ABI:
        #   rcx = const uint32_t cv[8]     input chaining value (not modified)
        #   rdx = const uint8_t block[64]  message block
        #   r8b = uint8_t block_len
        #   r9  = uint64_t counter
        #   5th argument (flags) and 6th argument (uint8_t out[64]) are
        #   passed on the stack.
        #
        # Writes the full 64-byte extended output to `out`.
        #
        # NOTE(review): seeded from the SSE4.1 version; pshufb (SSSE3) and
        # pblendw (SSE4.1) are not SSE2 instructions — confirm this is
        # intentional staging before dispatching on SSE2-only CPUs.
        sub rsp, 120
        # Save only the nonvolatile xmm registers actually clobbered below.
        movdqa xmmword ptr [rsp], xmm6
        movdqa xmmword ptr [rsp+0x10], xmm7
        movdqa xmmword ptr [rsp+0x20], xmm8
        movdqa xmmword ptr [rsp+0x30], xmm9
        movdqa xmmword ptr [rsp+0x40], xmm11
        movdqa xmmword ptr [rsp+0x50], xmm14
        movdqa xmmword ptr [rsp+0x60], xmm15
        # State rows: xmm0/xmm1 = chaining value, xmm2 = IV words 0..3.
        movups xmm0, xmmword ptr [rcx]
        movups xmm1, xmmword ptr [rcx+0x10]
        movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
        movzx eax, byte ptr [rsp+0xA0]        # flags: 5th stack arg, offset by our 120-byte frame
        movzx r8d, r8b
        mov r10, qword ptr [rsp+0xA8]         # out: 6th stack arg
        shl rax, 32
        add r8, rax                           # r8 = block_len | (flags << 32)
        # Row 3 = [counter_lo, counter_hi, block_len, flags].
        movq xmm3, r9
        movq xmm4, r8
        punpcklqdq xmm3, xmm4
        # Load the 64-byte block and de-interleave it into the round-0
        # message vectors xmm4..xmm7.
        movups xmm4, xmmword ptr [rdx]
        movups xmm5, xmmword ptr [rdx+0x10]
        movaps xmm8, xmm4
        shufps xmm4, xmm5, 136
        shufps xmm8, xmm5, 221
        movaps xmm5, xmm8
        movups xmm6, xmmword ptr [rdx+0x20]
        movups xmm7, xmmword ptr [rdx+0x30]
        movaps xmm8, xmm6
        shufps xmm6, xmm7, 136
        pshufd xmm6, xmm6, 0x93
        shufps xmm8, xmm7, 221
        pshufd xmm7, xmm8, 0x93
        # Byte-shuffle masks implementing the 8- and 16-bit word rotations.
        movaps xmm14, xmmword ptr [ROT8+rip]
        movaps xmm15, xmmword ptr [ROT16+rip]
        mov al, 7                             # round counter
9:
        # Column step of the G function (rotations 16, 12, 8, 7).
        paddd xmm0, xmm4
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15                    # rotate each lane right by 16
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11                       # rotate right by 12
        paddd xmm0, xmm5
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14                    # rotate right by 8
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11                       # rotate right by 7
        # Diagonalize the rows for the diagonal step.
        pshufd xmm0, xmm0, 0x93
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x39
        # Diagonal step.
        paddd xmm0, xmm6
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm15
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 20
        psrld xmm11, 12
        por xmm1, xmm11
        paddd xmm0, xmm7
        paddd xmm0, xmm1
        pxor xmm3, xmm0
        pshufb xmm3, xmm14
        paddd xmm2, xmm3
        pxor xmm1, xmm2
        movdqa xmm11, xmm1
        pslld xmm1, 25
        psrld xmm11, 7
        por xmm1, xmm11
        # Un-diagonalize.
        pshufd xmm0, xmm0, 0x39
        pshufd xmm3, xmm3, 0x4E
        pshufd xmm2, xmm2, 0x93
        dec al
        jz 9f
        # Apply the BLAKE3 message permutation to xmm4..xmm7 for the next round.
        movdqa xmm8, xmm4
        shufps xmm8, xmm5, 214
        pshufd xmm9, xmm4, 0x0F
        pshufd xmm4, xmm8, 0x39
        movdqa xmm8, xmm6
        shufps xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd xmm7, xmm6, 0x1E
        movdqa xmm5, xmm9
        movdqa xmm6, xmm8
        jmp 9b
9:
        # XOF finalization: low 32 bytes are the usual feed-forward; the high
        # 32 bytes are the second-half state XORed with the input cv.
        movdqu xmm4, xmmword ptr [rcx]
        movdqu xmm5, xmmword ptr [rcx+0x10]
        pxor xmm0, xmm2
        pxor xmm1, xmm3
        pxor xmm2, xmm4
        pxor xmm3, xmm5
        movups xmmword ptr [r10], xmm0
        movups xmmword ptr [r10+0x10], xmm1
        movups xmmword ptr [r10+0x20], xmm2
        movups xmmword ptr [r10+0x30], xmm3
        # Restore the saved nonvolatile xmm registers and return.
        movdqa xmm6, xmmword ptr [rsp]
        movdqa xmm7, xmmword ptr [rsp+0x10]
        movdqa xmm8, xmmword ptr [rsp+0x20]
        movdqa xmm9, xmmword ptr [rsp+0x30]
        movdqa xmm11, xmmword ptr [rsp+0x40]
        movdqa xmm14, xmmword ptr [rsp+0x50]
        movdqa xmm15, xmmword ptr [rsp+0x60]
        add rsp, 120
        ret
|
||||
|
||||
|
||||
.section .rodata
.p2align 6
# First four words of the BLAKE3 IV; loaded as the third state row by the
# compression routines above.
BLAKE3_IV:
        .long 0x6A09E667, 0xBB67AE85
        .long 0x3C6EF372, 0xA54FF53A
# pshufb masks rotating each 32-bit lane right by 16 / 8 bits.
ROT16:
        .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
        .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
# Per-lane counter offsets and the per-iteration increment used by the
# 4-way hash_many kernel's counter setup.
ADD0:
        .long 0, 1, 2, 3
ADD1:
        .long 4, 4, 4, 4
# Each IV word broadcast across all four lanes, for the transposed
# multi-input state.
BLAKE3_IV_0:
        .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
# Block length (64) broadcast across all four lanes.
BLAKE3_BLOCK_LEN:
        .long 64, 64, 64, 64
# Sign-bit flip mask: XORed in before pcmpgtd so the signed compare acts as
# an unsigned compare for counter carry detection.
CMP_MSB_MASK:
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
@ -0,0 +1,2089 @@
|
|||
; MASM (windows_msvc) file preamble: export the three SSE2 entry points.
; Each symbol is declared under both its plain and underscore-prefixed name —
; presumably to match both decorated and undecorated C symbol conventions;
; the paired PROC labels below alias the same code.
; NOTE(review): the stray "|" / "||||" lines are diff-viewer residue preserved
; verbatim from the extracted source.
public _blake3_hash_many_sse2
|
||||
public blake3_hash_many_sse2
|
||||
public blake3_compress_in_place_sse2
|
||||
public _blake3_compress_in_place_sse2
|
||||
public blake3_compress_xof_sse2
|
||||
public _blake3_compress_xof_sse2
|
||||
|
||||
; Code segment, 16-byte aligned (xmm spill slots below require 16-byte
; alignment).
_TEXT SEGMENT ALIGN(16) 'CODE'
|
||||
|
||||
ALIGN 16
|
||||
blake3_hash_many_sse2 PROC
|
||||
_blake3_hash_many_sse2 PROC
|
||||
push r15
|
||||
push r14
|
||||
push r13
|
||||
push r12
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
sub rsp, 528
|
||||
and rsp, 0FFFFFFFFFFFFFFC0H
|
||||
movdqa xmmword ptr [rsp+170H], xmm6
|
||||
movdqa xmmword ptr [rsp+180H], xmm7
|
||||
movdqa xmmword ptr [rsp+190H], xmm8
|
||||
movdqa xmmword ptr [rsp+1A0H], xmm9
|
||||
movdqa xmmword ptr [rsp+1B0H], xmm10
|
||||
movdqa xmmword ptr [rsp+1C0H], xmm11
|
||||
movdqa xmmword ptr [rsp+1D0H], xmm12
|
||||
movdqa xmmword ptr [rsp+1E0H], xmm13
|
||||
movdqa xmmword ptr [rsp+1F0H], xmm14
|
||||
movdqa xmmword ptr [rsp+200H], xmm15
|
||||
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
mov r8, qword ptr [rbp+68H]
|
||||
movzx r9, byte ptr [rbp+70H]
|
||||
neg r9d
|
||||
movd xmm0, r9d
|
||||
pshufd xmm0, xmm0, 00H
|
||||
movdqa xmmword ptr [rsp+130H], xmm0
|
||||
movdqa xmm1, xmm0
|
||||
pand xmm1, xmmword ptr [ADD0]
|
||||
pand xmm0, xmmword ptr [ADD1]
|
||||
movdqa xmmword ptr [rsp+150H], xmm0
|
||||
movd xmm0, r8d
|
||||
pshufd xmm0, xmm0, 00H
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmmword ptr [rsp+110H], xmm0
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
|
||||
pcmpgtd xmm1, xmm0
|
||||
shr r8, 32
|
||||
movd xmm2, r8d
|
||||
pshufd xmm2, xmm2, 00H
|
||||
psubd xmm2, xmm1
|
||||
movdqa xmmword ptr [rsp+120H], xmm2
|
||||
mov rbx, qword ptr [rbp+90H]
|
||||
mov r15, rdx
|
||||
shl r15, 6
|
||||
movzx r13d, byte ptr [rbp+78H]
|
||||
movzx r12d, byte ptr [rbp+88H]
|
||||
cmp rsi, 4
|
||||
jc final3blocks
|
||||
outerloop4:
|
||||
movdqu xmm3, xmmword ptr [rcx]
|
||||
pshufd xmm0, xmm3, 00H
|
||||
pshufd xmm1, xmm3, 55H
|
||||
pshufd xmm2, xmm3, 0AAH
|
||||
pshufd xmm3, xmm3, 0FFH
|
||||
movdqu xmm7, xmmword ptr [rcx+10H]
|
||||
pshufd xmm4, xmm7, 00H
|
||||
pshufd xmm5, xmm7, 55H
|
||||
pshufd xmm6, xmm7, 0AAH
|
||||
pshufd xmm7, xmm7, 0FFH
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+8H]
|
||||
mov r10, qword ptr [rdi+10H]
|
||||
mov r11, qword ptr [rdi+18H]
|
||||
movzx eax, byte ptr [rbp+80H]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
innerloop4:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-40H]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-40H]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-40H]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-40H]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp], xmm8
|
||||
movdqa xmmword ptr [rsp+10H], xmm9
|
||||
movdqa xmmword ptr [rsp+20H], xmm12
|
||||
movdqa xmmword ptr [rsp+30H], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-30H]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-30H]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-30H]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-30H]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+40H], xmm8
|
||||
movdqa xmmword ptr [rsp+50H], xmm9
|
||||
movdqa xmmword ptr [rsp+60H], xmm12
|
||||
movdqa xmmword ptr [rsp+70H], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-20H]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-20H]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-20H]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-20H]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+80H], xmm8
|
||||
movdqa xmmword ptr [rsp+90H], xmm9
|
||||
movdqa xmmword ptr [rsp+0A0H], xmm12
|
||||
movdqa xmmword ptr [rsp+0B0H], xmm13
|
||||
movdqu xmm8, xmmword ptr [r8+rdx-10H]
|
||||
movdqu xmm9, xmmword ptr [r9+rdx-10H]
|
||||
movdqu xmm10, xmmword ptr [r10+rdx-10H]
|
||||
movdqu xmm11, xmmword ptr [r11+rdx-10H]
|
||||
movdqa xmm12, xmm8
|
||||
punpckldq xmm8, xmm9
|
||||
punpckhdq xmm12, xmm9
|
||||
movdqa xmm14, xmm10
|
||||
punpckldq xmm10, xmm11
|
||||
punpckhdq xmm14, xmm11
|
||||
movdqa xmm9, xmm8
|
||||
punpcklqdq xmm8, xmm10
|
||||
punpckhqdq xmm9, xmm10
|
||||
movdqa xmm13, xmm12
|
||||
punpcklqdq xmm12, xmm14
|
||||
punpckhqdq xmm13, xmm14
|
||||
movdqa xmmword ptr [rsp+0C0H], xmm8
|
||||
movdqa xmmword ptr [rsp+0D0H], xmm9
|
||||
movdqa xmmword ptr [rsp+0E0H], xmm12
|
||||
movdqa xmmword ptr [rsp+0F0H], xmm13
|
||||
movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
|
||||
movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
|
||||
movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
|
||||
movdqa xmm12, xmmword ptr [rsp+110H]
|
||||
movdqa xmm13, xmmword ptr [rsp+120H]
|
||||
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
|
||||
movd xmm15, eax
|
||||
pshufd xmm15, xmm15, 00H
|
||||
prefetcht0 byte ptr [r8+rdx+80H]
|
||||
prefetcht0 byte ptr [r9+rdx+80H]
|
||||
prefetcht0 byte ptr [r10+rdx+80H]
|
||||
prefetcht0 byte ptr [r11+rdx+80H]
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+20H]
|
||||
paddd xmm2, xmmword ptr [rsp+40H]
|
||||
paddd xmm3, xmmword ptr [rsp+60H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+10H]
|
||||
paddd xmm1, xmmword ptr [rsp+30H]
|
||||
paddd xmm2, xmmword ptr [rsp+50H]
|
||||
paddd xmm3, xmmword ptr [rsp+70H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+80H]
|
||||
paddd xmm1, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm2, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm3, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+90H]
|
||||
paddd xmm1, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm2, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm3, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+20H]
|
||||
paddd xmm1, xmmword ptr [rsp+30H]
|
||||
paddd xmm2, xmmword ptr [rsp+70H]
|
||||
paddd xmm3, xmmword ptr [rsp+40H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+60H]
|
||||
paddd xmm1, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+10H]
|
||||
paddd xmm1, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm2, xmmword ptr [rsp+90H]
|
||||
paddd xmm3, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm1, xmmword ptr [rsp+50H]
|
||||
paddd xmm2, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm3, xmmword ptr [rsp+80H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+30H]
|
||||
paddd xmm1, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm2, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm3, xmmword ptr [rsp+70H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+40H]
|
||||
paddd xmm1, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm2, xmmword ptr [rsp+20H]
|
||||
paddd xmm3, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+60H]
|
||||
paddd xmm1, xmmword ptr [rsp+90H]
|
||||
paddd xmm2, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm3, xmmword ptr [rsp+80H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+50H]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm3, xmmword ptr [rsp+10H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm1, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm2, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm3, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+70H]
|
||||
paddd xmm1, xmmword ptr [rsp+90H]
|
||||
paddd xmm2, xmmword ptr [rsp+30H]
|
||||
paddd xmm3, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+40H]
|
||||
paddd xmm1, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm2, xmmword ptr [rsp+50H]
|
||||
paddd xmm3, xmmword ptr [rsp+10H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp]
|
||||
paddd xmm1, xmmword ptr [rsp+20H]
|
||||
paddd xmm2, xmmword ptr [rsp+80H]
|
||||
paddd xmm3, xmmword ptr [rsp+60H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm1, xmmword ptr [rsp+90H]
|
||||
paddd xmm2, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm3, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm1, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm2, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm3, xmmword ptr [rsp+80H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+70H]
|
||||
paddd xmm1, xmmword ptr [rsp+50H]
|
||||
paddd xmm2, xmmword ptr [rsp]
|
||||
paddd xmm3, xmmword ptr [rsp+60H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+20H]
|
||||
paddd xmm1, xmmword ptr [rsp+30H]
|
||||
paddd xmm2, xmmword ptr [rsp+10H]
|
||||
paddd xmm3, xmmword ptr [rsp+40H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+90H]
|
||||
paddd xmm1, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm2, xmmword ptr [rsp+80H]
|
||||
paddd xmm3, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm1, xmmword ptr [rsp+50H]
|
||||
paddd xmm2, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm3, xmmword ptr [rsp+10H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+20H]
|
||||
paddd xmm3, xmmword ptr [rsp+40H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+30H]
|
||||
paddd xmm1, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm2, xmmword ptr [rsp+60H]
|
||||
paddd xmm3, xmmword ptr [rsp+70H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0B0H]
|
||||
paddd xmm1, xmmword ptr [rsp+50H]
|
||||
paddd xmm2, xmmword ptr [rsp+10H]
|
||||
paddd xmm3, xmmword ptr [rsp+80H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0F0H]
|
||||
paddd xmm1, xmmword ptr [rsp]
|
||||
paddd xmm2, xmmword ptr [rsp+90H]
|
||||
paddd xmm3, xmmword ptr [rsp+60H]
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm1, xmm5
|
||||
paddd xmm2, xmm6
|
||||
paddd xmm3, xmm7
|
||||
pxor xmm12, xmm0
|
||||
pxor xmm13, xmm1
|
||||
pxor xmm14, xmm2
|
||||
pxor xmm15, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
pshufb xmm15, xmm8
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm12
|
||||
paddd xmm9, xmm13
|
||||
paddd xmm10, xmm14
|
||||
paddd xmm11, xmm15
|
||||
pxor xmm4, xmm8
|
||||
pxor xmm5, xmm9
|
||||
pxor xmm6, xmm10
|
||||
pxor xmm7, xmm11
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0E0H]
|
||||
paddd xmm1, xmmword ptr [rsp+20H]
|
||||
paddd xmm2, xmmword ptr [rsp+30H]
|
||||
paddd xmm3, xmmword ptr [rsp+70H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT16]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
movdqa xmmword ptr [rsp+100H], xmm8
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 12
|
||||
pslld xmm5, 20
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 12
|
||||
pslld xmm6, 20
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 12
|
||||
pslld xmm7, 20
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 12
|
||||
pslld xmm4, 20
|
||||
por xmm4, xmm8
|
||||
paddd xmm0, xmmword ptr [rsp+0A0H]
|
||||
paddd xmm1, xmmword ptr [rsp+0C0H]
|
||||
paddd xmm2, xmmword ptr [rsp+40H]
|
||||
paddd xmm3, xmmword ptr [rsp+0D0H]
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm2, xmm7
|
||||
paddd xmm3, xmm4
|
||||
pxor xmm15, xmm0
|
||||
pxor xmm12, xmm1
|
||||
pxor xmm13, xmm2
|
||||
pxor xmm14, xmm3
|
||||
movdqa xmm8, xmmword ptr [ROT8]
|
||||
pshufb xmm15, xmm8
|
||||
pshufb xmm12, xmm8
|
||||
pshufb xmm13, xmm8
|
||||
pshufb xmm14, xmm8
|
||||
paddd xmm10, xmm15
|
||||
paddd xmm11, xmm12
|
||||
movdqa xmm8, xmmword ptr [rsp+100H]
|
||||
paddd xmm8, xmm13
|
||||
paddd xmm9, xmm14
|
||||
pxor xmm5, xmm10
|
||||
pxor xmm6, xmm11
|
||||
pxor xmm7, xmm8
|
||||
pxor xmm4, xmm9
|
||||
pxor xmm0, xmm8
|
||||
pxor xmm1, xmm9
|
||||
pxor xmm2, xmm10
|
||||
pxor xmm3, xmm11
|
||||
movdqa xmm8, xmm5
|
||||
psrld xmm8, 7
|
||||
pslld xmm5, 25
|
||||
por xmm5, xmm8
|
||||
movdqa xmm8, xmm6
|
||||
psrld xmm8, 7
|
||||
pslld xmm6, 25
|
||||
por xmm6, xmm8
|
||||
movdqa xmm8, xmm7
|
||||
psrld xmm8, 7
|
||||
pslld xmm7, 25
|
||||
por xmm7, xmm8
|
||||
movdqa xmm8, xmm4
|
||||
psrld xmm8, 7
|
||||
pslld xmm4, 25
|
||||
por xmm4, xmm8
|
||||
pxor xmm4, xmm12
|
||||
pxor xmm5, xmm13
|
||||
pxor xmm6, xmm14
|
||||
pxor xmm7, xmm15
|
||||
mov eax, r13d
|
||||
jne innerloop4
|
||||
movdqa xmm9, xmm0
|
||||
punpckldq xmm0, xmm1
|
||||
punpckhdq xmm9, xmm1
|
||||
movdqa xmm11, xmm2
|
||||
punpckldq xmm2, xmm3
|
||||
punpckhdq xmm11, xmm3
|
||||
movdqa xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpckhqdq xmm1, xmm2
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm3, xmm11
|
||||
movdqu xmmword ptr [rbx], xmm0
|
||||
movdqu xmmword ptr [rbx+20H], xmm1
|
||||
movdqu xmmword ptr [rbx+40H], xmm9
|
||||
movdqu xmmword ptr [rbx+60H], xmm3
|
||||
movdqa xmm9, xmm4
|
||||
punpckldq xmm4, xmm5
|
||||
punpckhdq xmm9, xmm5
|
||||
movdqa xmm11, xmm6
|
||||
punpckldq xmm6, xmm7
|
||||
punpckhdq xmm11, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
punpcklqdq xmm4, xmm6
|
||||
punpckhqdq xmm5, xmm6
|
||||
movdqa xmm7, xmm9
|
||||
punpcklqdq xmm9, xmm11
|
||||
punpckhqdq xmm7, xmm11
|
||||
movdqu xmmword ptr [rbx+10H], xmm4
|
||||
movdqu xmmword ptr [rbx+30H], xmm5
|
||||
movdqu xmmword ptr [rbx+50H], xmm9
|
||||
movdqu xmmword ptr [rbx+70H], xmm7
|
||||
movdqa xmm1, xmmword ptr [rsp+110H]
|
||||
movdqa xmm0, xmm1
|
||||
paddd xmm1, xmmword ptr [rsp+150H]
|
||||
movdqa xmmword ptr [rsp+110H], xmm1
|
||||
pxor xmm0, xmmword ptr [CMP_MSB_MASK]
|
||||
pxor xmm1, xmmword ptr [CMP_MSB_MASK]
|
||||
pcmpgtd xmm0, xmm1
|
||||
movdqa xmm1, xmmword ptr [rsp+120H]
|
||||
psubd xmm1, xmm0
|
||||
movdqa xmmword ptr [rsp+120H], xmm1
|
||||
add rbx, 128
|
||||
add rdi, 32
|
||||
sub rsi, 4
|
||||
cmp rsi, 4
|
||||
jnc outerloop4
|
||||
test rsi, rsi
|
||||
jne final3blocks
|
||||
unwind:
|
||||
movdqa xmm6, xmmword ptr [rsp+170H]
|
||||
movdqa xmm7, xmmword ptr [rsp+180H]
|
||||
movdqa xmm8, xmmword ptr [rsp+190H]
|
||||
movdqa xmm9, xmmword ptr [rsp+1A0H]
|
||||
movdqa xmm10, xmmword ptr [rsp+1B0H]
|
||||
movdqa xmm11, xmmword ptr [rsp+1C0H]
|
||||
movdqa xmm12, xmmword ptr [rsp+1D0H]
|
||||
movdqa xmm13, xmmword ptr [rsp+1E0H]
|
||||
movdqa xmm14, xmmword ptr [rsp+1F0H]
|
||||
movdqa xmm15, xmmword ptr [rsp+200H]
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop r12
|
||||
pop r13
|
||||
pop r14
|
||||
pop r15
|
||||
ret
|
||||
ALIGN 16
|
||||
final3blocks:
|
||||
test esi, 2H
|
||||
je final1block
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+10H]
|
||||
movaps xmm8, xmm0
|
||||
movaps xmm9, xmm1
|
||||
movd xmm13, dword ptr [rsp+110H]
|
||||
pinsrd xmm13, dword ptr [rsp+120H], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
|
||||
movaps xmmword ptr [rsp], xmm13
|
||||
movd xmm14, dword ptr [rsp+114H]
|
||||
pinsrd xmm14, dword ptr [rsp+124H], 1
|
||||
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
|
||||
movaps xmmword ptr [rsp+10H], xmm14
|
||||
mov r8, qword ptr [rdi]
|
||||
mov r9, qword ptr [rdi+8H]
|
||||
movzx eax, byte ptr [rbp+80H]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
innerloop2:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV]
|
||||
movaps xmm10, xmm2
|
||||
movups xmm4, xmmword ptr [r8+rdx-40H]
|
||||
movups xmm5, xmmword ptr [r8+rdx-30H]
|
||||
movaps xmm3, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm3, xmm5, 221
|
||||
movaps xmm5, xmm3
|
||||
movups xmm6, xmmword ptr [r8+rdx-20H]
|
||||
movups xmm7, xmmword ptr [r8+rdx-10H]
|
||||
movaps xmm3, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 93H
|
||||
shufps xmm3, xmm7, 221
|
||||
pshufd xmm7, xmm3, 93H
|
||||
movups xmm12, xmmword ptr [r9+rdx-40H]
|
||||
movups xmm13, xmmword ptr [r9+rdx-30H]
|
||||
movaps xmm11, xmm12
|
||||
shufps xmm12, xmm13, 136
|
||||
shufps xmm11, xmm13, 221
|
||||
movaps xmm13, xmm11
|
||||
movups xmm14, xmmword ptr [r9+rdx-20H]
|
||||
movups xmm15, xmmword ptr [r9+rdx-10H]
|
||||
movaps xmm11, xmm14
|
||||
shufps xmm14, xmm15, 136
|
||||
pshufd xmm14, xmm14, 93H
|
||||
shufps xmm11, xmm15, 221
|
||||
pshufd xmm15, xmm11, 93H
|
||||
movaps xmm3, xmmword ptr [rsp]
|
||||
movaps xmm11, xmmword ptr [rsp+10H]
|
||||
pinsrd xmm3, eax, 3
|
||||
pinsrd xmm11, eax, 3
|
||||
mov al, 7
|
||||
roundloop2:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm8, xmm12
|
||||
movaps xmmword ptr [rsp+20H], xmm4
|
||||
movaps xmmword ptr [rsp+30H], xmm12
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm12, xmmword ptr [ROT16]
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm8, xmm13
|
||||
movaps xmmword ptr [rsp+40H], xmm5
|
||||
movaps xmmword ptr [rsp+50H], xmm13
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
movaps xmm13, xmmword ptr [ROT8]
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 93H
|
||||
pshufd xmm8, xmm8, 93H
|
||||
pshufd xmm3, xmm3, 4EH
|
||||
pshufd xmm11, xmm11, 4EH
|
||||
pshufd xmm2, xmm2, 39H
|
||||
pshufd xmm10, xmm10, 39H
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm8, xmm14
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm12
|
||||
pshufb xmm11, xmm12
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm4, 12
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 20
|
||||
psrld xmm4, 12
|
||||
por xmm9, xmm4
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm8, xmm15
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm8, xmm9
|
||||
pxor xmm3, xmm0
|
||||
pxor xmm11, xmm8
|
||||
pshufb xmm3, xmm13
|
||||
pshufb xmm11, xmm13
|
||||
paddd xmm2, xmm3
|
||||
paddd xmm10, xmm11
|
||||
pxor xmm1, xmm2
|
||||
pxor xmm9, xmm10
|
||||
movdqa xmm4, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm4, 7
|
||||
por xmm1, xmm4
|
||||
movdqa xmm4, xmm9
|
||||
pslld xmm9, 25
|
||||
psrld xmm4, 7
|
||||
por xmm9, xmm4
|
||||
pshufd xmm0, xmm0, 39H
|
||||
pshufd xmm8, xmm8, 39H
|
||||
pshufd xmm3, xmm3, 4EH
|
||||
pshufd xmm11, xmm11, 4EH
|
||||
pshufd xmm2, xmm2, 93H
|
||||
pshufd xmm10, xmm10, 93H
|
||||
dec al
|
||||
je endroundloop2
|
||||
movdqa xmm12, xmmword ptr [rsp+20H]
|
||||
movdqa xmm5, xmmword ptr [rsp+40H]
|
||||
pshufd xmm13, xmm12, 0FH
|
||||
shufps xmm12, xmm5, 214
|
||||
pshufd xmm4, xmm12, 39H
|
||||
movdqa xmm12, xmm6
|
||||
shufps xmm12, xmm7, 250
|
||||
pblendw xmm13, xmm12, 0CCH
|
||||
movdqa xmm12, xmm7
|
||||
punpcklqdq xmm12, xmm5
|
||||
pblendw xmm12, xmm6, 0C0H
|
||||
pshufd xmm12, xmm12, 78H
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 1EH
|
||||
movdqa xmmword ptr [rsp+20H], xmm13
|
||||
movdqa xmmword ptr [rsp+40H], xmm12
|
||||
movdqa xmm5, xmmword ptr [rsp+30H]
|
||||
movdqa xmm13, xmmword ptr [rsp+50H]
|
||||
pshufd xmm6, xmm5, 0FH
|
||||
shufps xmm5, xmm13, 214
|
||||
pshufd xmm12, xmm5, 39H
|
||||
movdqa xmm5, xmm14
|
||||
shufps xmm5, xmm15, 250
|
||||
pblendw xmm6, xmm5, 0CCH
|
||||
movdqa xmm5, xmm15
|
||||
punpcklqdq xmm5, xmm13
|
||||
pblendw xmm5, xmm14, 0C0H
|
||||
pshufd xmm5, xmm5, 78H
|
||||
punpckhdq xmm13, xmm15
|
||||
punpckldq xmm14, xmm13
|
||||
pshufd xmm15, xmm14, 1EH
|
||||
movdqa xmm13, xmm6
|
||||
movdqa xmm14, xmm5
|
||||
movdqa xmm5, xmmword ptr [rsp+20H]
|
||||
movdqa xmm6, xmmword ptr [rsp+40H]
|
||||
jmp roundloop2
|
||||
endroundloop2:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
pxor xmm8, xmm10
|
||||
pxor xmm9, xmm11
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne innerloop2
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+10H], xmm1
|
||||
movups xmmword ptr [rbx+20H], xmm8
|
||||
movups xmmword ptr [rbx+30H], xmm9
|
||||
movdqa xmm0, xmmword ptr [rsp+130H]
|
||||
movdqa xmm1, xmmword ptr [rsp+110H]
|
||||
movdqa xmm2, xmmword ptr [rsp+120H]
|
||||
movdqu xmm3, xmmword ptr [rsp+118H]
|
||||
movdqu xmm4, xmmword ptr [rsp+128H]
|
||||
blendvps xmm1, xmm3, xmm0
|
||||
blendvps xmm2, xmm4, xmm0
|
||||
movdqa xmmword ptr [rsp+110H], xmm1
|
||||
movdqa xmmword ptr [rsp+120H], xmm2
|
||||
add rdi, 16
|
||||
add rbx, 64
|
||||
sub rsi, 2
|
||||
final1block:
|
||||
test esi, 1H
|
||||
je unwind
|
||||
movups xmm0, xmmword ptr [rcx]
|
||||
movups xmm1, xmmword ptr [rcx+10H]
|
||||
movd xmm13, dword ptr [rsp+110H]
|
||||
pinsrd xmm13, dword ptr [rsp+120H], 1
|
||||
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
|
||||
movaps xmm14, xmmword ptr [ROT8]
|
||||
movaps xmm15, xmmword ptr [ROT16]
|
||||
mov r8, qword ptr [rdi]
|
||||
movzx eax, byte ptr [rbp+80H]
|
||||
or eax, r13d
|
||||
xor edx, edx
|
||||
innerloop1:
|
||||
mov r14d, eax
|
||||
or eax, r12d
|
||||
add rdx, 64
|
||||
cmp rdx, r15
|
||||
cmovne eax, r14d
|
||||
movaps xmm2, xmmword ptr [BLAKE3_IV]
|
||||
movaps xmm3, xmm13
|
||||
pinsrd xmm3, eax, 3
|
||||
movups xmm4, xmmword ptr [r8+rdx-40H]
|
||||
movups xmm5, xmmword ptr [r8+rdx-30H]
|
||||
movaps xmm8, xmm4
|
||||
shufps xmm4, xmm5, 136
|
||||
shufps xmm8, xmm5, 221
|
||||
movaps xmm5, xmm8
|
||||
movups xmm6, xmmword ptr [r8+rdx-20H]
|
||||
movups xmm7, xmmword ptr [r8+rdx-10H]
|
||||
movaps xmm8, xmm6
|
||||
shufps xmm6, xmm7, 136
|
||||
pshufd xmm6, xmm6, 93H
|
||||
shufps xmm8, xmm7, 221
|
||||
pshufd xmm7, xmm8, 93H
|
||||
mov al, 7
|
||||
roundloop1:
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm5
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 93H
|
||||
pshufd xmm3, xmm3, 4EH
|
||||
pshufd xmm2, xmm2, 39H
|
||||
paddd xmm0, xmm6
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm15
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 20
|
||||
psrld xmm11, 12
|
||||
por xmm1, xmm11
|
||||
paddd xmm0, xmm7
|
||||
paddd xmm0, xmm1
|
||||
pxor xmm3, xmm0
|
||||
pshufb xmm3, xmm14
|
||||
paddd xmm2, xmm3
|
||||
pxor xmm1, xmm2
|
||||
movdqa xmm11, xmm1
|
||||
pslld xmm1, 25
|
||||
psrld xmm11, 7
|
||||
por xmm1, xmm11
|
||||
pshufd xmm0, xmm0, 39H
|
||||
pshufd xmm3, xmm3, 4EH
|
||||
pshufd xmm2, xmm2, 93H
|
||||
dec al
|
||||
jz endroundloop1
|
||||
movdqa xmm8, xmm4
|
||||
shufps xmm8, xmm5, 214
|
||||
pshufd xmm9, xmm4, 0FH
|
||||
pshufd xmm4, xmm8, 39H
|
||||
movdqa xmm8, xmm6
|
||||
shufps xmm8, xmm7, 250
|
||||
pblendw xmm9, xmm8, 0CCH
|
||||
movdqa xmm8, xmm7
|
||||
punpcklqdq xmm8, xmm5
|
||||
pblendw xmm8, xmm6, 0C0H
|
||||
pshufd xmm8, xmm8, 78H
|
||||
punpckhdq xmm5, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
pshufd xmm7, xmm6, 1EH
|
||||
movdqa xmm5, xmm9
|
||||
movdqa xmm6, xmm8
|
||||
jmp roundloop1
|
||||
endroundloop1:
|
||||
pxor xmm0, xmm2
|
||||
pxor xmm1, xmm3
|
||||
mov eax, r13d
|
||||
cmp rdx, r15
|
||||
jne innerloop1
|
||||
movups xmmword ptr [rbx], xmm0
|
||||
movups xmmword ptr [rbx+10H], xmm1
|
||||
jmp unwind
|
||||
_blake3_hash_many_sse2 ENDP
|
||||
blake3_hash_many_sse2 ENDP
|
||||
|
||||
;------------------------------------------------------------------------------
; blake3_compress_in_place_sse2(cv, block, block_len, counter, flags)
;
; Win64 calling convention: rcx = cv (8 dwords, updated in place), rdx = the
; 64-byte message block, r8b = block_len, r9 = counter, and flags is the fifth
; argument on the stack (read below at [rsp+0A0H], i.e. the caller's
; [rsp+28H] plus this routine's 120-byte frame).
;
; NOTE(review): started as a copy of the SSE4.1 version (per the commit
; message, the SSE2-only instruction conversion has not been done yet);
; pshufb is SSSE3 and pblendw is SSE4.1, so this routine does not actually
; run on SSE2-only hardware until those are replaced.
;------------------------------------------------------------------------------
blake3_compress_in_place_sse2 PROC
_blake3_compress_in_place_sse2 PROC
; Allocate a frame and save the nonvolatile xmm registers this routine
; clobbers (xmm6-xmm9, xmm11, xmm14, xmm15), as the Win64 ABI requires.
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
; Build the 4x4 state: rows 0-1 (xmm0, xmm1) = chaining value, row 2 (xmm2) =
; IV words, row 3 (xmm3) = (counter_lo, counter_hi, block_len, flags).
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
; Load the message block and de-interleave it into the four word groups
; (xmm4-xmm7) consumed by the round function. shufps imm 136 (88H) keeps
; words 0,2 of each source; 221 (0DDH) keeps words 1,3.
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
; Keep the rotate-by-8 / rotate-by-16 pshufb masks in registers.
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
; Seven rounds; al is the round counter.
mov al, 7
@@:
; Column step: the G function applied lanewise across the four rows. The
; 16-bit and 8-bit rotations use the pshufb masks; the 12-bit and 7-bit
; rotations use shift+or pairs (pslld 20 / psrld 12 and pslld 25 / psrld 7).
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
; Rotate the rows to diagonalize the state, run the diagonal step, then
; rotate back (pshufd 93H/39H rotate lanes, 4EH swaps halves).
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
; Apply the message schedule permutation to xmm4-xmm7 for the next round.
; NOTE(review): pblendw is SSE4.1, not SSE2 - pending conversion.
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0CCH
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0C0H
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
; Finalize: XOR the two state halves together to form the new chaining value
; and write it back over cv.
pxor xmm0, xmm2
pxor xmm1, xmm3
movups xmmword ptr [rcx], xmm0
movups xmmword ptr [rcx+10H], xmm1
; Restore nonvolatile registers and the stack, per the Win64 ABI.
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_in_place_sse2 ENDP
blake3_compress_in_place_sse2 ENDP
|
||||
|
||||
;------------------------------------------------------------------------------
; blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out)
;
; Same state setup and round structure as blake3_compress_in_place_sse2
; above, but cv is read-only and the full 64-byte extended output is written
; to out. out is the sixth argument, on the stack at [rsp+0A8H] after the
; 120-byte frame is allocated; it is loaded into r10 below.
;
; NOTE(review): started as a copy of the SSE4.1 version (per the commit
; message); pshufb (SSSE3) and pblendw (SSE4.1) are not yet converted to
; SSE2-only instructions.
;------------------------------------------------------------------------------
ALIGN 16
blake3_compress_xof_sse2 PROC
_blake3_compress_xof_sse2 PROC
; Allocate a frame and save the nonvolatile xmm registers we clobber.
sub rsp, 120
movdqa xmmword ptr [rsp], xmm6
movdqa xmmword ptr [rsp+10H], xmm7
movdqa xmmword ptr [rsp+20H], xmm8
movdqa xmmword ptr [rsp+30H], xmm9
movdqa xmmword ptr [rsp+40H], xmm11
movdqa xmmword ptr [rsp+50H], xmm14
movdqa xmmword ptr [rsp+60H], xmm15
; Rows 0-1 = chaining value, row 2 = IV words,
; row 3 = (counter_lo, counter_hi, block_len, flags).
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movaps xmm2, xmmword ptr [BLAKE3_IV]
movzx eax, byte ptr [rsp+0A0H]
movzx r8d, r8b
mov r10, qword ptr [rsp+0A8H]
shl rax, 32
add r8, rax
movq xmm3, r9
movq xmm4, r8
punpcklqdq xmm3, xmm4
; De-interleave the message block into xmm4-xmm7 (see compress_in_place).
movups xmm4, xmmword ptr [rdx]
movups xmm5, xmmword ptr [rdx+10H]
movaps xmm8, xmm4
shufps xmm4, xmm5, 136
shufps xmm8, xmm5, 221
movaps xmm5, xmm8
movups xmm6, xmmword ptr [rdx+20H]
movups xmm7, xmmword ptr [rdx+30H]
movaps xmm8, xmm6
shufps xmm6, xmm7, 136
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
; Seven rounds; al is the round counter.
mov al, 7
@@:
; Column step, then diagonal step, exactly as in compress_in_place.
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 93H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 39H
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm15
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 20
psrld xmm11, 12
por xmm1, xmm11
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
pshufb xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
pslld xmm1, 25
psrld xmm11, 7
por xmm1, xmm11
pshufd xmm0, xmm0, 39H
pshufd xmm3, xmm3, 4EH
pshufd xmm2, xmm2, 93H
dec al
jz @F
; Message schedule permutation for the next round.
; NOTE(review): pblendw is SSE4.1, not SSE2 - pending conversion.
movdqa xmm8, xmm4
shufps xmm8, xmm5, 214
pshufd xmm9, xmm4, 0FH
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
pblendw xmm9, xmm8, 0CCH
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
pblendw xmm8, xmm6, 0C0H
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
pshufd xmm7, xmm6, 1EH
movdqa xmm5, xmm9
movdqa xmm6, xmm8
jmp @B
@@:
; Finalize the 64-byte XOF output: the first 32 bytes are the XOR of the two
; state halves; the last 32 bytes are rows 2-3 XORed with the original
; chaining value. All 64 bytes go to out (r10); cv itself is not modified.
movdqu xmm4, xmmword ptr [rcx]
movdqu xmm5, xmmword ptr [rcx+10H]
pxor xmm0, xmm2
pxor xmm1, xmm3
pxor xmm2, xmm4
pxor xmm3, xmm5
movups xmmword ptr [r10], xmm0
movups xmmword ptr [r10+10H], xmm1
movups xmmword ptr [r10+20H], xmm2
movups xmmword ptr [r10+30H], xmm3
; Restore nonvolatile registers and the stack, per the Win64 ABI.
movdqa xmm6, xmmword ptr [rsp]
movdqa xmm7, xmmword ptr [rsp+10H]
movdqa xmm8, xmmword ptr [rsp+20H]
movdqa xmm9, xmmword ptr [rsp+30H]
movdqa xmm11, xmmword ptr [rsp+40H]
movdqa xmm14, xmmword ptr [rsp+50H]
movdqa xmm15, xmmword ptr [rsp+60H]
add rsp, 120
ret
_blake3_compress_xof_sse2 ENDP
blake3_compress_xof_sse2 ENDP
|
||||
|
||||
_TEXT ENDS
|
||||
|
||||
|
||||
; Read-only constant pool shared by the SSE2 routines above.
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN 64
; The first four BLAKE3 IV words, loaded as row 2 of the compression state.
BLAKE3_IV:
dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH

; Per-lane counter offsets (0..3) and the +4 increment for 4-way hashing.
; NOTE(review): their use is in the hash_many prologue, outside this view -
; confirm against the full file.
ADD0:
dd 0, 1, 2, 3

ADD1:
dd 4 dup (4)

; Each IV word broadcast across all four lanes, for the transposed 4-way
; state layout used by hash_many.
BLAKE3_IV_0:
dd 4 dup (6A09E667H)

BLAKE3_IV_1:
dd 4 dup (0BB67AE85H)

BLAKE3_IV_2:
dd 4 dup (3C6EF372H)

BLAKE3_IV_3:
dd 4 dup (0A54FF53AH)

; The 64-byte block length, broadcast (inserted into row 3 via pinsrd).
BLAKE3_BLOCK_LEN:
dd 4 dup (64)

; pshufb byte-shuffle masks rotating each 32-bit lane right by 16 / 8 bits.
; NOTE(review): pshufb is SSSE3, not SSE2; these masks will need replacing
; when the routines are converted to true SSE2 (per the commit intent).
ROT16:
db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13

ROT8:
db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12

; Sign-bit mask: XORed into counter words before pcmpgtd, so the signed
; compare implements the unsigned carry check for the 64-bit block counter
; (see the pxor/pcmpgtd sequence in the hash_many outer loop).
CMP_MSB_MASK:
dd 8 dup(80000000H)

_RDATA ENDS
|
||||
END
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting SSE2.
|
||||
pub unsafe fn compress_in_place(
|
||||
cv: &mut CVWords,
|
||||
block: &[u8; BLOCK_LEN],
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
) {
|
||||
ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
|
||||
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting SSE2.
|
||||
pub unsafe fn compress_xof(
|
||||
cv: &CVWords,
|
||||
block: &[u8; BLOCK_LEN],
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
) -> [u8; 64] {
|
||||
let mut out = [0u8; 64];
|
||||
ffi::blake3_compress_xof_sse2(
|
||||
cv.as_ptr(),
|
||||
block.as_ptr(),
|
||||
block_len,
|
||||
counter,
|
||||
flags,
|
||||
out.as_mut_ptr(),
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
// Unsafe because this may only be called on platforms supporting SSE2.
|
||||
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
|
||||
inputs: &[&A],
|
||||
key: &CVWords,
|
||||
counter: u64,
|
||||
increment_counter: IncrementCounter,
|
||||
flags: u8,
|
||||
flags_start: u8,
|
||||
flags_end: u8,
|
||||
out: &mut [u8],
|
||||
) {
|
||||
// The Rust hash_many implementations do bounds checking on the `out`
|
||||
// array, but the C implementations don't. Even though this is an unsafe
|
||||
// function, assert the bounds here.
|
||||
assert!(out.len() >= inputs.len() * OUT_LEN);
|
||||
ffi::blake3_hash_many_sse2(
|
||||
inputs.as_ptr() as *const *const u8,
|
||||
inputs.len(),
|
||||
A::CAPACITY / BLOCK_LEN,
|
||||
key.as_ptr(),
|
||||
counter,
|
||||
increment_counter.yes(),
|
||||
flags,
|
||||
flags_start,
|
||||
flags_end,
|
||||
out.as_mut_ptr(),
|
||||
)
|
||||
}
|
||||
|
||||
// Raw bindings to the SSE2 routines built from C/assembly. These signatures
// must stay in sync with the prototypes added to c/blake3_impl.h in this
// commit.
pub mod ffi {
    extern "C" {
        // Compresses one block into the 8-word chaining value `cv` in place.
        pub fn blake3_compress_in_place_sse2(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        // Like compress_in_place, but leaves `cv` untouched and writes the
        // full 64-byte extended output to `out`.
        pub fn blake3_compress_xof_sse2(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        // Hashes `num_inputs` inputs of `blocks` blocks each, writing one
        // chaining value per input to `out`. Performs NO bounds checking on
        // `out` - callers must check (see the hash_many wrapper above).
        pub fn blake3_hash_many_sse2(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    // Both tests are no-ops on machines without SSE2 support; the shared
    // test helpers compare against the portable implementation.

    #[test]
    fn test_compress() {
        if crate::platform::sse2_detected() {
            crate::test::test_compress_fn(compress_in_place, compress_xof);
        }
    }

    #[test]
    fn test_hash_many() {
        if crate::platform::sse2_detected() {
            crate::test::test_hash_many_fn(hash_many, hash_many);
        }
    }
}
|
|
@ -94,6 +94,12 @@ mod sse41;
|
|||
#[cfg(blake3_sse41_ffi)]
|
||||
#[path = "ffi_sse41.rs"]
|
||||
mod sse41;
|
||||
#[cfg(blake3_sse2_rust)]
|
||||
#[path = "rust_sse2.rs"]
|
||||
mod sse2;
|
||||
#[cfg(blake3_sse2_ffi)]
|
||||
#[path = "ffi_sse2.rs"]
|
||||
mod sse2;
|
||||
|
||||
pub mod traits;
|
||||
|
||||
|
|
|
@ -41,6 +41,8 @@ cfg_if::cfg_if! {
|
|||
pub enum Platform {
|
||||
Portable,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
SSE2,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
SSE41,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
AVX2,
|
||||
|
@ -68,6 +70,9 @@ impl Platform {
|
|||
if sse41_detected() {
|
||||
return Platform::SSE41;
|
||||
}
|
||||
if sse2_detected() {
|
||||
return Platform::SSE2;
|
||||
}
|
||||
}
|
||||
// We don't use dynamic feature detection for NEON. If the "neon"
|
||||
// feature is on, NEON is assumed to be supported.
|
||||
|
@ -82,6 +87,8 @@ impl Platform {
|
|||
let degree = match self {
|
||||
Platform::Portable => 1,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE2 => 4,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE41 => 4,
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::AVX2 => 8,
|
||||
|
@ -107,6 +114,11 @@ impl Platform {
|
|||
Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE2 => unsafe {
|
||||
crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
|
||||
},
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE41 | Platform::AVX2 => unsafe {
|
||||
crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
|
||||
},
|
||||
|
@ -134,6 +146,11 @@ impl Platform {
|
|||
Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE2 => unsafe {
|
||||
crate::sse2::compress_xof(cv, block, block_len, counter, flags)
|
||||
},
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE41 | Platform::AVX2 => unsafe {
|
||||
crate::sse41::compress_xof(cv, block, block_len, counter, flags)
|
||||
},
|
||||
|
@ -183,6 +200,20 @@ impl Platform {
|
|||
),
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE2 => unsafe {
|
||||
crate::sse2::hash_many(
|
||||
inputs,
|
||||
key,
|
||||
counter,
|
||||
increment_counter,
|
||||
flags,
|
||||
flags_start,
|
||||
flags_end,
|
||||
out,
|
||||
)
|
||||
},
|
||||
// Safe because detect() checked for platform support.
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
Platform::SSE41 => unsafe {
|
||||
crate::sse41::hash_many(
|
||||
inputs,
|
||||
|
@ -247,6 +278,15 @@ impl Platform {
|
|||
Self::Portable
|
||||
}
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub fn sse2() -> Option<Self> {
|
||||
if sse2_detected() {
|
||||
Some(Self::SSE2)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub fn sse41() -> Option<Self> {
|
||||
if sse41_detected() {
|
||||
|
@ -351,6 +391,28 @@ pub fn sse41_detected() -> bool {
|
|||
false
|
||||
}
|
||||
|
||||
/// Reports whether the running CPU supports SSE2.
///
/// Checks, in order: the testing-only `no_sse2` override, a compile-time
/// `target_feature` check (e.g. target-cpu=native, or x86_64 where SSE2 is
/// baseline), and finally a runtime check when `std` is available.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
pub fn sse2_detected() -> bool {
    // Testing-only short-circuit: force the portable fallback.
    if cfg!(feature = "no_sse2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
    }
    false
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
|
||||
let mut out = [0; 8];
|
||||
|
|
|
@ -0,0 +1,766 @@
|
|||
#[cfg(target_arch = "x86")]
|
||||
use core::arch::x86::*;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
use crate::{
|
||||
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
|
||||
OUT_LEN,
|
||||
};
|
||||
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
|
||||
|
||||
/// Number of inputs this implementation hashes in parallel: one 32-bit lane
/// per input in a 128-bit vector.
pub const DEGREE: usize = 4;
|
||||
|
||||
/// Loads 16 bytes from `src` into a vector register. The load is unaligned,
/// so `src` needs no particular alignment — it must just point at 16
/// readable bytes.
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // Unaligned load, so casting an arbitrary byte pointer is fine.
    let p = src as *const __m128i;
    _mm_loadu_si128(p)
}
|
||||
|
||||
/// Stores the 16 bytes of `src` to `dest` with no alignment requirement.
#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // Unaligned store, so casting an arbitrary byte pointer is fine.
    let p = dest as *mut __m128i;
    _mm_storeu_si128(p, src)
}
|
||||
|
||||
/// Lane-wise wrapping addition of four 32-bit words.
#[inline(always)]
unsafe fn add(x: __m128i, y: __m128i) -> __m128i {
    _mm_add_epi32(x, y)
}
|
||||
|
||||
/// Bitwise XOR of two 128-bit vectors.
#[inline(always)]
unsafe fn xor(x: __m128i, y: __m128i) -> __m128i {
    _mm_xor_si128(x, y)
}
|
||||
|
||||
/// Broadcasts one 32-bit word into all four lanes.
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    let signed = x as i32;
    _mm_set1_epi32(signed)
}
|
||||
|
||||
/// Builds a vector from four 32-bit words, `a` in the lowest lane.
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    // _mm_setr_epi32 takes the lanes in memory order (low first).
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}
|
||||
|
||||
// These rotations are the "simple/shifts version". For the
|
||||
// "complicated/shuffles version", see
|
||||
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
|
||||
// For a discussion of the tradeoffs, see
|
||||
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
|
||||
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
|
||||
// on recent x86 chips.
|
||||
|
||||
/// Rotates each 32-bit lane right by 16 bits.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    let hi = _mm_srli_epi32(a, 16);
    let lo = _mm_slli_epi32(a, 32 - 16);
    _mm_or_si128(hi, lo)
}
|
||||
|
||||
/// Rotates each 32-bit lane right by 12 bits.
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    let hi = _mm_srli_epi32(a, 12);
    let lo = _mm_slli_epi32(a, 32 - 12);
    _mm_or_si128(hi, lo)
}
|
||||
|
||||
/// Rotates each 32-bit lane right by 8 bits.
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    let hi = _mm_srli_epi32(a, 8);
    let lo = _mm_slli_epi32(a, 32 - 8);
    _mm_or_si128(hi, lo)
}
|
||||
|
||||
/// Rotates each 32-bit lane right by 7 bits.
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    let hi = _mm_srli_epi32(a, 7);
    let lo = _mm_slli_epi32(a, 32 - 7);
    _mm_or_si128(hi, lo)
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn g1(
|
||||
row0: &mut __m128i,
|
||||
row1: &mut __m128i,
|
||||
row2: &mut __m128i,
|
||||
row3: &mut __m128i,
|
||||
m: __m128i,
|
||||
) {
|
||||
*row0 = add(add(*row0, m), *row1);
|
||||
*row3 = xor(*row3, *row0);
|
||||
*row3 = rot16(*row3);
|
||||
*row2 = add(*row2, *row3);
|
||||
*row1 = xor(*row1, *row2);
|
||||
*row1 = rot12(*row1);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn g2(
|
||||
row0: &mut __m128i,
|
||||
row1: &mut __m128i,
|
||||
row2: &mut __m128i,
|
||||
row3: &mut __m128i,
|
||||
m: __m128i,
|
||||
) {
|
||||
*row0 = add(add(*row0, m), *row1);
|
||||
*row3 = xor(*row3, *row0);
|
||||
*row3 = rot8(*row3);
|
||||
*row2 = add(*row2, *row3);
|
||||
*row1 = xor(*row1, *row2);
|
||||
*row1 = rot7(*row1);
|
||||
}
|
||||
|
||||
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
// Builds the 8-bit immediate for _mm_shuffle_epi32/_mm_shuffle_ps: two bits
// per destination lane, with the highest lane ($z) in the top bit pair.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
|
||||
|
||||
// Two-source lane shuffle: picks two lanes from $a (low half of the result)
// and two from $b (high half) using a _MM_SHUFFLE immediate. There is no
// two-source integer shuffle, so the vectors are reinterpreted as floats for
// _mm_shuffle_ps; the casts are bit-pattern-preserving.
macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}
|
||||
|
||||
// Note the optimization here of leaving row1 as the unrotated row, rather than
|
||||
// row0. All the message loads below are adjusted to compensate for this. See
|
||||
// discussion at https://github.com/sneves/blake2-avx2/pull/4
|
||||
/// Rotates the state rows so that the diagonals line up in columns.
/// The shuffle immediates are _MM_SHUFFLE(2,1,0,3), (1,0,3,2) and
/// (0,3,2,1), written as literals so this helper stands alone.
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, 0b1001_0011); // lanes [a0,a1,a2,a3] -> [a3,a0,a1,a2]
    *row3 = _mm_shuffle_epi32(*row3, 0b0100_1110); // lanes -> [a2,a3,a0,a1]
    *row2 = _mm_shuffle_epi32(*row2, 0b0011_1001); // lanes -> [a1,a2,a3,a0]
}
|
||||
|
||||
/// Inverse of `diagonalize`: rotates the rows back into column order.
/// The shuffle immediates are _MM_SHUFFLE(0,3,2,1), (1,0,3,2) and
/// (2,1,0,3), written as literals so this helper stands alone.
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, 0b0011_1001); // lanes [a0,a1,a2,a3] -> [a1,a2,a3,a0]
    *row3 = _mm_shuffle_epi32(*row3, 0b0100_1110); // lanes -> [a2,a3,a0,a1]
    *row2 = _mm_shuffle_epi32(*row2, 0b1001_0011); // lanes -> [a3,a0,a1,a2]
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn compress_pre(
|
||||
cv: &CVWords,
|
||||
block: &[u8; BLOCK_LEN],
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
) -> [__m128i; 4] {
|
||||
let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
|
||||
let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
|
||||
let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
|
||||
let row3 = &mut set4(
|
||||
counter_low(counter),
|
||||
counter_high(counter),
|
||||
block_len as u32,
|
||||
flags as u32,
|
||||
);
|
||||
|
||||
let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
|
||||
let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
|
||||
let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
|
||||
let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
|
||||
|
||||
let mut t0;
|
||||
let mut t1;
|
||||
let mut t2;
|
||||
let mut t3;
|
||||
let mut tt;
|
||||
|
||||
// Round 1. The first round permutes the message words from the original
|
||||
// input order, into the groups that get mixed in parallel.
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
|
||||
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
|
||||
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 2. This round and all following rounds apply a fixed permutation
|
||||
// to the message words from the round before.
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 3
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 4
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 5
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 6
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
m0 = t0;
|
||||
m1 = t1;
|
||||
m2 = t2;
|
||||
m3 = t3;
|
||||
|
||||
// Round 7
|
||||
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
|
||||
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||
g1(row0, row1, row2, row3, t0);
|
||||
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
|
||||
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
|
||||
t1 = _mm_blend_epi16(tt, t1, 0xCC);
|
||||
g2(row0, row1, row2, row3, t1);
|
||||
diagonalize(row0, row2, row3);
|
||||
t2 = _mm_unpacklo_epi64(m3, m1);
|
||||
tt = _mm_blend_epi16(t2, m2, 0xC0);
|
||||
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
|
||||
g1(row0, row1, row2, row3, t2);
|
||||
t3 = _mm_unpackhi_epi32(m1, m3);
|
||||
tt = _mm_unpacklo_epi32(m2, t3);
|
||||
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
|
||||
g2(row0, row1, row2, row3, t3);
|
||||
undiagonalize(row0, row2, row3);
|
||||
|
||||
[*row0, *row1, *row2, *row3]
|
||||
}
|
||||
|
||||
#[target_feature(enable = "sse2")]
|
||||
pub unsafe fn compress_in_place(
|
||||
cv: &mut CVWords,
|
||||
block: &[u8; BLOCK_LEN],
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
) {
|
||||
let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
|
||||
storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
|
||||
storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
|
||||
}
|
||||
|
||||
#[target_feature(enable = "sse2")]
|
||||
pub unsafe fn compress_xof(
|
||||
cv: &CVWords,
|
||||
block: &[u8; BLOCK_LEN],
|
||||
block_len: u8,
|
||||
counter: u64,
|
||||
flags: u8,
|
||||
) -> [u8; 64] {
|
||||
let [mut row0, mut row1, mut row2, mut row3] =
|
||||
compress_pre(cv, block, block_len, counter, flags);
|
||||
row0 = xor(row0, row2);
|
||||
row1 = xor(row1, row3);
|
||||
row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
|
||||
row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
|
||||
core::mem::transmute([row0, row1, row2, row3])
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
|
||||
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
|
||||
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
|
||||
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
|
||||
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
|
||||
v[0] = add(v[0], v[4]);
|
||||
v[1] = add(v[1], v[5]);
|
||||
v[2] = add(v[2], v[6]);
|
||||
v[3] = add(v[3], v[7]);
|
||||
v[12] = xor(v[12], v[0]);
|
||||
v[13] = xor(v[13], v[1]);
|
||||
v[14] = xor(v[14], v[2]);
|
||||
v[15] = xor(v[15], v[3]);
|
||||
v[12] = rot16(v[12]);
|
||||
v[13] = rot16(v[13]);
|
||||
v[14] = rot16(v[14]);
|
||||
v[15] = rot16(v[15]);
|
||||
v[8] = add(v[8], v[12]);
|
||||
v[9] = add(v[9], v[13]);
|
||||
v[10] = add(v[10], v[14]);
|
||||
v[11] = add(v[11], v[15]);
|
||||
v[4] = xor(v[4], v[8]);
|
||||
v[5] = xor(v[5], v[9]);
|
||||
v[6] = xor(v[6], v[10]);
|
||||
v[7] = xor(v[7], v[11]);
|
||||
v[4] = rot12(v[4]);
|
||||
v[5] = rot12(v[5]);
|
||||
v[6] = rot12(v[6]);
|
||||
v[7] = rot12(v[7]);
|
||||
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
|
||||
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
|
||||
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
|
||||
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
|
||||
v[0] = add(v[0], v[4]);
|
||||
v[1] = add(v[1], v[5]);
|
||||
v[2] = add(v[2], v[6]);
|
||||
v[3] = add(v[3], v[7]);
|
||||
v[12] = xor(v[12], v[0]);
|
||||
v[13] = xor(v[13], v[1]);
|
||||
v[14] = xor(v[14], v[2]);
|
||||
v[15] = xor(v[15], v[3]);
|
||||
v[12] = rot8(v[12]);
|
||||
v[13] = rot8(v[13]);
|
||||
v[14] = rot8(v[14]);
|
||||
v[15] = rot8(v[15]);
|
||||
v[8] = add(v[8], v[12]);
|
||||
v[9] = add(v[9], v[13]);
|
||||
v[10] = add(v[10], v[14]);
|
||||
v[11] = add(v[11], v[15]);
|
||||
v[4] = xor(v[4], v[8]);
|
||||
v[5] = xor(v[5], v[9]);
|
||||
v[6] = xor(v[6], v[10]);
|
||||
v[7] = xor(v[7], v[11]);
|
||||
v[4] = rot7(v[4]);
|
||||
v[5] = rot7(v[5]);
|
||||
v[6] = rot7(v[6]);
|
||||
v[7] = rot7(v[7]);
|
||||
|
||||
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
|
||||
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
|
||||
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
|
||||
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
|
||||
v[0] = add(v[0], v[5]);
|
||||
v[1] = add(v[1], v[6]);
|
||||
v[2] = add(v[2], v[7]);
|
||||
v[3] = add(v[3], v[4]);
|
||||
v[15] = xor(v[15], v[0]);
|
||||
v[12] = xor(v[12], v[1]);
|
||||
v[13] = xor(v[13], v[2]);
|
||||
v[14] = xor(v[14], v[3]);
|
||||
v[15] = rot16(v[15]);
|
||||
v[12] = rot16(v[12]);
|
||||
v[13] = rot16(v[13]);
|
||||
v[14] = rot16(v[14]);
|
||||
v[10] = add(v[10], v[15]);
|
||||
v[11] = add(v[11], v[12]);
|
||||
v[8] = add(v[8], v[13]);
|
||||
v[9] = add(v[9], v[14]);
|
||||
v[5] = xor(v[5], v[10]);
|
||||
v[6] = xor(v[6], v[11]);
|
||||
v[7] = xor(v[7], v[8]);
|
||||
v[4] = xor(v[4], v[9]);
|
||||
v[5] = rot12(v[5]);
|
||||
v[6] = rot12(v[6]);
|
||||
v[7] = rot12(v[7]);
|
||||
v[4] = rot12(v[4]);
|
||||
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
|
||||
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
|
||||
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
|
||||
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
|
||||
v[0] = add(v[0], v[5]);
|
||||
v[1] = add(v[1], v[6]);
|
||||
v[2] = add(v[2], v[7]);
|
||||
v[3] = add(v[3], v[4]);
|
||||
v[15] = xor(v[15], v[0]);
|
||||
v[12] = xor(v[12], v[1]);
|
||||
v[13] = xor(v[13], v[2]);
|
||||
v[14] = xor(v[14], v[3]);
|
||||
v[15] = rot8(v[15]);
|
||||
v[12] = rot8(v[12]);
|
||||
v[13] = rot8(v[13]);
|
||||
v[14] = rot8(v[14]);
|
||||
v[10] = add(v[10], v[15]);
|
||||
v[11] = add(v[11], v[12]);
|
||||
v[8] = add(v[8], v[13]);
|
||||
v[9] = add(v[9], v[14]);
|
||||
v[5] = xor(v[5], v[10]);
|
||||
v[6] = xor(v[6], v[11]);
|
||||
v[7] = xor(v[7], v[8]);
|
||||
v[4] = xor(v[4], v[9]);
|
||||
v[5] = rot7(v[5]);
|
||||
v[6] = rot7(v[6]);
|
||||
v[7] = rot7(v[7]);
|
||||
v[4] = rot7(v[4]);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
|
||||
// Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
|
||||
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
||||
// AVX2 counterparts do.
|
||||
let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|
||||
let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
|
||||
let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
|
||||
let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
|
||||
|
||||
// Interleave 64-bit lanes.
|
||||
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
|
||||
let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
|
||||
let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
|
||||
let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
|
||||
|
||||
vecs[0] = abcd_0;
|
||||
vecs[1] = abcd_1;
|
||||
vecs[2] = abcd_2;
|
||||
vecs[3] = abcd_3;
|
||||
}
|
||||
|
||||
/// Loads one 64-byte block from each of the four inputs at `block_offset`
/// and transposes them into 16 message vectors, each carrying one message
/// word across all four lanes.
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    // Load in 4x4 squares: four consecutive 16-byte chunks of input 0..3.
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    // Prefetch ahead of the current block on each input to hide latency.
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    // Transpose each 4x4 square of vectors in place.
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
|
||||
let mask = if increment_counter.yes() { !0 } else { 0 };
|
||||
(
|
||||
set4(
|
||||
counter_low(counter + (mask & 0)),
|
||||
counter_low(counter + (mask & 1)),
|
||||
counter_low(counter + (mask & 2)),
|
||||
counter_low(counter + (mask & 3)),
|
||||
),
|
||||
set4(
|
||||
counter_high(counter + (mask & 0)),
|
||||
counter_high(counter + (mask & 1)),
|
||||
counter_high(counter + (mask & 2)),
|
||||
counter_high(counter + (mask & 3)),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
/// Hashes four inputs of `blocks` full blocks each in parallel, writing one
/// 32-byte chaining value per input into `out`.
///
/// `counter` is the block counter for lane 0; lanes 1..3 use counter+1..3
/// when `increment_counter` is yes.
///
/// # Safety
/// The caller must ensure the CPU supports SSE2 and that each pointer in
/// `inputs` refers to at least `blocks * BLOCK_LEN` readable bytes.
#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    // Broadcast each key word across all four lanes.
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    // The start flag applies only to the first block of each input.
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        // Feed-forward: the new chaining values are the XOR of the halves.
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        // Only the first block of each input carries the start flag.
        block_flags = flags;
    }

    // Un-transpose so each input's chaining value is contiguous in `out`.
    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
|
||||
|
||||
/// Hashes a single input of whole 64-byte blocks, writing the 32-byte
/// chaining value to `out`. Serial fallback for inputs left over after the
/// 4-way path in `hash_many`.
///
/// # Safety
/// The caller must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
    input: &A,
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    // The start flag applies only to the first block, the end flag only to
    // the last.
    let mut block_flags = flags | flags_start;
    let mut slice = input.as_slice();
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv); // x86 is little-endian
}
|
||||
|
||||
/// Hashes a batch of equal-length inputs, writing one 32-byte chaining
/// value per input into `out`.
///
/// Inputs are processed four at a time through `hash4`; any remainder falls
/// back to the serial `hash1` path. The per-input block count is derived
/// statically from the fixed-size array type `A`.
///
/// # Safety
/// The caller must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
    mut inputs: &[&A],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = A::CAPACITY / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    // Serial fallback for the remaining (fewer than DEGREE) inputs.
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    // Every test bails out when the host CPU lacks SSE2, since the
    // functions under test are compiled with #[target_feature(enable = "sse2")].

    #[test]
    fn test_transpose() {
        if !crate::platform::sse2_detected() {
            return;
        }

        // Wrapper that enables the target feature, so the inline(always)
        // helper can be invoked from this test fn.
        #[target_feature(enable = "sse2")]
        unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
            transpose_vecs(vecs);
        }

        // Fill a 4x4 matrix with distinct values, row-major.
        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        // After transposing, rows and columns swap.
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    // Compare the SSE2 compression functions against the shared reference
    // test vectors.
    #[test]
    fn test_compress() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Exercise both the 4-way and the serial fallback paths of hash_many.
    #[test]
    fn test_hash_many() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
|
|
@ -1,6 +1,7 @@
|
|||
fn main() {
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
{
|
||||
dbg!(is_x86_feature_detected!("sse2"));
|
||||
dbg!(is_x86_feature_detected!("sse4.1"));
|
||||
dbg!(is_x86_feature_detected!("avx2"));
|
||||
dbg!(is_x86_feature_detected!("avx512f"));
|
||||
|
|
Loading…
Reference in New Issue