mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-05-10 04:26:09 +02:00
manually prefetch message blocks
This commit is contained in:
parent
a3147eb909
commit
b8c33e11ef
|
@ -213,6 +213,9 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|||
out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
|
||||
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
||||
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
||||
for(size_t i = 0; i < 8; ++i) {
|
||||
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
||||
}
|
||||
transpose_vecs(&out[0]);
|
||||
transpose_vecs(&out[8]);
|
||||
}
|
||||
|
|
|
@ -467,6 +467,9 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
|
|||
out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
|
||||
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
||||
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
||||
for(size_t i = 0; i < 4; ++i) {
|
||||
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
||||
}
|
||||
transpose_vecs_128(&out[0]);
|
||||
transpose_vecs_128(&out[4]);
|
||||
transpose_vecs_128(&out[8]);
|
||||
|
@ -720,6 +723,9 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
|
|||
out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
|
||||
out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
||||
out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
||||
for(size_t i = 0; i < 8; ++i) {
|
||||
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
||||
}
|
||||
transpose_vecs_256(&out[0]);
|
||||
transpose_vecs_256(&out[8]);
|
||||
}
|
||||
|
@ -1030,6 +1036,9 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|||
out[13] = loadu_512(&inputs[13][block_offset]);
|
||||
out[14] = loadu_512(&inputs[14][block_offset]);
|
||||
out[15] = loadu_512(&inputs[15][block_offset]);
|
||||
for(size_t i = 0; i < 16; ++i) {
|
||||
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
||||
}
|
||||
transpose_vecs_512(out);
|
||||
}
|
||||
|
||||
|
|
|
@ -428,6 +428,9 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|||
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
|
||||
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
||||
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
||||
for(size_t i = 0; i < 4; ++i) {
|
||||
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
||||
}
|
||||
transpose_vecs(&out[0]);
|
||||
transpose_vecs(&out[4]);
|
||||
transpose_vecs(&out[8]);
|
||||
|
|
|
@ -261,6 +261,9 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
|
|||
loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)),
|
||||
loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)),
|
||||
];
|
||||
for i in 0..DEGREE {
|
||||
_mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0);
|
||||
}
|
||||
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE);
|
||||
transpose_vecs(squares.0);
|
||||
transpose_vecs(squares.1);
|
||||
|
|
|
@ -512,6 +512,9 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
|
|||
loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
|
||||
loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
|
||||
];
|
||||
for i in 0..DEGREE {
|
||||
_mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0);
|
||||
}
|
||||
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
|
||||
transpose_vecs(squares.0);
|
||||
transpose_vecs(squares.1);
|
||||
|
|
Loading…
Reference in New Issue