2019-12-03 19:44:30 +01:00
|
|
|
#![cfg_attr(not(feature = "std"), no_std)]
|
|
|
|
|
2019-12-03 17:00:47 +01:00
|
|
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
|
|
|
mod avx2;
|
2019-12-03 19:27:28 +01:00
|
|
|
mod platform;
|
2019-12-02 23:30:55 +01:00
|
|
|
mod portable;
|
2019-12-03 00:02:11 +01:00
|
|
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
|
|
|
mod sse41;
|
2019-12-02 23:30:55 +01:00
|
|
|
#[cfg(test)]
|
|
|
|
mod test;
|
|
|
|
|
2019-12-03 21:18:08 +01:00
|
|
|
use arrayref::array_ref;
|
|
|
|
use arrayvec::ArrayString;
|
2019-12-03 21:46:58 +01:00
|
|
|
use core::cmp;
|
2019-12-03 21:18:08 +01:00
|
|
|
use core::fmt;
|
|
|
|
use platform::Platform;
|
|
|
|
|
2019-12-02 23:30:55 +01:00
|
|
|
/// The default number of bytes in a hash, 32.
pub const OUT_LEN: usize = 32;

/// The number of bytes in a key, 32.
pub const KEY_LEN: usize = 32;

// These are pub for tests and benchmarks. Callers don't need them.
// The number of input bytes consumed by one compression-function block.
#[doc(hidden)]
pub const BLOCK_LEN: usize = 64;
// The number of input bytes per chunk (leaf) of the hash tree.
#[doc(hidden)]
pub const CHUNK_LEN: usize = 2048;
|
|
|
|
|
|
|
|
// The compression function's initialization vector. (These are the same eight
// words used as the SHA-256/BLAKE2s IV.)
const IV: [u32; 8] = [
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
|
|
|
|
|
|
|
|
// The message-word permutation for each of the 7 rounds of the compression
// function: row r lists the order in which the 16 message words are consumed
// in round r. Row 0 is the identity permutation.
const MSG_SCHEDULE: [[usize; 16]; 7] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
    [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
    [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
    [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
    [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
    [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
];
|
|
|
|
|
|
|
|
// Per-lane offset deltas for hashing up to 16 chunks at once: the i-th chunk
// of a batch starts i * CHUNK_LEN bytes past the batch's base offset.
// NOTE(review): presumably consumed by the SIMD many-chunks-at-once
// implementations in the platform modules — confirm against those callers.
const CHUNK_OFFSET_DELTAS: &[u64; 16] = &[
    CHUNK_LEN as u64 * 0,
    CHUNK_LEN as u64 * 1,
    CHUNK_LEN as u64 * 2,
    CHUNK_LEN as u64 * 3,
    CHUNK_LEN as u64 * 4,
    CHUNK_LEN as u64 * 5,
    CHUNK_LEN as u64 * 6,
    CHUNK_LEN as u64 * 7,
    CHUNK_LEN as u64 * 8,
    CHUNK_LEN as u64 * 9,
    CHUNK_LEN as u64 * 10,
    CHUNK_LEN as u64 * 11,
    CHUNK_LEN as u64 * 12,
    CHUNK_LEN as u64 * 13,
    CHUNK_LEN as u64 * 14,
    CHUNK_LEN as u64 * 15,
];
|
|
|
|
|
|
|
|
// The all-zero counterpart of CHUNK_OFFSET_DELTAS: parent nodes apparently do
// not advance the offset per lane. NOTE(review): confirm in platform callers.
const PARENT_OFFSET_DELTAS: &[u64; 16] = &[0; 16];
|
|
|
|
|
|
|
|
// These are the internal flags that we use to domain separate root/non-root,
// chunk/parent, and chunk beginning/middle/end. These get set at the high end
// of the block flags word in the compression function, so their values start
// high and go down.
bitflags::bitflags! {
    struct Flags: u8 {
        // Set only for the first block compressed in a chunk.
        const CHUNK_START = 1 << 0;
        // Set only for the last block compressed in a chunk.
        const CHUNK_END = 1 << 1;
        // Set for parent (non-leaf) nodes of the tree.
        const PARENT = 1 << 2;
        // OR'd in when finalizing the root node.
        const ROOT = 1 << 3;
        // Domain-separation flag for keyed hashing mode.
        const KEYED_HASH = 1 << 4;
        // Domain-separation flag for key-derivation mode.
        const DERIVE_KEY = 1 << 5;
    }
}
|
|
|
|
|
|
|
|
/// The low 32 bits of a 64-bit input offset, as fed to the compression
/// function's low offset word.
fn offset_low(offset: u64) -> u32 {
    (offset & 0xFFFF_FFFF) as u32
}
|
|
|
|
|
|
|
|
/// The high 32 bits of a 64-bit input offset, as fed to the compression
/// function's high offset word.
fn offset_high(offset: u64) -> u32 {
    let high_word = offset >> 32;
    high_word as u32
}
|
2019-12-03 19:34:12 +01:00
|
|
|
|
|
|
|
/// A BLAKE3 output of the default size, 32 bytes, which implements
/// constant-time equality.
// Debug is deliberately not derived; the manual impl renders hex.
#[derive(Clone, Copy)]
pub struct Hash([u8; OUT_LEN]);
|
|
|
|
|
|
|
|
impl Hash {
|
|
|
|
pub fn as_bytes(&self) -> &[u8; OUT_LEN] {
|
|
|
|
&self.0
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn to_hex(&self) -> ArrayString<[u8; 2 * OUT_LEN]> {
|
|
|
|
let mut s = ArrayString::new();
|
|
|
|
let table = b"0123456789abcdef";
|
|
|
|
for &b in self.0.iter() {
|
|
|
|
s.push(table[(b >> 4) as usize] as char);
|
|
|
|
s.push(table[(b & 0xf) as usize] as char);
|
|
|
|
}
|
|
|
|
s
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allow constructing a Hash directly from a raw 32-byte array.
impl From<[u8; OUT_LEN]> for Hash {
    fn from(bytes: [u8; OUT_LEN]) -> Self {
        Self(bytes)
    }
}
|
|
|
|
|
|
|
|
// Allow extracting the raw 32 bytes by value.
impl From<Hash> for [u8; OUT_LEN] {
    fn from(hash: Hash) -> Self {
        hash.0
    }
}
|
|
|
|
|
|
|
|
/// This implementation is constant-time.
impl PartialEq for Hash {
    fn eq(&self, other: &Hash) -> bool {
        // Defer to the constant_time_eq crate so the comparison's timing does
        // not leak how many leading bytes matched.
        constant_time_eq::constant_time_eq(&self.0[..], &other.0[..])
    }
}
|
|
|
|
|
|
|
|
/// This implementation is constant-time.
impl PartialEq<[u8; OUT_LEN]> for Hash {
    fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
        // Same constant-time comparison as PartialEq<Hash>, for comparing
        // against a plain byte array.
        constant_time_eq::constant_time_eq(&self.0[..], other)
    }
}
|
|
|
|
|
|
|
|
// Byte-wise equality is a total equivalence relation, so Eq is sound.
impl Eq for Hash {}
|
|
|
|
|
|
|
|
// Manual Debug impl: print the hash as hex rather than a raw byte list.
impl fmt::Debug for Hash {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Hash(0x{})", self.to_hex())
    }
}
|
2019-12-03 21:18:16 +01:00
|
|
|
|
|
|
|
// Each chunk or parent node can produce either a 32-byte chaining value or, by
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
struct Output {
    // Chaining value fed into the final compression.
    input_chaining_value: [u8; 32],
    // The node's final message block.
    block: [u8; 64],
    // Number of meaningful bytes in `block`.
    block_len: u8,
    // This node's input offset; expected to be 0 when finalizing as root.
    offset: u64,
    // Flags for the final compression; ROOT is OR'd in by the root_* methods.
    flags: Flags,
    // Dispatch handle to the selected compression implementation.
    platform: Platform,
}
|
|
|
|
|
|
|
|
impl Output {
    // Compress once more (without ROOT) to produce this node's 32-byte
    // chaining value for its parent.
    fn chaining_value(&self) -> [u8; 32] {
        let out = self.platform.compress(
            &self.input_chaining_value,
            &self.block,
            self.block_len,
            self.offset,
            self.flags,
        );
        // The compression output is wider than a chaining value; keep the
        // first 32 bytes.
        *array_ref!(out, 0, 32)
    }

    // Finalize this node as the root, producing the default 32-byte hash.
    fn root_hash(&self) -> Hash {
        // The root node always sits at input offset 0.
        debug_assert_eq!(self.offset, 0);
        let out = self.platform.compress(
            &self.input_chaining_value,
            &self.block,
            self.block_len,
            0,
            self.flags | Flags::ROOT,
        );
        Hash(*array_ref!(out, 0, 32))
    }

    // Extended output: fill `out_slice` with root output bytes, re-running the
    // final compression with the offset words repurposed as a block counter.
    fn root_output_bytes(&self, out_slice: &mut [u8]) {
        let mut offset = 0;
        // Each compression yields 2 * OUT_LEN = 64 output bytes.
        for out_block in out_slice.chunks_mut(2 * OUT_LEN) {
            let out_bytes = self.platform.compress(
                &self.input_chaining_value,
                &self.block,
                self.block_len,
                offset,
                self.flags | Flags::ROOT,
            );
            // The last chunk of out_slice may be short; truncate to fit.
            out_block.copy_from_slice(&out_bytes[..out_block.len()]);
            offset += 2 * OUT_LEN as u64;
        }
    }
}
|
2019-12-03 21:46:58 +01:00
|
|
|
|
|
|
|
// Incremental state for hashing one chunk (up to CHUNK_LEN input bytes),
// one BLOCK_LEN block at a time.
#[derive(Clone)]
struct ChunkState {
    // Current chaining value, updated after each compressed block.
    cv: [u8; 32],
    // This chunk's offset within the whole input.
    offset: u64,
    // Buffer holding a partial (or the final) block.
    buf: [u8; BLOCK_LEN],
    // Number of meaningful bytes in `buf`.
    buf_len: u8,
    // How many full blocks have been compressed so far.
    blocks_compressed: u8,
    // Base flags (e.g. KEYED_HASH); per-block flags are OR'd in as needed.
    flags: Flags,
    // Dispatch handle to the selected compression implementation.
    platform: Platform,
}
|
|
|
|
|
|
|
|
impl ChunkState {
|
|
|
|
fn new(key: &[u8; 32], offset: u64, flags: Flags, platform: Platform) -> Self {
|
|
|
|
Self {
|
|
|
|
cv: *key,
|
|
|
|
offset: 0,
|
|
|
|
buf: [0; BLOCK_LEN],
|
|
|
|
buf_len: 0,
|
|
|
|
blocks_compressed: 0,
|
|
|
|
flags,
|
|
|
|
platform,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn len(&self) -> u64 {
|
|
|
|
BLOCK_LEN as u64 * self.blocks_compressed as u64 + self.buf_len as u64
|
|
|
|
}
|
|
|
|
|
|
|
|
fn fill_buf(&mut self, input: &mut &[u8]) {
|
|
|
|
let want = BLOCK_LEN - self.buf_len as usize;
|
|
|
|
let take = cmp::min(want, input.len());
|
|
|
|
self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
|
|
|
|
self.buf_len += take as u8;
|
|
|
|
*input = &input[take..];
|
|
|
|
}
|
|
|
|
|
|
|
|
fn start_flag(&self) -> Flags {
|
|
|
|
if self.blocks_compressed == 0 {
|
|
|
|
Flags::CHUNK_START
|
|
|
|
} else {
|
|
|
|
Flags::empty()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to avoid buffering as much as possible, by compressing directly from
|
|
|
|
// the input slice when full blocks are available.
|
|
|
|
fn update(&mut self, mut input: &[u8]) {
|
|
|
|
if self.buf_len > 0 {
|
|
|
|
self.fill_buf(&mut input);
|
|
|
|
if !input.is_empty() {
|
|
|
|
debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
|
|
|
|
let block_flags = self.flags | self.start_flag(); // borrowck
|
|
|
|
self.platform.compress(
|
|
|
|
&mut self.cv,
|
|
|
|
&self.buf,
|
|
|
|
BLOCK_LEN as u8,
|
|
|
|
self.offset,
|
|
|
|
block_flags,
|
|
|
|
);
|
|
|
|
self.buf_len = 0;
|
|
|
|
self.buf = [0; BLOCK_LEN];
|
|
|
|
self.blocks_compressed += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while input.len() > BLOCK_LEN {
|
|
|
|
debug_assert_eq!(self.buf_len, 0);
|
|
|
|
let block_flags = self.flags | self.start_flag(); // borrowck
|
|
|
|
self.platform.compress(
|
|
|
|
&mut self.cv,
|
|
|
|
array_ref!(input, 0, BLOCK_LEN),
|
|
|
|
BLOCK_LEN as u8,
|
|
|
|
self.offset,
|
|
|
|
block_flags,
|
|
|
|
);
|
|
|
|
self.blocks_compressed += 1;
|
|
|
|
input = &input[BLOCK_LEN..];
|
|
|
|
}
|
|
|
|
|
|
|
|
self.fill_buf(&mut input);
|
|
|
|
debug_assert!(input.is_empty());
|
|
|
|
debug_assert!(self.len() <= CHUNK_LEN as u64);
|
|
|
|
}
|
|
|
|
|
|
|
|
fn finalize(&self) -> Output {
|
|
|
|
let block_flags = self.flags | self.start_flag() | Flags::CHUNK_END;
|
|
|
|
Output {
|
|
|
|
input_chaining_value: self.cv,
|
|
|
|
block: self.buf,
|
|
|
|
block_len: self.buf_len,
|
|
|
|
offset: self.offset,
|
|
|
|
flags: block_flags,
|
|
|
|
platform: self.platform,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for ChunkState {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Deliberately prints only lengths and metadata — never the chaining
        // value or buffered input bytes.
        write!(
            f,
            "ChunkState {{ len: {}, offset: {}, flags: {:?}, platform: {:?} }}",
            self.len(),
            self.offset,
            self.flags,
            self.platform
        )
    }
}
|
2019-12-03 22:23:19 +01:00
|
|
|
|
|
|
|
// IMPLEMENTATION NOTE
|
|
|
|
// ===================
|
|
|
|
// hash_subtree() is the basis of high-performance BLAKE3. We use it both for
|
|
|
|
// all-at-once hashing, and for the incremental input with Hasher (though we
|
|
|
|
// have to be careful with subtree boundaries in the incremental case).
|
|
|
|
// hash_subtree() applies several optimizations at the same time:
|
|
|
|
// - Multi-threading with Rayon.
|
|
|
|
// - Parallel chunk hashing with SIMD.
|
|
|
|
// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing
|
|
|
|
// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues
|
|
|
|
// to benefit from larger inputs, because more levels of the tree can
|
|
|
|
// use full-width SIMD vectors for parent hashing. Without parallel parent
|
|
|
|
// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
|