mirror of
https://github.com/BLAKE3-team/BLAKE3
synced 2024-09-26 08:50:39 +02:00
replace the new file module with inherent methods on Hasher
New methods: - update_reader - update_mmap - update_mmap_rayon These are more discoverable and more convenient to use. There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_mmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions.
This commit is contained in:
parent
e0bb915641
commit
6fca60d650
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -59,6 +59,10 @@ jobs:
|
||||
run: cargo test --features=rayon,traits-preview,zeroize
|
||||
env:
|
||||
RAYON_NUM_THREADS: 1
|
||||
# The mmap feature by itself (update_mmap_rayon is omitted).
|
||||
- run: cargo test --features=mmap
|
||||
# All public features put together.
|
||||
- run: cargo test --features=mmap,rayon,traits-preview,zeroize
|
||||
# no_std tests.
|
||||
- run: cargo test --no-default-features
|
||||
|
||||
|
21
Cargo.toml
21
Cargo.toml
@ -25,11 +25,17 @@ neon = []
|
||||
# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
|
||||
std = []
|
||||
|
||||
# The "rayon" feature (defined below as an optional dependency) enables the
|
||||
# `Hasher::update_rayon` method, for multithreaded hashing. However, even if
|
||||
# this feature is enabled, all other APIs remain single-threaded.
|
||||
# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the
|
||||
# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon`
|
||||
# methods, for multithreaded hashing. However, even if this feature is enabled,
|
||||
# all other APIs remain single-threaded.
|
||||
rayon = ["dep:rayon", "std"]
|
||||
|
||||
# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the
|
||||
# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon`
|
||||
# helper methods for memory-mapped IO.
|
||||
mmap = ["std", "dep:memmap2"]
|
||||
|
||||
# Implement the zeroize::Zeroize trait for types in this crate.
|
||||
zeroize = ["dep:zeroize", "arrayvec/zeroize"]
|
||||
|
||||
@ -81,11 +87,9 @@ no_avx2 = []
|
||||
no_avx512 = []
|
||||
no_neon = []
|
||||
|
||||
file = ["memmap2", "rayon", "std"]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
# Document Hasher::update_rayon on docs.rs.
|
||||
features = ["rayon", "zeroize"]
|
||||
# Document the rayon/mmap methods and the Zeroize impls on docs.rs.
|
||||
features = ["mmap", "rayon", "zeroize"]
|
||||
|
||||
[dependencies]
|
||||
arrayref = "0.3.5"
|
||||
@ -98,12 +102,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive
|
||||
memmap2 = { version = "0.7.1", optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
hmac = "0.12.0"
|
||||
hex = "0.4.2"
|
||||
page_size = "0.6.0"
|
||||
rand = "0.8.0"
|
||||
rand_chacha = "0.3.0"
|
||||
reference_impl = { path = "./reference_impl" }
|
||||
hmac = "0.12.0"
|
||||
tempfile = "3.8.0"
|
||||
|
||||
[build-dependencies]
|
||||
cc = "1.0.4"
|
||||
|
24
b3sum/Cargo.lock
generated
24
b3sum/Cargo.lock
generated
@ -18,9 +18,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.2"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea"
|
||||
checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
@ -131,9 +131,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.4.2"
|
||||
version = "4.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6"
|
||||
checksum = "84ed82781cea27b43c9b106a979fe450a13a31aab0500595fb3fc06616de08e6"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
@ -307,9 +307,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.147"
|
||||
version = "0.2.148"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
|
||||
checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
@ -369,9 +369,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.66"
|
||||
version = "1.0.67"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@ -467,9 +467,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.32"
|
||||
version = "2.0.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
|
||||
checksum = "91e02e55d62894af2a08aca894c6577281f76769ba47c94d5756bec8ac6e7373"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -501,9 +501,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.11"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
|
||||
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
|
@ -15,7 +15,7 @@ pure = ["blake3/pure"]
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.25"
|
||||
blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
|
||||
blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
|
||||
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
|
||||
hex = "0.4.0"
|
||||
memmap2 = "0.7.0"
|
||||
|
@ -163,73 +163,22 @@ impl Args {
|
||||
}
|
||||
}
|
||||
|
||||
enum Input {
|
||||
Mmap(io::Cursor<memmap2::Mmap>),
|
||||
File(File),
|
||||
Stdin,
|
||||
}
|
||||
|
||||
impl Input {
|
||||
// Open an input file, using mmap if appropriate. "-" means stdin. Note
|
||||
// that this convention applies both to command line arguments, and to
|
||||
// filepaths that appear in a checkfile.
|
||||
fn open(path: &Path, args: &Args) -> Result<Self> {
|
||||
if path == Path::new("-") {
|
||||
if args.keyed() {
|
||||
bail!("Cannot open `-` in keyed mode");
|
||||
}
|
||||
return Ok(Self::Stdin);
|
||||
}
|
||||
let file = File::open(path)?;
|
||||
if !args.no_mmap() {
|
||||
if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
|
||||
return Ok(Self::Mmap(io::Cursor::new(mmap)));
|
||||
}
|
||||
}
|
||||
Ok(Self::File(file))
|
||||
}
|
||||
|
||||
fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
|
||||
let mut hasher = args.base_hasher.clone();
|
||||
match self {
|
||||
// The fast path: If we mmapped the file successfully, hash using
|
||||
// multiple threads. This doesn't work on stdin, or on some files,
|
||||
// and it can also be disabled with --no-mmap.
|
||||
Self::Mmap(cursor) => {
|
||||
hasher.update_rayon(cursor.get_ref());
|
||||
}
|
||||
// The slower paths, for stdin or files we didn't/couldn't mmap.
|
||||
// This is currently all single-threaded. Doing multi-threaded
|
||||
// hashing without memory mapping is tricky, since all your worker
|
||||
// threads have to stop every time you refill the buffer, and that
|
||||
// ends up being a lot of overhead. To solve that, we need a more
|
||||
// complicated double-buffering strategy where a background thread
|
||||
// fills one buffer while the worker threads are hashing the other
|
||||
// one. We might implement that in the future, but since this is
|
||||
// the slow path anyway, it's not high priority.
|
||||
Self::File(file) => {
|
||||
blake3::copy_wide(file, &mut hasher)?;
|
||||
}
|
||||
Self::Stdin => {
|
||||
let stdin = io::stdin();
|
||||
let lock = stdin.lock();
|
||||
blake3::copy_wide(lock, &mut hasher)?;
|
||||
}
|
||||
}
|
||||
let mut output_reader = hasher.finalize_xof();
|
||||
output_reader.set_position(args.seek());
|
||||
Ok(output_reader)
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for Input {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Mmap(cursor) => cursor.read(buf),
|
||||
Self::File(file) => file.read(buf),
|
||||
Self::Stdin => io::stdin().read(buf),
|
||||
fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
|
||||
let mut hasher = args.base_hasher.clone();
|
||||
if path == Path::new("-") {
|
||||
if args.keyed() {
|
||||
bail!("Cannot open `-` in keyed mode");
|
||||
}
|
||||
hasher.update_reader(io::stdin().lock())?;
|
||||
} else if args.no_mmap() {
|
||||
hasher.update_reader(File::open(path)?)?;
|
||||
} else {
|
||||
// The fast path: Try to mmap the file and hash it with multiple threads.
|
||||
hasher.update_mmap_rayon(path)?;
|
||||
}
|
||||
let mut output_reader = hasher.finalize_xof();
|
||||
output_reader.set_position(args.seek());
|
||||
Ok(output_reader)
|
||||
}
|
||||
|
||||
fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
|
||||
@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
|
||||
}
|
||||
|
||||
fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
|
||||
let mut input = Input::open(path, args)?;
|
||||
let output = input.hash(args)?;
|
||||
let output = hash_path(args, path)?;
|
||||
if args.raw() {
|
||||
write_raw_output(output, args)?;
|
||||
return Ok(());
|
||||
@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
|
||||
} else {
|
||||
file_string
|
||||
};
|
||||
let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
|
||||
.and_then(|mut input| input.hash(args))
|
||||
.map(|mut hash_output| {
|
||||
let found_hash: blake3::Hash;
|
||||
match hash_path(args, &file_path) {
|
||||
Ok(mut output) => {
|
||||
let mut found_hash_bytes = [0; blake3::OUT_LEN];
|
||||
hash_output.fill(&mut found_hash_bytes);
|
||||
found_hash_bytes.into()
|
||||
});
|
||||
let found_hash: blake3::Hash = match hash_result {
|
||||
Ok(hash) => hash,
|
||||
output.fill(&mut found_hash_bytes);
|
||||
found_hash = found_hash_bytes.into();
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{}: FAILED ({})", file_string, e);
|
||||
return false;
|
||||
@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
|
||||
}
|
||||
|
||||
fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
|
||||
let checkfile_input = Input::open(path, args)?;
|
||||
let mut bufreader = io::BufReader::new(checkfile_input);
|
||||
let mut file;
|
||||
let stdin;
|
||||
let mut stdin_lock;
|
||||
let mut bufreader: io::BufReader<&mut dyn Read>;
|
||||
if path == Path::new("-") {
|
||||
stdin = io::stdin();
|
||||
stdin_lock = stdin.lock();
|
||||
bufreader = io::BufReader::new(&mut stdin_lock);
|
||||
} else {
|
||||
file = File::open(path)?;
|
||||
bufreader = io::BufReader::new(&mut file);
|
||||
}
|
||||
let mut line = String::new();
|
||||
loop {
|
||||
line.clear();
|
||||
|
67
src/file.rs
67
src/file.rs
@ -1,67 +0,0 @@
|
||||
//! The file-related utilities.
|
||||
//!
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```no_run
|
||||
//! use std::io;
|
||||
//!
|
||||
//! use blake3::file::hash_path_maybe_mmap;
|
||||
//!
|
||||
//! fn main() -> io::Result<()> {
|
||||
//! let args: Vec<_> = std::env::args_os().collect();
|
||||
//! assert_eq!(args.len(), 2);
|
||||
//! let path = &args[1];
|
||||
//! let mut hasher = blake3::Hasher::new();
|
||||
//! hash_path_maybe_mmap(&mut hasher, path)?;
|
||||
//! println!("{}", hasher.finalize());
|
||||
//! Ok(())
|
||||
//! }
|
||||
//! ```
|
||||
|
||||
use std::{fs::File, io, path::Path};
|
||||
|
||||
/// Mmap a file, if it looks like a good idea. Return None in cases where we
|
||||
/// know mmap will fail, or if the file is short enough that mmapping isn't
|
||||
/// worth it. However, if we do try to mmap and it fails, return the error.
|
||||
pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
|
||||
let metadata = file.metadata()?;
|
||||
let file_size = metadata.len();
|
||||
#[allow(clippy::if_same_then_else)]
|
||||
if !metadata.is_file() {
|
||||
// Not a real file.
|
||||
Ok(None)
|
||||
} else if file_size > isize::max_value() as u64 {
|
||||
// Too long to safely map.
|
||||
// https://github.com/danburkert/memmap-rs/issues/69
|
||||
Ok(None)
|
||||
} else if file_size == 0 {
|
||||
// Mapping an empty file currently fails.
|
||||
// https://github.com/danburkert/memmap-rs/issues/72
|
||||
Ok(None)
|
||||
} else if file_size < 16 * 1024 {
|
||||
// Mapping small files is not worth it.
|
||||
Ok(None)
|
||||
} else {
|
||||
// Explicitly set the length of the memory map, so that filesystem
|
||||
// changes can't race to violate the invariants we just checked.
|
||||
let map = unsafe {
|
||||
memmap2::MmapOptions::new()
|
||||
.len(file_size as usize)
|
||||
.map(file)?
|
||||
};
|
||||
Ok(Some(map))
|
||||
}
|
||||
}
|
||||
|
||||
/// Hash a file fast.
|
||||
///
|
||||
/// It may use mmap if the file is big enough. If not, it will read the whole file into a buffer.
|
||||
pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
|
||||
let file = File::open(path.as_ref())?;
|
||||
if let Some(mmap) = maybe_memmap_file(&file)? {
|
||||
hasher.update_rayon(&mmap);
|
||||
} else {
|
||||
crate::copy_wide(&file, hasher)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
79
src/io.rs
Normal file
79
src/io.rs
Normal file
@ -0,0 +1,79 @@
|
||||
//! Helper functions for efficient IO.
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
pub(crate) fn copy_wide(
|
||||
mut reader: impl std::io::Read,
|
||||
hasher: &mut crate::Hasher,
|
||||
) -> std::io::Result<u64> {
|
||||
let mut buffer = [0; 65536];
|
||||
let mut total = 0;
|
||||
loop {
|
||||
match reader.read(&mut buffer) {
|
||||
Ok(0) => return Ok(total),
|
||||
Ok(n) => {
|
||||
hasher.update(&buffer[..n]);
|
||||
total += n as u64;
|
||||
}
|
||||
// see test_update_reader_interrupted
|
||||
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
|
||||
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
|
||||
// fails, return the error.
|
||||
//
|
||||
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
|
||||
// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
|
||||
// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
|
||||
// this function is crate-private, we can guarantee that all can ever happen in the event of a race
|
||||
// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
|
||||
// which should risk memory corruption in a safe caller.
|
||||
//
|
||||
// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
|
||||
// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
|
||||
// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
|
||||
// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
|
||||
// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
|
||||
// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
|
||||
// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
|
||||
// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
|
||||
// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
|
||||
// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
|
||||
// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
|
||||
// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
|
||||
// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
|
||||
// different here. Feedback needed.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
|
||||
let metadata = file.metadata()?;
|
||||
let file_size = metadata.len();
|
||||
#[allow(clippy::if_same_then_else)]
|
||||
if !metadata.is_file() {
|
||||
// Not a real file.
|
||||
Ok(None)
|
||||
} else if file_size > isize::max_value() as u64 {
|
||||
// Too long to safely map.
|
||||
// https://github.com/danburkert/memmap-rs/issues/69
|
||||
Ok(None)
|
||||
} else if file_size == 0 {
|
||||
// Mapping an empty file currently fails.
|
||||
// https://github.com/danburkert/memmap-rs/issues/72
|
||||
// See test_mmap_virtual_file.
|
||||
Ok(None)
|
||||
} else if file_size < 16 * 1024 {
|
||||
// Mapping small files is not worth it.
|
||||
Ok(None)
|
||||
} else {
|
||||
// Explicitly set the length of the memory map, so that filesystem
|
||||
// changes can't race to violate the invariants we just checked.
|
||||
let map = unsafe {
|
||||
memmap2::MmapOptions::new()
|
||||
.len(file_size as usize)
|
||||
.map(file)?
|
||||
};
|
||||
Ok(Some(map))
|
||||
}
|
||||
}
|
282
src/lib.rs
282
src/lib.rs
@ -33,18 +33,28 @@
|
||||
//! # Cargo Features
|
||||
//!
|
||||
//! The `std` feature (the only feature enabled by default) is required for
|
||||
//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime
|
||||
//! CPU feature detection on x86. If this feature is disabled, the only way to
|
||||
//! use the x86 SIMD implementations is to enable the corresponding instruction
|
||||
//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting
|
||||
//! binary will not be portable to other machines.
|
||||
//! implementations of the [`Write`] and [`Seek`] traits, the
|
||||
//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
|
||||
//! feature detection on x86. If this feature is disabled, the only way to use
|
||||
//! the x86 SIMD implementations is to enable the corresponding instruction sets
|
||||
//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary
|
||||
//! will not be portable to other machines.
|
||||
//!
|
||||
//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds
|
||||
//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However,
|
||||
//! even if this feature is enabled, all other APIs remain single-threaded.
|
||||
//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap`
|
||||
//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for
|
||||
//! multithreaded hashing. However, even if this feature is enabled, all other
|
||||
//! APIs remain single-threaded.
|
||||
//!
|
||||
//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) implements
|
||||
//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for this crate's types.
|
||||
//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the
|
||||
//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above)
|
||||
//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for
|
||||
//! memory-mapped IO.
|
||||
//!
|
||||
//! The `zeroize` feature (disabled by default, but enabled for [docs.rs])
|
||||
//! implements
|
||||
//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for
|
||||
//! this crate's types.
|
||||
//!
|
||||
//! The NEON implementation is enabled by default for AArch64 but requires the
|
||||
//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and
|
||||
@ -52,12 +62,12 @@
|
||||
//! without NEON support.
|
||||
//!
|
||||
//! The `traits-preview` feature enables implementations of traits from the
|
||||
//! RustCrypto [`digest`] crate, and re-exports that crate as
|
||||
//! `traits::digest`. However, the traits aren't stable, and they're expected to
|
||||
//! change in incompatible ways before that crate reaches 1.0. For that reason,
|
||||
//! this crate makes no SemVer guarantees for this feature, and callers who use
|
||||
//! it should expect breaking changes between patch versions. (The "-preview"
|
||||
//! feature name follows the conventions of the RustCrypto [`signature`] crate.)
|
||||
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
|
||||
//! However, the traits aren't stable, and they're expected to change in
|
||||
//! incompatible ways before that crate reaches 1.0. For that reason, this crate
|
||||
//! makes no SemVer guarantees for this feature, and callers who use it should
|
||||
//! expect breaking changes between patch versions. (The "-preview" feature name
|
||||
//! follows the conventions of the RustCrypto [`signature`] crate.)
|
||||
//!
|
||||
//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon
|
||||
//! [BLAKE3]: https://blake3.io
|
||||
@ -115,9 +125,7 @@ mod sse41;
|
||||
#[cfg(feature = "traits-preview")]
|
||||
pub mod traits;
|
||||
|
||||
#[cfg(feature = "file")]
|
||||
pub mod file;
|
||||
|
||||
mod io;
|
||||
mod join;
|
||||
|
||||
use arrayref::{array_mut_ref, array_ref};
|
||||
@ -914,6 +922,15 @@ fn parent_node_output(
|
||||
|
||||
/// An incremental hash state that can accept any number of writes.
|
||||
///
|
||||
/// **Performance note:** The [`update`](#method.update) method can't take full
|
||||
/// advantage of SIMD optimizations if its input buffer is too small or oddly
|
||||
/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
|
||||
/// supported SIMD instruction sets. See also
|
||||
/// [`update_reader`](Hasher::update_reader).
|
||||
///
|
||||
/// The `rayon` and `mmap` Cargo features enable additional methods on this
|
||||
/// type related to multithreading and memory-mapped IO.
|
||||
///
|
||||
/// When the `traits-preview` Cargo feature is enabled, this type implements
|
||||
/// several commonly used traits from the
|
||||
/// [`digest`](https://crates.io/crates/digest) crate. However, those
|
||||
@ -922,15 +939,6 @@ fn parent_node_output(
|
||||
/// guarantees for this feature, and callers who use it should expect breaking
|
||||
/// changes between patch versions.
|
||||
///
|
||||
/// When the `rayon` Cargo feature is enabled, the
|
||||
/// [`update_rayon`](#method.update_rayon) method is available for multithreaded
|
||||
/// hashing.
|
||||
///
|
||||
/// **Performance note:** The [`update`](#method.update) method can't take full
|
||||
/// advantage of SIMD optimizations if its input buffer is too small or oddly
|
||||
/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
|
||||
/// supported SIMD instruction sets.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
@ -1099,30 +1107,6 @@ impl Hasher {
|
||||
self.update_with_join::<join::SerialJoin>(input)
|
||||
}
|
||||
|
||||
/// Identical to [`update`](Hasher::update), but using Rayon-based
|
||||
/// multithreading internally.
|
||||
///
|
||||
/// This method is gated by the `rayon` Cargo feature, which is disabled by
|
||||
/// default but enabled on [docs.rs](https://docs.rs).
|
||||
///
|
||||
/// To get any performance benefit from multithreading, the input buffer
|
||||
/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
|
||||
/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
|
||||
/// quite a lot across different processors, and it's important to benchmark
|
||||
/// your specific use case.
|
||||
///
|
||||
/// Memory mapping an entire input file is a simple way to take advantage of
|
||||
/// multithreading without needing to carefully tune your buffer size or
|
||||
/// offload IO. However, on spinning disks where random access is expensive,
|
||||
/// that approach can lead to disk thrashing and terrible IO performance.
|
||||
/// Note that OS page caching can mask this problem, in which case it might
|
||||
/// only appear for files larger than available RAM. Again, benchmarking
|
||||
/// your specific use case is important.
|
||||
#[cfg(feature = "rayon")]
|
||||
pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
|
||||
self.update_with_join::<join::RayonJoin>(input)
|
||||
}
|
||||
|
||||
fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
|
||||
// If we have some partial chunk bytes in the internal chunk_state, we
|
||||
// need to finish that chunk first.
|
||||
@ -1321,6 +1305,179 @@ impl Hasher {
|
||||
pub fn count(&self) -> u64 {
|
||||
self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
|
||||
}
|
||||
|
||||
/// As [`update`](Hasher::update), but reading from a
|
||||
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
|
||||
///
|
||||
/// [`Hasher`] implements
|
||||
/// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
|
||||
/// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
|
||||
/// from any reader. Unfortunately, this standard approach can limit performance, because the
|
||||
/// [`copy`](https://doc.rust-lang.org/std/io/fn.copy.html) function currently uses an internal
|
||||
/// 8 KiB buffer that isn't big enough to take advantage of all SIMD instruction sets. (In
|
||||
/// particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) needs a
|
||||
/// 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly more
|
||||
/// convenient.
|
||||
///
|
||||
/// The internal buffer size this method uses may change at any time, and it may be different
|
||||
/// for different targets. The only guarantee is that it will be large enough for all of this
|
||||
/// crate's SIMD implementations on the current platform.
|
||||
///
|
||||
/// The most common implementer of
|
||||
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
|
||||
/// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
|
||||
/// mapping can be faster than this method for hashing large files. See
|
||||
/// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
|
||||
/// which require the `mmap` and (for the latter) `rayon` Cargo features.
|
||||
///
|
||||
/// This method requires the `std` Cargo feature, which is enabled by default.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use std::fs::File;
|
||||
/// # use std::io;
|
||||
/// # fn main() -> io::Result<()> {
|
||||
/// // Hash standard input.
|
||||
/// let mut hasher = blake3::Hasher::new();
|
||||
/// hasher.update_reader(std::io::stdin().lock())?;
|
||||
/// println!("{}", hasher.finalize());
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
#[cfg(feature = "std")]
|
||||
pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<u64> {
|
||||
io::copy_wide(reader, self)
|
||||
}
|
||||
|
||||
/// As [`update`](Hasher::update), but using Rayon-based multithreading
|
||||
/// internally.
|
||||
///
|
||||
/// This method is gated by the `rayon` Cargo feature, which is disabled by
|
||||
/// default but enabled on [docs.rs](https://docs.rs).
|
||||
///
|
||||
/// To get any performance benefit from multithreading, the input buffer
|
||||
/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
|
||||
/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
|
||||
/// quite a lot across different processors, and it's important to benchmark
|
||||
/// your specific use case. See also the performance warning associated with
|
||||
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
|
||||
///
|
||||
/// If you already have a large buffer in memory, and you want to hash it
|
||||
/// with multiple threads, this method is a good option. However, reading a
|
||||
/// file into memory just to call this method can be a performance mistake,
|
||||
/// both because it requires lots of memory and because single-threaded
|
||||
/// reads can be slow. For hashing whole files, see
|
||||
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both
|
||||
/// the `rayon` and `mmap` Cargo features.
|
||||
#[cfg(feature = "rayon")]
|
||||
pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
|
||||
self.update_with_join::<join::RayonJoin>(input)
|
||||
}
|
||||
|
||||
/// As [`update`](Hasher::update), but reading the contents of a file using memory mapping.
|
||||
///
|
||||
/// Not all files can be memory mapped, and memory mapping small files can be slower than
|
||||
/// reading them the usual way. In those cases, this method will fall back to standard file IO.
/// The heuristic for whether to use memory mapping is currently very simple (file size >=
/// 16 KiB), and it might change at any time.
///
/// Like [`update`](Hasher::update), this method is single-threaded. In this author's
/// experience, memory mapping improves single-threaded performance by ~10% for large files
/// that are already in cache. This probably varies between platforms, and as always it's a
/// good idea to benchmark your own use case. In comparison, the multithreaded
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on
/// performance.
///
/// There's a correctness reason that this method takes
/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of
/// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped
/// file ignores the seek position of the original file handle (it neither respects the current
/// position nor updates the position). This difference in behavior would've caused
/// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and
/// have different side effects in some cases. Taking a
/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by
/// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is
/// opened internally.
///
/// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on
/// [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// let path = Path::new("file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap(path)?;
/// println!("{}", hasher.finalize());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<()> {
    // Open the file internally (rather than accepting a `File`) so that memory
    // mapping can't interact with a caller-visible seek position.
    let file = std::fs::File::open(path.as_ref())?;
    match io::maybe_mmap_file(&file)? {
        // The file was mapped: hash the mapping in one shot.
        Some(mapped) => {
            self.update(&mapped);
        }
        // Mapping was declined (e.g. a small or unmappable file): fall back to
        // ordinary buffered reads.
        None => {
            io::copy_wide(&file, self)?;
        }
    }
    Ok(())
}
|
||||
|
||||
/// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using
/// memory mapping. This is the default behavior of `b3sum`.
///
/// For large files that are likely to be in cache, this can be much faster than
/// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other
/// cryptographic hashes, this is usually what they're measuring. However...
///
/// **Performance Warning:** There are cases where multithreading hurts performance. The worst
/// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31),
/// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends
/// more time seeking around than reading data). Windows tends to be somewhat worse about this,
/// in part because it's less likely than Linux to keep very large files in cache. More
/// generally, if your CPU cores are already busy, then multithreading will add overhead
/// without improving performance. If your code runs in different environments that you don't
/// control and can't measure, then unfortunately there's no one-size-fits-all answer for
/// whether multithreading is a good idea.
///
/// The memory mapping behavior of this function is the same as
/// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
/// file IO might change at any time.
///
/// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// # #[cfg(feature = "rayon")]
/// # {
/// let path = Path::new("big_file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap_rayon(path)?;
/// println!("{}", hasher.finalize());
/// # }
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
pub fn update_mmap_rayon(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<()> {
    // Open the file internally; taking a `Path` (not a `File`) sidesteps any
    // questions about the caller's seek position, as with update_mmap.
    let file = std::fs::File::open(path.as_ref())?;
    match io::maybe_mmap_file(&file)? {
        // Mapped successfully: hand the whole buffer to the multithreaded update.
        Some(mapped) => {
            self.update_rayon(&mapped);
        }
        // Mapping was declined: fall back to single-threaded buffered reads.
        None => {
            io::copy_wide(&file, self)?;
        }
    }
    Ok(())
}
|
||||
}
|
||||
|
||||
// Don't derive(Debug), because the state may be secret.
|
||||
@ -1355,29 +1512,6 @@ impl std::io::Write for Hasher {
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy all bytes from `reader` into `hasher`, returning the number of bytes read.
///
/// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
/// can support at least 64 KiB, and there's some performance benefit to using
/// bigger reads, so that's what we use here.
#[cfg(feature = "std")]
pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
    let mut buf = [0u8; 65536];
    let mut bytes_hashed: u64 = 0;
    loop {
        // Retry transparently on EINTR-style interruptions; propagate any other error.
        let n = match reader.read(&mut buf) {
            Ok(n) => n,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // A zero-length read signals end of input.
        if n == 0 {
            return Ok(bytes_hashed);
        }
        hasher.update(&buf[..n]);
        bytes_hashed += n as u64;
    }
}
|
||||
|
||||
/// An incremental reader for extended output, returned by
|
||||
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
|
||||
///
|
||||
|
121
src/test.rs
121
src/test.rs
@ -658,11 +658,13 @@ fn test_zeroize() {
|
||||
assert_eq!(hasher.chunk_state.buf_len, 0);
|
||||
assert_eq!(hasher.chunk_state.blocks_compressed, 0);
|
||||
assert_eq!(hasher.chunk_state.flags, 0);
|
||||
assert!(matches!(hasher.chunk_state.platform, crate::Platform::Portable));
|
||||
assert!(matches!(
|
||||
hasher.chunk_state.platform,
|
||||
crate::Platform::Portable
|
||||
));
|
||||
assert_eq!(hasher.key, [0; 8]);
|
||||
assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);
|
||||
|
||||
|
||||
let mut output_reader = crate::OutputReader {
|
||||
inner: crate::Output {
|
||||
input_chaining_value: [42; 8],
|
||||
@ -675,14 +677,123 @@ fn test_zeroize() {
|
||||
position_within_block: 42,
|
||||
};
|
||||
|
||||
|
||||
output_reader.zeroize();
|
||||
assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
|
||||
assert_eq!(output_reader.inner.block, [0; 64]);
|
||||
assert_eq!(output_reader.inner.counter, 0);
|
||||
assert_eq!(output_reader.inner.block_len, 0);
|
||||
assert_eq!(output_reader.inner.flags, 0);
|
||||
assert!(matches!(output_reader.inner.platform, crate::Platform::Portable));
|
||||
assert!(matches!(
|
||||
output_reader.inner.platform,
|
||||
crate::Platform::Portable
|
||||
));
|
||||
assert_eq!(output_reader.position_within_block, 0);
|
||||
|
||||
}
|
||||
|
||||
#[test]
#[cfg(feature = "std")]
fn test_update_reader() -> Result<(), std::io::Error> {
    // Keep this test brief: update_reader() is a thin wrapper around update(),
    // which already has substantial test coverage.
    use std::io::prelude::*;
    let mut expected_input = vec![0u8; 1_000_000];
    paint_test_input(&mut expected_input);
    let mut file = tempfile::NamedTempFile::new()?;
    file.write_all(&expected_input)?;
    file.flush()?;
    let mut reader_hasher = crate::Hasher::new();
    reader_hasher.update_reader(std::fs::File::open(file.path())?)?;
    assert_eq!(reader_hasher.finalize(), crate::hash(&expected_input));
    Ok(())
}
|
||||
|
||||
#[test]
#[cfg(feature = "std")]
fn test_update_reader_interrupted() -> std::io::Result<()> {
    use std::io;
    // A reader that fails with ErrorKind::Interrupted on its very first read
    // call and then yields its data normally. update_reader() is expected to
    // retry transparently rather than surface the error.
    struct FlakyReader<'a> {
        interrupted_once: bool,
        remaining: &'a [u8],
    }
    impl<'a> io::Read for FlakyReader<'a> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            if !self.interrupted_once {
                self.interrupted_once = true;
                return Err(io::Error::from(io::ErrorKind::Interrupted));
            }
            let n = std::cmp::min(self.remaining.len(), buf.len());
            let (head, tail) = self.remaining.split_at(n);
            buf[..n].copy_from_slice(head);
            self.remaining = tail;
            Ok(n)
        }
    }

    let input = b"hello world";
    let mut reader = FlakyReader {
        interrupted_once: false,
        remaining: input,
    };
    let mut hasher = crate::Hasher::new();
    hasher.update_reader(&mut reader)?;
    assert_eq!(hasher.finalize(), crate::hash(input));
    Ok(())
}
|
||||
|
||||
#[test]
#[cfg(feature = "mmap")]
fn test_mmap() -> Result<(), std::io::Error> {
    // Keep this test brief: update_mmap() is a thin wrapper around update(),
    // which already has substantial test coverage.
    use std::io::prelude::*;
    let mut expected_input = vec![0u8; 1_000_000];
    paint_test_input(&mut expected_input);
    let mut file = tempfile::NamedTempFile::new()?;
    file.write_all(&expected_input)?;
    file.flush()?;
    let mut mmap_hasher = crate::Hasher::new();
    mmap_hasher.update_mmap(file.path())?;
    assert_eq!(mmap_hasher.finalize(), crate::hash(&expected_input));
    Ok(())
}
|
||||
|
||||
#[test]
#[cfg(feature = "mmap")]
#[cfg(target_os = "linux")]
fn test_mmap_virtual_file() -> Result<(), std::io::Error> {
    // Virtual files like /proc/version can't be mmapped, because their contents don't actually
    // exist anywhere in memory. Make sure we fall back to regular file IO in these cases.
    // Currently this is handled with a length check, where the assumption is that virtual files
    // will always report length 0. If that assumption ever breaks, hopefully this test will catch
    // it.
    const VIRTUAL_FILEPATH: &str = "/proc/version";
    let mut mmap_hasher = crate::Hasher::new();
    // If the fallback doesn't work, this call is where we fail.
    mmap_hasher.update_mmap(VIRTUAL_FILEPATH)?;
    let mut read_hasher = crate::Hasher::new();
    read_hasher.update_reader(std::fs::File::open(VIRTUAL_FILEPATH)?)?;
    assert_eq!(mmap_hasher.finalize(), read_hasher.finalize());
    Ok(())
}
|
||||
|
||||
#[test]
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
fn test_mmap_rayon() -> Result<(), std::io::Error> {
    // Keep this test brief: update_mmap_rayon() is a thin wrapper around
    // update_rayon(), which already has substantial test coverage.
    use std::io::prelude::*;
    let mut expected_input = vec![0u8; 1_000_000];
    paint_test_input(&mut expected_input);
    let mut file = tempfile::NamedTempFile::new()?;
    file.write_all(&expected_input)?;
    file.flush()?;
    let mut rayon_hasher = crate::Hasher::new();
    rayon_hasher.update_mmap_rayon(file.path())?;
    assert_eq!(rayon_hasher.finalize(), crate::hash(&expected_input));
    Ok(())
}
|
Loading…
Reference in New Issue
Block a user