From cb32f0bd1450991d86a8ceb3716fd591382cc507 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 10 Sep 2023 15:18:11 -0700 Subject: [PATCH] replace the new file module with inherent methods on Hasher New methods: - update_reader - update_mmap - update_mmap_rayon These are more discoverable, more convenient, and safer. There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_mmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions. --- .github/workflows/ci.yml | 4 + Cargo.toml | 21 +-- b3sum/Cargo.lock | 24 ++-- b3sum/Cargo.toml | 2 +- b3sum/src/main.rs | 110 +++++---------- src/file.rs | 67 ---------- src/io.rs | 79 +++++++++++ src/lib.rs | 282 +++++++++++++++++++++++++++++---------- src/test.rs | 121 ++++++++++++++++- 9 files changed, 466 insertions(+), 244 deletions(-) delete mode 100644 src/file.rs create mode 100644 src/io.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f179ada..16a5468 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,10 @@ jobs: run: cargo test --features=rayon,traits-preview,zeroize env: RAYON_NUM_THREADS: 1 + # The mmap feature by itself (update_mmap_rayon is omitted). + - run: cargo test --features=mmap + # All public features put together. + - run: cargo test --features=mmap,rayon,traits-preview,zeroize # no_std tests. - run: cargo test --no-default-features diff --git a/Cargo.toml b/Cargo.toml index 74aed30..d92fc6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,11 +25,17 @@ neon = [] # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". std = [] -# The "rayon" feature (defined below as an optional dependency) enables the -# `Hasher::update_rayon` method, for multithreaded hashing. However, even if -# this feature is enabled, all other APIs remain single-threaded. +# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the +# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` +# methods, for multithreaded hashing. However, even if this feature is enabled, +# all other APIs remain single-threaded. rayon = ["dep:rayon", "std"] +# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the +# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` +# helper methods for memory-mapped IO. +mmap = ["std", "dep:memmap2"] + # Implement the zeroize::Zeroize trait for types in this crate. zeroize = ["dep:zeroize", "arrayvec/zeroize"] @@ -81,11 +87,9 @@ no_avx2 = [] no_avx512 = [] no_neon = [] -file = ["memmap2", "rayon", "std"] - [package.metadata.docs.rs] -# Document Hasher::update_rayon on docs.rs. -features = ["rayon", "zeroize"] +# Document the rayon/mmap methods and the Zeroize impls on docs.rs. +features = ["mmap", "rayon", "zeroize"] [dependencies] arrayref = "0.3.5" @@ -98,12 +102,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive memmap2 = { version = "0.7.1", optional = true } [dev-dependencies] +hmac = "0.12.0" hex = "0.4.2" page_size = "0.6.0" rand = "0.8.0" rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } -hmac = "0.12.0" +tempfile = "3.8.0" [build-dependencies] cc = "1.0.4" diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock index 3c7c737..d76d175 100644 --- a/b3sum/Cargo.lock +++ b/b3sum/Cargo.lock @@ -18,9 +18,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" [[package]] name = "anstyle-parse" @@ -131,9 +131,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.2" +version = "4.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6" +checksum = "84ed82781cea27b43c9b106a979fe450a13a31aab0500595fb3fc06616de08e6" dependencies = [ "clap_builder", "clap_derive", @@ -307,9 +307,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.147" +version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" [[package]] name = "linux-raw-sys" @@ -369,9 +369,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] @@ -467,9 +467,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "2.0.32" +version = "2.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "91e02e55d62894af2a08aca894c6577281f76769ba47c94d5756bec8ac6e7373" dependencies = [ "proc-macro2", "quote", @@ -501,9 +501,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index 19b617e..dcd9f96 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -15,7 +15,7 @@ pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" -blake3 = { version = "1", path = "..", features = ["file", "rayon"] } +blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" memmap2 = "0.7.0" diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs index 165c579..228737f 100644 --- a/b3sum/src/main.rs +++ b/b3sum/src/main.rs @@ -163,73 +163,22 @@ impl Args { } } -enum Input { - Mmap(io::Cursor), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); - } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_rayon(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. - Self::File(file) => { - blake3::copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - blake3::copy_wide(lock, &mut hasher)?; - } - } - let mut output_reader = hasher.finalize_xof(); - output_reader.set_position(args.seek()); - Ok(output_reader) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), +fn hash_path(args: &Args, path: &Path) -> Result { + let mut hasher = args.base_hasher.clone(); + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); } + hasher.update_reader(io::stdin().lock())?; + } else if args.no_mmap() { + hasher.update_reader(File::open(path)?)?; + } else { + // The fast path: Try to mmap the file and hash it with multiple threads. + hasher.update_mmap_rayon(path)?; } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) } fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { @@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result { } fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; + let output = hash_path(args, path)?; if args.raw() { write_raw_output(output, args)?; return Ok(()); @@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool { } else { file_string }; - let hash_result: Result = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { + let found_hash: blake3::Hash; + match hash_path(args, &file_path) { + Ok(mut output) => { let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, + output.fill(&mut found_hash_bytes); + found_hash = found_hash_bytes.into(); + } Err(e) => { println!("{}: FAILED ({})", file_string, e); return false; @@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool { } fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { - let checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); + let mut file; + let stdin; + let mut stdin_lock; + let mut bufreader: io::BufReader<&mut dyn Read>; + if path == Path::new("-") { + stdin = io::stdin(); + stdin_lock = stdin.lock(); + bufreader = io::BufReader::new(&mut stdin_lock); + } else { + file = File::open(path)?; + bufreader = io::BufReader::new(&mut file); + } let mut line = String::new(); loop { line.clear(); diff --git a/src/file.rs b/src/file.rs deleted file mode 100644 index 81ccbbe..0000000 --- a/src/file.rs +++ /dev/null @@ -1,67 +0,0 @@ -//! The file-related utilities. -//! -//! # Examples -//! -//! ```no_run -//! use std::io; -//! -//! use blake3::file::hash_path_maybe_mmap; -//! -//! fn main() -> io::Result<()> { -//! let args: Vec<_> = std::env::args_os().collect(); -//! assert_eq!(args.len(), 2); -//! let path = &args[1]; -//! let mut hasher = blake3::Hasher::new(); -//! hash_path_maybe_mmap(&mut hasher, path)?; -//! println!("{}", hasher.finalize()); -//! Ok(()) -//! } -//! ``` - -use std::{fs::File, io, path::Path}; - -/// Mmap a file, if it looks like a good idea. Return None in cases where we -/// know mmap will fail, or if the file is short enough that mmapping isn't -/// worth it. However, if we do try to mmap and it fails, return the error. -pub fn maybe_memmap_file(file: &File) -> io::Result> { - let metadata = file.metadata()?; - let file_size = metadata.len(); - #[allow(clippy::if_same_then_else)] - if !metadata.is_file() { - // Not a real file. - Ok(None) - } else if file_size > isize::max_value() as u64 { - // Too long to safely map. - // https://github.com/danburkert/memmap-rs/issues/69 - Ok(None) - } else if file_size == 0 { - // Mapping an empty file currently fails. - // https://github.com/danburkert/memmap-rs/issues/72 - Ok(None) - } else if file_size < 16 * 1024 { - // Mapping small files is not worth it. - Ok(None) - } else { - // Explicitly set the length of the memory map, so that filesystem - // changes can't race to violate the invariants we just checked. - let map = unsafe { - memmap2::MmapOptions::new() - .len(file_size as usize) - .map(file)? - }; - Ok(Some(map)) - } -} - -/// Hash a file fast. -/// -/// It may use mmap if the file is big enough. If not, it will read the whole file into a buffer. -pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef) -> io::Result<()> { - let file = File::open(path.as_ref())?; - if let Some(mmap) = maybe_memmap_file(&file)? { - hasher.update_rayon(&mmap); - } else { - crate::copy_wide(&file, hasher)?; - } - Ok(()) -} diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 0000000..1c19881 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,79 @@ +//! Helper functions for efficient IO. + +#[cfg(feature = "std")] +pub(crate) fn copy_wide( + mut reader: impl std::io::Read, + hasher: &mut crate::Hasher, +) -> std::io::Result { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + // see test_update_reader_interrupted + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or +// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it +// fails, return the error. +// +// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like +// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get +// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because +// this function is crate-private, we can guarantee that all can ever happen in the event of a race +// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of +// which should risk memory corruption in a safe caller. +// +// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no +// platform in the "real world" is ever going to do anything other than compute the "wrong answer" +// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? +// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a +// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have +// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do +// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to +// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert +// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give +// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's +// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to +// construct a safe &i32 to the register if you're going to leak that reference to unknown callers. +// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally +// different here. Feedback needed. +#[cfg(feature = "mmap")] +pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + #[allow(clippy::if_same_then_else)] + if !metadata.is_file() { + // Not a real file. + Ok(None) + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. + // https://github.com/danburkert/memmap-rs/issues/69 + Ok(None) + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + // See test_mmap_virtual_file. + Ok(None) + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. + Ok(None) + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap2::MmapOptions::new() + .len(file_size as usize) + .map(file)? + }; + Ok(Some(map)) + } +} diff --git a/src/lib.rs b/src/lib.rs index b262380..1a2d22d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,18 +33,28 @@ //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for -//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime -//! CPU feature detection on x86. If this feature is disabled, the only way to -//! use the x86 SIMD implementations is to enable the corresponding instruction -//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting -//! binary will not be portable to other machines. +//! implementations of the [`Write`] and [`Seek`] traits, the +//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU +//! feature detection on x86. If this feature is disabled, the only way to use +//! the x86 SIMD implementations is to enable the corresponding instruction sets +//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary +//! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds -//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However, -//! even if this feature is enabled, all other APIs remain single-threaded. +//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` +//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for +//! multithreaded hashing. However, even if this feature is enabled, all other +//! APIs remain single-threaded. //! -//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) implements -//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for this crate's types. +//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the +//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) +//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for +//! memory-mapped IO. +//! +//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) +//! implements +//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for +//! this crate's types. //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and @@ -52,12 +62,12 @@ //! without NEON support. //! //! The `traits-preview` feature enables implementations of traits from the -//! RustCrypto [`digest`] crate, and re-exports that crate as -//! `traits::digest`. However, the traits aren't stable, and they're expected to -//! change in incompatible ways before that crate reaches 1.0. For that reason, -//! this crate makes no SemVer guarantees for this feature, and callers who use -//! it should expect breaking changes between patch versions. (The "-preview" -//! feature name follows the conventions of the RustCrypto [`signature`] crate.) +//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. +//! However, the traits aren't stable, and they're expected to change in +//! incompatible ways before that crate reaches 1.0. For that reason, this crate +//! makes no SemVer guarantees for this feature, and callers who use it should +//! expect breaking changes between patch versions. (The "-preview" feature name +//! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! [BLAKE3]: https://blake3.io @@ -115,9 +125,7 @@ mod sse41; #[cfg(feature = "traits-preview")] pub mod traits; -#[cfg(feature = "file")] -pub mod file; - +mod io; mod join; use arrayref::{array_mut_ref, array_ref}; @@ -914,6 +922,15 @@ fn parent_node_output( /// An incremental hash state that can accept any number of writes. /// +/// **Performance note:** The [`update`](#method.update) method can't take full +/// advantage of SIMD optimizations if its input buffer is too small or oddly +/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently +/// supported SIMD instruction sets. See also +/// [`update_reader`](Hasher::update_reader). +/// +/// The `rayon` and `mmap` Cargo features enable additional methods on this +/// type related to multithreading and memory-mapped IO. +/// /// When the `traits-preview` Cargo feature is enabled, this type implements /// several commonly used traits from the /// [`digest`](https://crates.io/crates/digest) crate. However, those @@ -922,15 +939,6 @@ fn parent_node_output( /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// -/// When the `rayon` Cargo feature is enabled, the -/// [`update_rayon`](#method.update_rayon) method is available for multithreaded -/// hashing. -/// -/// **Performance note:** The [`update`](#method.update) method can't take full -/// advantage of SIMD optimizations if its input buffer is too small or oddly -/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently -/// supported SIMD instruction sets. -/// /// # Examples /// /// ``` @@ -1099,30 +1107,6 @@ impl Hasher { self.update_with_join::(input) } - /// Identical to [`update`](Hasher::update), but using Rayon-based - /// multithreading internally. - /// - /// This method is gated by the `rayon` Cargo feature, which is disabled by - /// default but enabled on [docs.rs](https://docs.rs). - /// - /// To get any performance benefit from multithreading, the input buffer - /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is - /// _slower_ than `update` for inputs under 128 KiB. That threshold varies - /// quite a lot across different processors, and it's important to benchmark - /// your specific use case. - /// - /// Memory mapping an entire input file is a simple way to take advantage of - /// multithreading without needing to carefully tune your buffer size or - /// offload IO. However, on spinning disks where random access is expensive, - /// that approach can lead to disk thrashing and terrible IO performance. - /// Note that OS page caching can mask this problem, in which case it might - /// only appear for files larger than available RAM. Again, benchmarking - /// your specific use case is important. - #[cfg(feature = "rayon")] - pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::(input) - } - fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. @@ -1321,6 +1305,179 @@ impl Hasher { pub fn count(&self) -> u64 { self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 } + + /// As [`update`](Hasher::update), but reading from a + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation. + /// + /// [`Hasher`] implements + /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to + /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`] + /// from any reader. Unfortunately, this standard approach can limit performance, because the + /// [`copy`](https://doc.rust-lang.org/std/io/fn.copy.html) function currently uses an internal + /// 8 KiB buffer that isn't big enough to take advantage of all SIMD instruction sets. (In + /// particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) needs a + /// 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly more + /// convenient. + /// + /// The internal buffer size this method uses may change at any time, and it may be different + /// for different targets. The only guarantee is that it will be large enough for all of this + /// crate's SIMD implementations on the current platform. + /// + /// The most common implementer of + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be + /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory + /// mapping can be faster than this method for hashing large files. See + /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon), + /// which require the `mmap` and (for the latter) `rayon` Cargo features. + /// + /// This method requires the `std` Cargo feature, which is enabled by default. + /// + /// # Example + /// + /// ```no_run + /// # use std::fs::File; + /// # use std::io; + /// # fn main() -> io::Result<()> { + /// // Hash standard input. + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_reader(std::io::stdin().lock())?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "std")] + pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result { + io::copy_wide(reader, self) + } + + /// As [`update`](Hasher::update), but using Rayon-based multithreading + /// internally. + /// + /// This method is gated by the `rayon` Cargo feature, which is disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// To get any performance benefit from multithreading, the input buffer + /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is + /// _slower_ than `update` for inputs under 128 KiB. That threshold varies + /// quite a lot across different processors, and it's important to benchmark + /// your specific use case. See also the performance warning associated with + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon). + /// + /// If you already have a large buffer in memory, and you want to hash it + /// with multiple threads, this method is a good option. However, reading a + /// file into memory just to call this method can be a performance mistake, + /// both because it requires lots of memory and because single-threaded + /// reads can be slow. For hashing whole files, see + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both + /// the `rayon` and `mmap` Cargo features. + #[cfg(feature = "rayon")] + pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::(input) + } + + /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. + /// + /// Not all files can be memory mapped, and memory mapping small files can be slower than + /// reading them the usual way. In those cases, this method will fall back to standard file IO. + /// The heuristic for whether to use memory mapping is currently very simple (file size >= + /// 16 KiB), and it might change at any time. + /// + /// Like [`update`](Hasher::update), this method is single-threaded. In this author's + /// experience, memory mapping improves single-threaded performance by ~10% for large files + /// that are already in cache. This probably varies between platforms, and as always it's a + /// good idea to benchmark your own use case. In comparison, the multithreaded + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on + /// performance. + /// + /// There's a correctness reason that this method takes + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of + /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped + /// file ignores the seek position of the original file handle (it neither respects the current + /// position nor updates the position). This difference in behavior would've caused + /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and + /// have different side effects in some cases. Taking a + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by + /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is + /// opened internally. + /// + /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on + /// [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// let path = Path::new("file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap(path)?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<()> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(()) + } + + /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using + /// memory mapping. This is the default behavior of `b3sum`. + /// + /// For large files that are likely to be in cache, this can be much faster than + /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other + /// cryptographic hashes, this is usually what they're measuring. However... + /// + /// **Performance Warning:** There are cases where multithreading hurts performance. The worst + /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), + /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends + /// more time seeking around than reading data). Windows tends to be somewhat worse about this, + /// in part because it's less likely than Linux to keep very large files in cache. More + /// generally, if your CPU cores are already busy, then multithreading will add overhead + /// without improving performance. If your code runs in different environments that you don't + /// control and can't measure, then unfortunately there's no one-size-fits-all answer for + /// whether multithreading is a good idea. + /// + /// The memory mapping behavior of this function is the same as + /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard + /// file IO might change at any time. + /// + /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// # #[cfg(feature = "rayon")] + /// # { + /// let path = Path::new("big_file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap_rayon(path)?; + /// println!("{}", hasher.finalize()); + /// # } + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + #[cfg(feature = "rayon")] + pub fn update_mmap_rayon(&mut self, path: impl AsRef) -> std::io::Result<()> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update_rayon(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(()) + } } // Don't derive(Debug), because the state may be secret. @@ -1355,29 +1512,6 @@ impl std::io::Write for Hasher { } } -/// Copy from `reader` to `hasher`, returning the number of bytes read. -/// -/// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets -/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms -/// can support at least 64 KiB, and there's some performance benefit to using -/// bigger reads, so that's what we use here. -#[cfg(feature = "std")] -pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result { - let mut buffer = [0; 65536]; - let mut total = 0; - loop { - match reader.read(&mut buffer) { - Ok(0) => return Ok(total), - Ok(n) => { - hasher.update(&buffer[..n]); - total += n as u64; - } - Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - } - } -} - /// An incremental reader for extended output, returned by /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). /// diff --git a/src/test.rs b/src/test.rs index 0d94f44..a319930 100644 --- a/src/test.rs +++ b/src/test.rs @@ -658,11 +658,13 @@ fn test_zeroize() { assert_eq!(hasher.chunk_state.buf_len, 0); assert_eq!(hasher.chunk_state.blocks_compressed, 0); assert_eq!(hasher.chunk_state.flags, 0); - assert!(matches!(hasher.chunk_state.platform, crate::Platform::Portable)); + assert!(matches!( + hasher.chunk_state.platform, + crate::Platform::Portable + )); assert_eq!(hasher.key, [0; 8]); assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); - let mut output_reader = crate::OutputReader { inner: crate::Output { input_chaining_value: [42; 8], @@ -675,14 +677,123 @@ fn test_zeroize() { position_within_block: 42, }; - output_reader.zeroize(); assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); assert_eq!(output_reader.inner.block, [0; 64]); assert_eq!(output_reader.inner.counter, 0); assert_eq!(output_reader.inner.block_len, 0); assert_eq!(output_reader.inner.flags, 0); - assert!(matches!(output_reader.inner.platform, crate::Platform::Portable)); + assert!(matches!( + output_reader.inner.platform, + crate::Platform::Portable + )); assert_eq!(output_reader.position_within_block, 0); +} -} \ No newline at end of file +#[test] +#[cfg(feature = "std")] +fn test_update_reader() -> Result<(), std::io::Error> { + // This is a brief test, since update_reader() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + let mut hasher = crate::Hasher::new(); + hasher.update_reader(std::fs::File::open(tempfile.path())?)?; + assert_eq!(hasher.finalize(), crate::hash(&input)); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader_interrupted() -> std::io::Result<()> { + use std::io; + struct InterruptingReader<'a> { + already_interrupted: bool, + slice: &'a [u8], + } + impl<'a> InterruptingReader<'a> { + fn new(slice: &'a [u8]) -> Self { + Self { + already_interrupted: false, + slice, + } + } + } + impl<'a> io::Read for InterruptingReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if !self.already_interrupted { + self.already_interrupted = true; + return Err(io::Error::from(io::ErrorKind::Interrupted)); + } + let take = std::cmp::min(self.slice.len(), buf.len()); + buf[..take].copy_from_slice(&self.slice[..take]); + self.slice = &self.slice[take..]; + Ok(take) + } + } + + let input = b"hello world"; + let mut reader = InterruptingReader::new(input); + let mut hasher = crate::Hasher::new(); + hasher.update_reader(&mut reader)?; + assert_eq!(hasher.finalize(), crate::hash(input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +fn test_mmap() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + let mut hasher = crate::Hasher::new(); + hasher.update_mmap(tempfile.path())?; + assert_eq!(hasher.finalize(), crate::hash(&input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(target_os = "linux")] +fn test_mmap_virtual_file() -> Result<(), std::io::Error> { + // Virtual files like /proc/version can't be mmapped, because their contents don't actually + // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. + // Currently this is handled with a length check, where the assumption is that virtual files + // will always report length 0. If that assumption ever breaks, hopefully this test will catch + // it. + let virtual_filepath = "/proc/version"; + let mut mmap_hasher = crate::Hasher::new(); + // We'll fail right here if the fallback doesn't work. + mmap_hasher.update_mmap(virtual_filepath)?; + let mut read_hasher = crate::Hasher::new(); + read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; + assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(feature = "rayon")] +fn test_mmap_rayon() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), + // which already has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + let mut hasher = crate::Hasher::new(); + hasher.update_mmap_rayon(tempfile.path())?; + assert_eq!(hasher.finalize(), crate::hash(&input)); + Ok(()) +}