Commit 2104aaf2f4 to https://github.com/BLAKE3-team/BLAKE3
Author: Alex H, 2024-03-11 20:18:47 -07:00, committed by GitHub
4 changed files with 117 additions and 1 deletion

b3sum/Cargo.lock (generated)

@@ -78,6 +78,7 @@ dependencies = [
"duct",
"hex",
"rayon",
"read_chunks",
"tempfile",
"wild",
]
@@ -327,6 +328,12 @@ dependencies = [
"crossbeam-utils",
]

[[package]]
name = "read_chunks"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dda95aba172630718d3dffb3c2a3c316848fd9f2725898f56af81fdfcd3c8abf"

[[package]]
name = "redox_syscall"
version = "0.4.1"

b3sum/Cargo.toml

@@ -19,6 +19,7 @@ blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
rayon = "1.2.1"
read_chunks = "0.2.0"
wild = "2.0.3"

[dev-dependencies]

b3sum/src/main.rs

@@ -163,6 +163,62 @@ impl Args {
    }
}
/// Hashes a reader into a hasher in parallel using rayon. Optionally takes a
/// length, used as a heuristic to decide whether we are better off hashing
/// sequentially.
fn hash_reader_parallel(
    hasher: &mut blake3::Hasher,
    reader: &mut (impl Read + Send),
    len: Option<u64>,
) -> Result<()> {
    // We use read_chunks here because I (ultrabear) coded it, and know it is
    // as safe as the code here.
    // TODO: make this just a function in main.rs instead of an extra dep, but
    // that is only worth doing if we want to merge the PR; this is a proof of
    // concept.
    use read_chunks::ReadExt;

    // 2MiB of total buffer is not an extreme amount of memory, and performance
    // is a good bit better with that amount. Increasing it would probably buy
    // more performance up to a certain point, and there might be a "magic"
    // size to target (where blake3 multithreading can split evenly and reach
    // maximum performance without using too much memory).
    const BUF_SIZE: usize = 1024 * 1024;

    // If anything is under 1MiB, we don't want to put it through
    // multithreading. This probably overshoots the point where multithreading
    // becomes effective (512KiB also showed a performance increase), but it is
    // essential not to undershoot that point and risk making small files
    // slower than they were before this change.
    const MIN_SIZE: u64 = BUF_SIZE as u64;

    // fall back to update_reader if the length is known and too small
    if len.is_some_and(|s| s < MIN_SIZE) {
        hasher.update_reader(reader)?;
        return Ok(());
    }

    // allocate the double buffers and the locations their read results land in
    let mut hashing = vec![0; BUF_SIZE];
    let mut hashing_res = reader.keep_reading(&mut *hashing)?;

    let mut reading_to = vec![0; BUF_SIZE];
    let mut reading_res = None::<io::Result<usize>>;

    while hashing_res != 0 {
        // By scoping we guarantee that all tasks complete, and we get our
        // mutable references back to do error handling and buffer swapping.
        rayon::scope(|s| {
            s.spawn(|_| {
                reading_res = Some(reader.keep_reading(&mut *reading_to));
            });
            s.spawn(|_| {
                hasher.update_rayon(&hashing[..hashing_res]);
            });
        });

        hashing_res = reading_res.take().unwrap()?;
        (hashing, reading_to) = (reading_to, hashing);
    }

    Ok(())
}
fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
    let mut hasher = args.base_hasher.clone();
    if path == Path::new("-") {
@@ -171,7 +227,9 @@ fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
        }
        hasher.update_reader(io::stdin().lock())?;
    } else if args.no_mmap() {
-       hasher.update_reader(File::open(path)?)?;
+       let length = std::fs::metadata(path)?.len();
+       hash_reader_parallel(&mut hasher, &mut File::open(path)?, Some(length))?;
    } else {
        // The fast path: Try to mmap the file and hash it with multiple threads.
        hasher.update_mmap_rayon(path)?;
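
The new hash_reader_parallel leans on read_chunks::ReadExt::keep_reading to retry short reads, so each update_rayon call gets a full 1MiB chunk rather than whatever a single read() happens to return. If the TODO in that function is taken up and the extra dependency dropped, a minimal local stand-in could look like the sketch below (an assumption about the crate's behavior, not its actual implementation; call sites would become keep_reading(reader, &mut *hashing) instead of the trait-method form):

fn keep_reading(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Keep calling read() until the buffer is full or EOF is reached, so any
    // return value short of buf.len() unambiguously means end of input.
    let mut filled = 0;
    while filled < buf.len() {
        match reader.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => filled += n,
            // a read() interrupted by a signal is retried, not treated as an error
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(filled)
}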

b3sum/tests/cli_tests.rs

@@ -611,3 +611,53 @@ fn test_globbing() {
        .unwrap();
    assert_eq!(expected, output);
}
#[test]
// Tests that hash_reader_parallel falls back correctly and hashes
// multithreaded input correctly.
fn test_hash_reader_parallel() {
    let dir = tempfile::tempdir().unwrap();
    let file1 = dir.path().join("file1");
    fs::write(&file1, b"foobar").unwrap();

    let expected = blake3::hash(b"foobar");

    let output = cmd!(b3sum_exe(), "--no-mmap", &file1)
        .stdout_capture()
        .run()
        .unwrap()
        .stdout;

    let expected = format!("{}  {}\n", expected.to_hex(), file1.display());

    // fallback test
    assert_eq!(output, expected.as_bytes());

    // tests that multithreading gives correct results
    let file2 = dir.path().join("file2");
    let mut f = fs::File::create(&file2).unwrap();
    let mut expected = blake3::Hasher::new();

    // 20_000 * 62 bytes is about 1.2MiB, which passes the 1MiB threshold for
    // using multithreading
    for _ in 0..20_000 {
        // we use a big string here to avoid looping many times, which is bad
        // for opt-level=0
        const WRITE: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
        assert_eq!(WRITE.len(), 62);

        f.write_all(WRITE).unwrap();
        expected.update(WRITE);
    }

    let output = cmd!(b3sum_exe(), "--no-mmap", &file2)
        .stdout_capture()
        .run()
        .unwrap()
        .stdout;

    let expected = format!("{}  {}\n", expected.finalize().to_hex(), file2.display());

    assert_eq!(output, expected.as_bytes());
}
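
hash_reader_parallel is private to main.rs in this commit, so the test above exercises it through the b3sum binary with --no-mmap. If the helper were exposed for unit testing, a direct check against the one-shot hash could look like this sketch (the in-memory reader and sizes are illustrative, not part of the commit):

// 2MiB of data, comfortably above the 1MiB multithreading threshold
let data = vec![0xAB_u8; 2 * 1024 * 1024];
let mut reader = &data[..]; // &[u8] implements Read + Send
let mut hasher = blake3::Hasher::new();
hash_reader_parallel(&mut hasher, &mut reader, Some(data.len() as u64)).unwrap();
// the double-buffered parallel result must match the one-shot hash
assert_eq!(hasher.finalize(), blake3::hash(&data));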