From 1322974a8560d13cca5ee453f98c311bc469bf0c Mon Sep 17 00:00:00 2001
From: ultrabear
Date: Sun, 5 Nov 2023 21:21:47 -0800
Subject: [PATCH 1/4] Improve performance of `--no-mmap` mode by enabling
 multithread.

This uses a double buffer of 1MiB each, reading to one buffer while hashing
the other in parallel.

This is around 2x as fast as hashing singlethreadedly on my machine
(ryzen 2600) with an in memory benchmark. This is still 2x slower than using
memmap.
---
 b3sum/Cargo.lock  | 7 +++++++
 b3sum/Cargo.toml  | 9 +++++++++
 b3sum/src/main.rs | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index 763a2e1..4e89e8e 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -85,6 +85,7 @@ dependencies = [
  "hex",
  "memmap2",
  "rayon",
+ "read_chunks",
  "tempfile",
  "wild",
 ]
@@ -362,6 +363,12 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "read_chunks"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dda95aba172630718d3dffb3c2a3c316848fd9f2725898f56af81fdfcd3c8abf"
+
 [[package]]
 name = "redox_syscall"
 version = "0.3.5"
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index e3bf820..ac29755 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -20,8 +20,17 @@ clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
 hex = "0.4.0"
 memmap2 = "0.7.0"
 rayon = "1.2.1"
+read_chunks = "0.2.0"
 wild = "2.0.3"
 
 [dev-dependencies]
 duct = "0.13.3"
 tempfile = "3.1.0"
+
+
+[profile.dev]
+opt-level = 1
+
+[profile.release]
+overflow-checks = true
+lto = "thin"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index 228737f..491d688 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -163,6 +163,44 @@ impl Args {
     }
 }
 
+fn hash_reader_parallel(
+    hasher: &mut blake3::Hasher,
+    reader: &mut (impl Read + Send),
+    len: Option<u64>,
+) -> Result<()> {
+    use read_chunks::ReadExt;
+    const BUF_SIZE: usize = 1024 * 1024;
+    const MIN_SIZE: u64 = BUF_SIZE as u64;
+
+    if len.is_some_and(|s| s < MIN_SIZE) {
+        hasher.update_reader(reader)?;
+        return Ok(());
+    }
+
+    let mut hashing = vec![0; BUF_SIZE];
+    let mut hashing_res = reader.keep_reading(&mut *hashing)?;
+
+    let mut reading_to = vec![0; BUF_SIZE];
+    let mut reading_res = None::<io::Result<usize>>;
+
+    while hashing_res != 0 {
+        rayon::scope(|s| {
+            s.spawn(|_| {
+                reading_res = Some(reader.keep_reading(&mut *reading_to));
+            });
+
+            s.spawn(|_| {
+                hasher.update_rayon(&hashing[..hashing_res]);
+            });
+        });
+
+        hashing_res = reading_res.take().unwrap()?;
+        (hashing, reading_to) = (reading_to, hashing);
+    }
+
+    Ok(())
+}
+
 fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
     let mut hasher = args.base_hasher.clone();
     if path == Path::new("-") {
@@ -171,7 +209,9 @@ fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
         }
         hasher.update_reader(io::stdin().lock())?;
     } else if args.no_mmap() {
-        hasher.update_reader(File::open(path)?)?;
+        let length = std::fs::metadata(path)?.len();
+
+        hash_reader_parallel(&mut hasher, &mut File::open(path)?, Some(length))?;
     } else {
         // The fast path: Try to mmap the file and hash it with multiple threads.
         hasher.update_mmap_rayon(path)?;

From e297112bc9177edcd530b3d11b1a1b83c04f6388 Mon Sep 17 00:00:00 2001
From: ultrabear
Date: Sun, 5 Nov 2023 21:36:48 -0800
Subject: [PATCH 2/4] add comments explaining thought processes

---
 b3sum/src/main.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index 491d688..bb80969 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -163,20 +163,36 @@ impl Args {
     }
 }
 
+/// Hashes a reader in parallel into a hasher using rayon, optionally takes a length to heuristically decide if
+/// we are better off sequentially hashing
 fn hash_reader_parallel(
     hasher: &mut blake3::Hasher,
     reader: &mut (impl Read + Send),
     len: Option<u64>,
 ) -> Result<()> {
+    // we use read_chunks here because I(ultrabear) coded it, and know it is as safe as the code here.
+    // TODO make this just a function in main.rs instead of an extra dep,
+    // but only worth doing if we want to merge the PR, this is a proof of concept
     use read_chunks::ReadExt;
+
+    // 2MiB of total buffer is not an extreme amount of memory, and performance is a good bit
+    // better with that amount, increasing this will probably equal more performance up to a
+    // certain point, and there might be a "magic" size to target (where blake3 multithreading can
+    // split evenly and have maximum possible performance without using too much memory)
     const BUF_SIZE: usize = 1024 * 1024;
+    // if anything is under 1MiB we don't want to put it through multithreading, this is probably an
+    // overshoot of where multithreading is effective (512KiB was found to also have a performance
+    // increase), but it is essential that we do not undershoot where it is effective, and risk
+    // having worse performance than before this codechange on small files.
     const MIN_SIZE: u64 = BUF_SIZE as u64;
 
+    // fallback to update_reader if the length is too small
     if len.is_some_and(|s| s < MIN_SIZE) {
         hasher.update_reader(reader)?;
         return Ok(());
     }
 
+    // allocate the double buffers and their return memory locations
     let mut hashing = vec![0; BUF_SIZE];
     let mut hashing_res = reader.keep_reading(&mut *hashing)?;
 
@@ -184,6 +200,8 @@ fn hash_reader_parallel(
     let mut reading_res = None::<io::Result<usize>>;
 
     while hashing_res != 0 {
+        // by scoping we guarantee that all tasks complete, and can get our mutable references back
+        // to do error handling and buffer swapping
         rayon::scope(|s| {
             s.spawn(|_| {
                 reading_res = Some(reader.keep_reading(&mut *reading_to));
             });

From 0b3f8432e5a2aa48eaf01993d6ae0fd98608d78b Mon Sep 17 00:00:00 2001
From: ultrabear
Date: Sun, 5 Nov 2023 21:38:13 -0800
Subject: [PATCH 3/4] remove accidentally left in profile fiddling

---
 b3sum/Cargo.toml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index ac29755..ff19412 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -26,11 +26,3 @@ wild = "2.0.3"
 [dev-dependencies]
 duct = "0.13.3"
 tempfile = "3.1.0"
-
-
-[profile.dev]
-opt-level = 1
-
-[profile.release]
-overflow-checks = true
-lto = "thin"

From 03e0949d13cebe3c04e1c908d25cf1e22bc71623 Mon Sep 17 00:00:00 2001
From: ultrabear
Date: Sun, 5 Nov 2023 22:03:19 -0800
Subject: [PATCH 4/4] add testcase that hash_reader_parallel gives correct
 results

---
 b3sum/tests/cli_tests.rs | 50 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/b3sum/tests/cli_tests.rs b/b3sum/tests/cli_tests.rs
index d5d4efa..e1f21cf 100644
--- a/b3sum/tests/cli_tests.rs
+++ b/b3sum/tests/cli_tests.rs
@@ -611,3 +611,53 @@ fn test_globbing() {
         .unwrap();
     assert_eq!(expected, output);
 }
+
+#[test]
+// tests that hash_reader_parallel falls back correctly and hashes multithreaded correctly
+fn test_hash_reader_parallel() {
+    let dir = tempfile::tempdir().unwrap();
+
+    let file1 = dir.path().join("file1");
+    fs::write(&file1, b"foobar").unwrap();
+
+    let expected = blake3::hash(b"foobar");
+
+    let output = cmd!(b3sum_exe(), "--no-mmap", &file1)
+        .stdout_capture()
+        .run()
+        .unwrap()
+        .stdout;
+
+    let expected = format!("{} {}\n", expected.to_hex(), file1.display());
+
+    // fallback test
+    assert_eq!(output, expected.as_bytes());
+
+    // tests multithread gives correct results
+
+    let file2 = dir.path().join("file2");
+    let mut f = fs::File::create(&file2).unwrap();
+
+    let mut expected = blake3::Hasher::new();
+
+    // 20_000 * 62 is 1.2MiB, which passes the threshold of using multithreading
+
+    for _ in 0..20_000 {
+        // we use a big string here to avoid looping many times, which is bad for opt-level=0
+        const WRITE: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+        assert_eq!(WRITE.len(), 62);
+
+        f.write_all(WRITE).unwrap();
+        expected.update(WRITE);
+    }
+
+    let output = cmd!(b3sum_exe(), "--no-mmap", &file2)
+        .stdout_capture()
+        .run()
+        .unwrap()
+        .stdout;
+
+    let expected = format!("{} {}\n", expected.finalize().to_hex(), file2.display());
+
+    assert_eq!(output, expected.as_bytes());
+}