2019-12-12 08:13:16 +01:00
|
|
|
use anyhow::{bail, Context, Result};
|
|
|
|
use clap::{App, Arg};
|
|
|
|
use std::cmp;
|
2019-12-13 21:56:21 +01:00
|
|
|
use std::convert::TryInto;
|
2020-05-04 03:13:58 +02:00
|
|
|
use std::ffi::OsStr;
|
2019-12-12 08:13:16 +01:00
|
|
|
use std::fs::File;
|
2020-02-03 17:35:50 +01:00
|
|
|
use std::io;
|
2019-12-12 08:13:16 +01:00
|
|
|
use std::io::prelude::*;
|
|
|
|
|
2019-12-13 21:56:21 +01:00
|
|
|
// Names of the command-line arguments. Each constant is used both when the
// argument is declared in `clap_parse_argv` and when it is looked up in `main`.

// Positional argument: the input paths (may be given multiple times).
const FILE_ARG: &str = "file";

// Options that take a value.
const DERIVE_KEY_ARG: &str = "derive-key";
const LENGTH_ARG: &str = "length";
const NUM_THREADS_ARG: &str = "num-threads";

// Boolean flags.
const KEYED_ARG: &str = "keyed";
const NO_MMAP_ARG: &str = "no-mmap";
const NO_NAMES_ARG: &str = "no-names";
const RAW_ARG: &str = "raw";
|
2019-12-13 21:56:21 +01:00
|
|
|
|
2019-12-12 08:13:16 +01:00
|
|
|
fn clap_parse_argv() -> clap::ArgMatches<'static> {
|
|
|
|
App::new("b3sum")
|
|
|
|
.version(env!("CARGO_PKG_VERSION"))
|
2019-12-13 21:56:21 +01:00
|
|
|
.arg(Arg::with_name(FILE_ARG).multiple(true))
|
2019-12-12 08:13:16 +01:00
|
|
|
.arg(
|
2019-12-13 21:56:21 +01:00
|
|
|
Arg::with_name(LENGTH_ARG)
|
|
|
|
.long(LENGTH_ARG)
|
2019-12-12 08:13:16 +01:00
|
|
|
.short("l")
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("LEN")
|
2020-03-15 20:47:58 +01:00
|
|
|
.help(
|
|
|
|
"The number of output bytes, prior to hex\n\
|
|
|
|
encoding (default 32)",
|
|
|
|
),
|
2019-12-12 08:13:16 +01:00
|
|
|
)
|
2020-03-16 17:06:16 +01:00
|
|
|
.arg(
|
|
|
|
Arg::with_name(NUM_THREADS_ARG)
|
|
|
|
.long(NUM_THREADS_ARG)
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("NUM")
|
|
|
|
.help(
|
|
|
|
"The maximum number of threads to use. By\n\
|
|
|
|
default, this is the number of logical cores.\n\
|
|
|
|
If this flag is omitted, or if its value is 0,\n\
|
|
|
|
RAYON_NUM_THREADS is also respected.",
|
|
|
|
),
|
|
|
|
)
|
2019-12-12 08:13:16 +01:00
|
|
|
.arg(
|
2019-12-13 21:56:21 +01:00
|
|
|
Arg::with_name(KEYED_ARG)
|
|
|
|
.long(KEYED_ARG)
|
|
|
|
.requires(FILE_ARG)
|
2020-03-15 20:47:58 +01:00
|
|
|
.help(
|
|
|
|
"Uses the keyed mode. The secret key is read from standard\n\
|
|
|
|
input, and it must be exactly 32 raw bytes.",
|
|
|
|
),
|
2019-12-12 08:13:16 +01:00
|
|
|
)
|
|
|
|
.arg(
|
2019-12-13 21:56:21 +01:00
|
|
|
Arg::with_name(DERIVE_KEY_ARG)
|
|
|
|
.long(DERIVE_KEY_ARG)
|
|
|
|
.conflicts_with(KEYED_ARG)
|
2019-12-27 23:20:38 +01:00
|
|
|
.takes_value(true)
|
|
|
|
.value_name("CONTEXT")
|
2020-03-15 20:47:58 +01:00
|
|
|
.help(
|
|
|
|
"Uses the key derivation mode, with the given\n\
|
|
|
|
context string. Cannot be used with --keyed.",
|
|
|
|
),
|
2019-12-12 08:13:16 +01:00
|
|
|
)
|
2020-01-20 15:09:37 +01:00
|
|
|
.arg(
|
|
|
|
Arg::with_name(NO_MMAP_ARG)
|
|
|
|
.long(NO_MMAP_ARG)
|
2020-01-20 18:02:50 +01:00
|
|
|
.help("Disables memory mapping"),
|
2020-01-20 15:09:37 +01:00
|
|
|
)
|
2019-12-13 19:08:18 +01:00
|
|
|
.arg(
|
2019-12-13 21:56:21 +01:00
|
|
|
Arg::with_name(NO_NAMES_ARG)
|
|
|
|
.long(NO_NAMES_ARG)
|
2019-12-13 22:13:40 +01:00
|
|
|
.help("Omits filenames in the output"),
|
2019-12-13 19:08:18 +01:00
|
|
|
)
|
2020-03-15 20:47:58 +01:00
|
|
|
.arg(Arg::with_name(RAW_ARG).long(RAW_ARG).help(
|
|
|
|
"Writes raw output bytes to stdout, rather than hex.\n\
|
|
|
|
--no-names is implied. In this case, only a single\n\
|
|
|
|
input is allowed.",
|
|
|
|
))
|
2019-12-12 08:13:16 +01:00
|
|
|
.get_matches()
|
|
|
|
}
|
|
|
|
|
2020-02-03 17:35:50 +01:00
|
|
|
// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
|
|
|
|
// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
|
|
|
|
// can support at least 64 KiB, and there's some performance benefit to using
|
|
|
|
// bigger reads, so that's what we use here.
|
|
|
|
fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
|
|
|
|
let mut buffer = [0; 65536];
|
|
|
|
let mut total = 0;
|
|
|
|
loop {
|
|
|
|
match reader.read(&mut buffer) {
|
|
|
|
Ok(0) => return Ok(total),
|
|
|
|
Ok(n) => {
|
|
|
|
hasher.update(&buffer[..n]);
|
|
|
|
total += n as u64;
|
|
|
|
}
|
|
|
|
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
|
|
|
|
Err(e) => return Err(e),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-12 08:13:16 +01:00
|
|
|
// The slow path, for inputs that we can't memmap.
|
2020-02-03 17:35:50 +01:00
|
|
|
fn hash_reader(base_hasher: &blake3::Hasher, reader: impl Read) -> Result<blake3::OutputReader> {
|
2019-12-12 08:13:16 +01:00
|
|
|
let mut hasher = base_hasher.clone();
|
2020-02-03 17:35:50 +01:00
|
|
|
// This is currently all single-threaded. Doing multi-threaded hashing
|
|
|
|
// without memory mapping is tricky, since all your worker threads have to
|
|
|
|
// stop every time you refill the buffer, and that ends up being a lot of
|
|
|
|
// overhead. To solve that, we need a more complicated double-buffering
|
|
|
|
// strategy where a background thread fills one buffer while the worker
|
|
|
|
// threads are hashing the other one. We might implement that in the
|
|
|
|
// future, but since this is the slow path anyway, it's not high priority.
|
|
|
|
copy_wide(reader, &mut hasher)?;
|
2019-12-12 08:13:16 +01:00
|
|
|
Ok(hasher.finalize_xof())
|
|
|
|
}
|
|
|
|
|
|
|
|
// Memory-maps `file` when that is possible and worthwhile. Returns `Ok(None)`
// when the caller should fall back to ordinary reads, and an error if the
// metadata lookup or the mapping itself fails.
fn maybe_memmap_file(file: &File) -> Result<Option<memmap::Mmap>> {
    let metadata = file.metadata()?;
    let file_size = metadata.len();

    // Not a real file.
    if !metadata.is_file() {
        return Ok(None);
    }
    // Too long to safely map.
    // https://github.com/danburkert/memmap-rs/issues/69
    if file_size > isize::max_value() as u64 {
        return Ok(None);
    }
    // Mapping an empty file currently fails.
    // https://github.com/danburkert/memmap-rs/issues/72
    if file_size == 0 {
        return Ok(None);
    }
    // Mapping small files is not worth it.
    if file_size < 16 * 1024 {
        return Ok(None);
    }

    // Explicitly set the length of the memory map, so that filesystem changes
    // can't race to violate the invariants we just checked.
    let map = unsafe {
        memmap::MmapOptions::new()
            .len(file_size as usize)
            .map(file)?
    };
    Ok(Some(map))
}
|
|
|
|
|
|
|
|
// The fast path: Try to hash a file by mem-mapping it first. This is faster if
|
|
|
|
// it works, but it's not always possible.
|
|
|
|
fn maybe_hash_memmap(
|
2019-12-13 19:27:01 +01:00
|
|
|
_base_hasher: &blake3::Hasher,
|
|
|
|
_file: &File,
|
2019-12-12 08:13:16 +01:00
|
|
|
) -> Result<Option<blake3::OutputReader>> {
|
2020-03-16 17:06:16 +01:00
|
|
|
if let Some(map) = maybe_memmap_file(_file)? {
|
|
|
|
// Memory mapping worked. Use Rayon-based multi-threading to split
|
|
|
|
// up the whole file across many worker threads.
|
|
|
|
return Ok(Some(
|
|
|
|
_base_hasher
|
|
|
|
.clone()
|
|
|
|
.update_with_join::<blake3::join::RayonJoin>(&map)
|
|
|
|
.finalize_xof(),
|
|
|
|
));
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
2019-12-12 20:40:56 +01:00
|
|
|
Ok(None)
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
|
|
|
|
2020-01-13 22:12:47 +01:00
|
|
|
fn write_hex_output(mut output: blake3::OutputReader, mut len: u64) -> Result<()> {
|
2019-12-12 08:13:16 +01:00
|
|
|
// Encoding multiples of the block size is most efficient.
|
|
|
|
let mut block = [0; blake3::BLOCK_LEN];
|
|
|
|
while len > 0 {
|
|
|
|
output.fill(&mut block);
|
|
|
|
let hex_str = hex::encode(&block[..]);
|
|
|
|
let take_bytes = cmp::min(len, block.len() as u64);
|
|
|
|
print!("{}", &hex_str[..2 * take_bytes as usize]);
|
|
|
|
len -= take_bytes;
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2020-01-14 23:35:18 +01:00
|
|
|
fn write_raw_output(output: blake3::OutputReader, len: u64) -> Result<()> {
|
|
|
|
let mut output = output.take(len);
|
2020-01-13 23:40:30 +01:00
|
|
|
let stdout = std::io::stdout();
|
|
|
|
let mut handler = stdout.lock();
|
2020-01-14 23:35:18 +01:00
|
|
|
std::io::copy(&mut output, &mut handler)?;
|
2020-01-13 22:12:47 +01:00
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2019-12-12 08:13:16 +01:00
|
|
|
// Errors from this function get handled by the file loop and printed per-file.
|
|
|
|
fn hash_file(
|
|
|
|
base_hasher: &blake3::Hasher,
|
|
|
|
filepath: &std::ffi::OsStr,
|
2020-01-20 18:02:50 +01:00
|
|
|
mmap_disabled: bool,
|
2019-12-12 08:13:16 +01:00
|
|
|
) -> Result<blake3::OutputReader> {
|
|
|
|
let file = File::open(filepath)?;
|
2020-01-20 18:02:50 +01:00
|
|
|
if !mmap_disabled {
|
|
|
|
if let Some(output) = maybe_hash_memmap(&base_hasher, &file)? {
|
|
|
|
return Ok(output); // the fast path
|
|
|
|
}
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
2020-01-20 18:02:50 +01:00
|
|
|
// the slow path
|
|
|
|
hash_reader(&base_hasher, file)
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
|
|
|
|
2019-12-13 21:56:21 +01:00
|
|
|
fn read_key_from_stdin() -> Result<[u8; blake3::KEY_LEN]> {
|
|
|
|
let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1);
|
|
|
|
let n = std::io::stdin()
|
|
|
|
.lock()
|
|
|
|
.take(blake3::KEY_LEN as u64 + 1)
|
|
|
|
.read_to_end(&mut bytes)?;
|
|
|
|
if n < 32 {
|
|
|
|
bail!(
|
|
|
|
"expected {} key bytes from stdin, found {}",
|
|
|
|
blake3::KEY_LEN,
|
|
|
|
n,
|
|
|
|
)
|
|
|
|
} else if n > 32 {
|
|
|
|
bail!("read more than {} key bytes from stdin", blake3::KEY_LEN)
|
|
|
|
} else {
|
|
|
|
Ok(bytes[..blake3::KEY_LEN].try_into().unwrap())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-04 03:13:58 +02:00
|
|
|
// A file path rendered for printing, as produced by `filepath_to_string`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FilepathString {
    // The path as text, with backslashes and newlines escaped if it had any.
    filepath_string: String,
    // True if `filepath_string` contains escape sequences.
    has_escapes: bool,
}

// Converts a path to a printable string. Invalid Unicode is replaced lossily.
// A backslash is written as `\\` and a newline as `\n`, and `has_escapes`
// reports whether any such replacement was made.
fn filepath_to_string(filepath_osstr: &OsStr) -> FilepathString {
    // `into_owned` reuses the allocation when the lossy conversion already
    // had to build a new string.
    let mut filepath_string = filepath_osstr.to_string_lossy().into_owned();
    // If we're on Windows, normalize backslashes to forward slashes. This
    // avoids a lot of ugly escaping in the common case, and it makes
    // checkfiles created on Windows more likely to be portable to Unix. It
    // also allows us to set a blanket "no backslashes allowed in checkfiles on
    // Windows" rule, rather than allowing a Unix backslash to potentially get
    // interpreted as a directory separator on Windows.
    if cfg!(windows) {
        filepath_string = filepath_string.replace('\\', "/");
    }
    let has_escapes = filepath_string.contains(|c: char| c == '\\' || c == '\n');
    if has_escapes {
        // Backslashes must be doubled first, so that the backslash introduced
        // for a newline isn't escaped a second time.
        filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n");
    }
    FilepathString {
        filepath_string,
        has_escapes,
    }
}
|
|
|
|
|
2019-12-12 08:13:16 +01:00
|
|
|
fn main() -> Result<()> {
|
2019-12-13 21:56:21 +01:00
|
|
|
let args = clap_parse_argv();
|
2020-03-16 17:06:16 +01:00
|
|
|
let len = if let Some(length) = args.value_of(LENGTH_ARG) {
|
|
|
|
length.parse::<u64>().context("Failed to parse length.")?
|
|
|
|
} else {
|
|
|
|
blake3::OUT_LEN as u64
|
|
|
|
};
|
2019-12-13 21:56:21 +01:00
|
|
|
let base_hasher = if args.is_present(KEYED_ARG) {
|
|
|
|
blake3::Hasher::new_keyed(&read_key_from_stdin()?)
|
2019-12-27 23:20:38 +01:00
|
|
|
} else if let Some(context) = args.value_of(DERIVE_KEY_ARG) {
|
|
|
|
blake3::Hasher::new_derive_key(context)
|
2019-12-12 08:13:16 +01:00
|
|
|
} else {
|
|
|
|
blake3::Hasher::new()
|
|
|
|
};
|
2020-01-20 18:02:50 +01:00
|
|
|
let mmap_disabled = args.is_present(NO_MMAP_ARG);
|
2019-12-13 21:56:21 +01:00
|
|
|
let print_names = !args.is_present(NO_NAMES_ARG);
|
2020-01-13 22:12:47 +01:00
|
|
|
let raw_output = args.is_present(RAW_ARG);
|
2020-03-16 17:06:16 +01:00
|
|
|
let mut thread_pool_builder = rayon::ThreadPoolBuilder::new();
|
|
|
|
if let Some(num_threads_str) = args.value_of(NUM_THREADS_ARG) {
|
|
|
|
let num_threads: usize = num_threads_str
|
|
|
|
.parse()
|
|
|
|
.context("Failed to parse num threads.")?;
|
|
|
|
thread_pool_builder = thread_pool_builder.num_threads(num_threads);
|
|
|
|
}
|
2020-01-13 22:12:47 +01:00
|
|
|
|
2020-03-16 17:06:16 +01:00
|
|
|
let thread_pool = thread_pool_builder.build()?;
|
|
|
|
thread_pool.install(|| {
|
|
|
|
let mut did_error = false;
|
|
|
|
if let Some(files) = args.values_of_os(FILE_ARG) {
|
|
|
|
if raw_output && files.len() > 1 {
|
|
|
|
bail!("b3sum: Only one filename can be provided when using --raw");
|
|
|
|
}
|
2020-05-04 03:13:58 +02:00
|
|
|
for filepath_osstr in files {
|
|
|
|
let FilepathString {
|
|
|
|
filepath_string,
|
|
|
|
has_escapes,
|
|
|
|
} = filepath_to_string(filepath_osstr);
|
|
|
|
match hash_file(&base_hasher, filepath_osstr, mmap_disabled) {
|
2020-03-16 17:06:16 +01:00
|
|
|
Ok(output) => {
|
|
|
|
if raw_output {
|
|
|
|
write_raw_output(output, len)?;
|
2020-01-13 22:12:47 +01:00
|
|
|
} else {
|
2020-05-04 03:13:58 +02:00
|
|
|
if has_escapes {
|
|
|
|
print!("\\");
|
|
|
|
}
|
2020-03-16 17:06:16 +01:00
|
|
|
write_hex_output(output, len)?;
|
|
|
|
if print_names {
|
2020-05-04 03:13:58 +02:00
|
|
|
println!(" {}", filepath_string);
|
2020-03-16 17:06:16 +01:00
|
|
|
} else {
|
|
|
|
println!();
|
|
|
|
}
|
2020-01-13 22:12:47 +01:00
|
|
|
}
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
2020-03-16 17:06:16 +01:00
|
|
|
Err(e) => {
|
|
|
|
did_error = true;
|
2020-05-04 03:13:58 +02:00
|
|
|
eprintln!("b3sum: {}: {}", filepath_string, e);
|
2020-03-16 17:06:16 +01:00
|
|
|
}
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|
|
|
|
}
|
2020-01-13 22:12:47 +01:00
|
|
|
} else {
|
2020-03-16 17:06:16 +01:00
|
|
|
let stdin = std::io::stdin();
|
|
|
|
let stdin = stdin.lock();
|
|
|
|
let output = hash_reader(&base_hasher, stdin)?;
|
|
|
|
if raw_output {
|
|
|
|
write_raw_output(output, len)?;
|
|
|
|
} else {
|
|
|
|
write_hex_output(output, len)?;
|
|
|
|
println!();
|
|
|
|
}
|
2020-01-13 22:12:47 +01:00
|
|
|
}
|
2020-03-16 17:06:16 +01:00
|
|
|
std::process::exit(if did_error { 1 } else { 0 });
|
|
|
|
})
|
2019-12-12 08:13:16 +01:00
|
|
|
}
|