1
0
Fork 0
mirror of https://github.com/BLAKE3-team/BLAKE3 synced 2024-04-27 16:55:04 +02:00

Merge branch 'master' into wasm-simd

This commit is contained in:
Ivan Boldyrev 2024-01-23 17:07:04 +01:00
commit 009438cf3d
14 changed files with 2107 additions and 167 deletions

2
.git-blame-ignore-revs Normal file
View File

@ -0,0 +1,2 @@
# CMakeLists.txt whitespace fixups
3e14f865d30271c74fc68d417af488ea91b66d48

View File

@ -38,12 +38,10 @@ jobs:
]
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
# Print the compiler version, for debugging.
- name: print compiler version
run: cargo run --quiet
@ -133,6 +131,17 @@ jobs:
run: cargo test
working-directory: ./reference_impl
# the new guts crate
- name: guts test
run: cargo test --all-features
working-directory: ./rust/guts
- name: guts no_std build
run: cargo build --no-default-features
working-directory: ./rust/guts
- name: guts no_std test # note that rust/guts/src/test.rs still uses libstd
run: cargo test --no-default-features
working-directory: ./rust/guts
b3sum_tests:
name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }}
runs-on: ${{ matrix.target.os }}
@ -156,12 +165,10 @@ jobs:
]
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
# Test b3sum.
- name: test b3sum
run: cargo test
@ -186,11 +193,8 @@ jobs:
- s390x-unknown-linux-gnu
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- run: cargo install cross
# Test the portable implementation on everything.
- run: cross test --target ${{ matrix.arch }}
@ -216,7 +220,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
# Test the intrinsics-based implementations.
- run: make -f Makefile.testing test
working-directory: ./c
@ -268,12 +272,10 @@ jobs:
strategy:
fail-fast: false
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
target: aarch64-apple-darwin
override: true
targets: aarch64-apple-darwin
- name: build blake3
run: cargo build --target aarch64-apple-darwin
- name: build b3sum
@ -284,7 +286,7 @@ jobs:
name: build with the Tiny C Compiler
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: install TCC
run: sudo apt-get install -y tcc
- name: compile
@ -301,7 +303,7 @@ jobs:
name: "compile and test with GCC 5.4"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: addnab/docker-run-action@v3
with:
image: gcc:5.4
@ -314,7 +316,7 @@ jobs:
# CMake build test (Library only), current macOS/Linux only.
cmake_build:
name: CMake ${{ matrix.os }}
name: CMake ${{ matrix.os }} ${{ matrix.compiler }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@ -329,7 +331,7 @@ jobs:
- os: macOS-latest
compiler: msvc
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: CMake generation
run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target
- name: CMake build / install

View File

@ -23,18 +23,16 @@ jobs:
]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: "3.x"
- run: pip install PyGithub
- run: sudo apt-get install musl-tools
if: matrix.target.os == 'ubuntu-latest'
- uses: actions-rs/toolchain@v1
- uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
profile: minimal
- run: rustup target add ${{ matrix.target.rust-target }}
targets: ${{ matrix.target.rust-target }}
- name: build b3sum
id: build_b3sum
run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }}

View File

@ -102,7 +102,7 @@ arrayvec = { version = "0.7.4", default-features = false }
constant_time_eq = "0.3.0"
cfg-if = "1.0.0"
digest = { version = "0.10.1", features = [ "mac" ], optional = true }
memmap2 = { version = "0.7.1", optional = true }
memmap2 = { version = "0.9", optional = true }
rayon = { version = "1.2.1", optional = true }
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }

View File

@ -201,6 +201,7 @@ Alternatively, it is licensed under the Apache License 2.0.
Here's a (non-exhaustive) list of protocols and software that use BLAKE3:
* [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala)
* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0)
* [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16)
* [IPFS](https://github.com/ipfs/go-verifcid/issues/13)
* [Farcaster](https://www.farcaster.xyz/)

263
b3sum/Cargo.lock generated
View File

@ -4,9 +4,9 @@ version = 3
[[package]]
name = "anstream"
version = "0.5.0"
version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c"
checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5"
dependencies = [
"anstyle",
"anstyle-parse",
@ -18,43 +18,43 @@ dependencies = [
[[package]]
name = "anstyle"
version = "1.0.3"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
[[package]]
name = "anstyle-parse"
version = "0.2.1"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.0"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
dependencies = [
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "2.1.0"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd"
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
dependencies = [
"anstyle",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "anyhow"
version = "1.0.75"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
[[package]]
name = "arrayref"
@ -68,12 +68,6 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "b3sum"
version = "1.5.0"
@ -83,7 +77,6 @@ dependencies = [
"clap",
"duct",
"hex",
"memmap2",
"rayon",
"tempfile",
"wild",
@ -97,9 +90,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.4.0"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
[[package]]
name = "blake3"
@ -131,9 +124,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.4.4"
version = "4.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136"
checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c"
dependencies = [
"clap_builder",
"clap_derive",
@ -141,9 +134,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.4.4"
version = "4.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56"
checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7"
dependencies = [
"anstream",
"anstyle",
@ -154,9 +147,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.4.2"
version = "4.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873"
checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442"
dependencies = [
"heck",
"proc-macro2",
@ -166,9 +159,9 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.5.1"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961"
checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "colorchoice"
@ -184,42 +177,34 @@ checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2"
[[package]]
name = "crossbeam-deque"
version = "0.8.3"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.15"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.16"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
dependencies = [
"cfg-if",
]
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "duct"
version = "0.13.6"
version = "0.13.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e"
checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c"
dependencies = [
"libc",
"once_cell",
@ -235,30 +220,19 @@ checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "errno"
version = "0.3.3"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "fastrand"
version = "2.0.0"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "glob"
@ -280,73 +254,64 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "libc"
version = "0.2.148"
version = "0.2.152"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7"
[[package]]
name = "linux-raw-sys"
version = "0.4.7"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "memmap2"
version = "0.7.1"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6"
checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92"
dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.18.0"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "os_pipe"
version = "1.1.4"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177"
checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "proc-macro2"
version = "1.0.67"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.33"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.8.0"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051"
dependencies = [
"either",
"rayon-core",
@ -354,9 +319,9 @@ dependencies = [
[[package]]
name = "rayon-core"
version = "1.12.0"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
@ -364,32 +329,26 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.3.5"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "rustix"
version = "0.38.14"
version = "0.38.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f"
checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca"
dependencies = [
"bitflags 2.4.0",
"bitflags 2.4.2",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "shared_child"
version = "1.0.0"
@ -408,9 +367,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "syn"
version = "2.0.37"
version = "2.0.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8"
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
dependencies = [
"proc-macro2",
"quote",
@ -419,15 +378,15 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.8.0"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall",
"rustix",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
@ -437,7 +396,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
"rustix",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -454,9 +413,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "wild"
version = "2.1.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74"
checksum = "10d01931a94d5a115a53f95292f51d316856b68a035618eb831bbba593a30b67"
dependencies = [
"glob",
]
@ -489,7 +448,16 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.0",
]
[[package]]
@ -498,13 +466,28 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
dependencies = [
"windows_aarch64_gnullvm 0.52.0",
"windows_aarch64_msvc 0.52.0",
"windows_i686_gnu 0.52.0",
"windows_i686_msvc 0.52.0",
"windows_x86_64_gnu 0.52.0",
"windows_x86_64_gnullvm 0.52.0",
"windows_x86_64_msvc 0.52.0",
]
[[package]]
@ -513,38 +496,80 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"

View File

@ -18,7 +18,6 @@ anyhow = "1.0.25"
blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
rayon = "1.2.1"
wild = "2.0.3"

View File

@ -1,4 +1,9 @@
cmake_minimum_required(VERSION 3.9)
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD
if (POLICY CMP0128)
cmake_policy(SET CMP0128 NEW)
endif()
project(libblake3
VERSION 1.5.0
@ -9,8 +14,12 @@ project(libblake3
include(FeatureSummary)
include(GNUInstallDirs)
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# default SIMD compiler flag configuration (can be overriden by toolchains or CLI)
if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
# MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170)
set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1")
@ -24,11 +33,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
# 32-bit ARMv8 needs NEON to be enabled explicitly
set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
endif()
endif()
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# library target
add_library(blake3
@ -41,33 +52,47 @@ add_library(BLAKE3::blake3 ALIAS blake3)
# library configuration
set(BLAKE3_PKGCONFIG_CFLAGS)
if (BUILD_SHARED_LIBS)
target_compile_definitions(blake3
target_compile_definitions(blake3
PUBLIC BLAKE3_DLL
PRIVATE BLAKE3_DLL_EXPORTS
)
list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL)
endif()
target_include_directories(blake3 PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_include_directories(blake3 PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
set_target_properties(blake3 PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION 0
C_VISIBILITY_PRESET hidden
C_EXTENSIONS OFF
)
target_compile_features(blake3 PUBLIC c_std_99)
# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD
# which may be set by the user or toolchain file
if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD)
set_target_properties(blake3 PROPERTIES C_STANDARD 99)
endif()
# optional SIMD sources
macro(BLAKE3_DISABLE_SIMD)
set(BLAKE3_SIMD_AMD64_ASM OFF)
set(BLAKE3_SIMD_X86_INTRINSICS OFF)
set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
set_source_files_properties(blake3_dispatch.c PROPERTIES
COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
BLAKE3_NO_AVX512
)
endmacro()
if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM)
set(BLAKE3_SIMD_AMD64_ASM ON)
if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
enable_language(ASM_MASM)
target_sources(blake3 PRIVATE
blake3_avx2_x86-64_windows_msvc.asm
@ -99,7 +124,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM)
BLAKE3_DISABLE_SIMD()
endif()
else()
else()
BLAKE3_DISABLE_SIMD()
endif()
@ -121,17 +146,19 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ((ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8)))
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8))
set(BLAKE3_SIMD_NEON_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_neon.c
)
set_source_files_properties(blake3_dispatch.c PROPERTIES COMPILE_DEFINITIONS BLAKE3_USE_NEON=1)
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")

18
rust/guts/Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
[package]
name = "blake3_guts"
version = "0.0.0"
authors = ["Jack O'Connor <oconnor663@gmail.com>", "Samuel Neves"]
description = "low-level building blocks for the BLAKE3 hash function"
repository = "https://github.com/BLAKE3-team/BLAKE3"
license = "CC0-1.0 OR Apache-2.0"
documentation = "https://docs.rs/blake3_guts"
readme = "readme.md"
edition = "2021"
[dev-dependencies]
hex = "0.4.3"
reference_impl = { path = "../../reference_impl" }
[features]
default = ["std"]
std = []

80
rust/guts/readme.md Normal file
View File

@ -0,0 +1,80 @@
# The BLAKE3 Guts API
## Introduction
This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains
low-level, high-performance, platform-specific implementations of the BLAKE3
compression function. This API is complicated and unsafe, and this crate will
never have a stable release. Most callers should instead use the
[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend
on this one internally.
The code you see here (as of January 2024) is an early stage of a large planned
refactor. The motivation for this refactor is a couple of missing features in
both the Rust and C implementations:
- The output side
([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html)
in Rust) doesn't take advantage of the most important SIMD optimizations that
compute multiple blocks in parallel. This blocks any project that wants to
use the BLAKE3 XOF as a stream cipher
([[1]](https://github.com/oconnor663/bessie),
[[2]](https://github.com/oconnor663/blake3_aead)).
- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need
interior nodes of the tree also don't get those SIMD optimizations. They have
to use a slow, minimalistic, unstable, doc-hidden module [(also called
`guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs).
The difficulty with adding those features is that they require changes to all
of our optimized assembly and C intrinsics code. That's a couple dozen
different files that are large, platform-specific, difficult to understand, and
full of duplicated code. The higher-level Rust and C implementations of BLAKE3
both depend on these files and will need to coordinate changes.
At the same time, it won't be long before we add support for more platforms:
- RISCV vector extensions
- ARM SVE
- WebAssembly SIMD
It's important to get this refactor done before new platforms make it even
harder to do.
## The private guts API
This is the API that each platform reimplements, so we want it to be as simple
as possible apart from the high-performance work it needs to do. It's
completely `unsafe`, and inputs and outputs are raw pointers that are allowed
to alias (this matters for `hash_parents`, see below).
- `degree`
- `compress`
- The single compression function, for short inputs and odd-length tails.
- `hash_chunks`
- `hash_parents`
- `xof`
- `xof_xor`
- As `xof` but XOR'ing the result into the output buffer.
- `universal_hash`
- This is a new construction specifically to support
[BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some
implementations might just stub it out with portable code.
## The public guts API
This is the API that this crate exposes to callers, i.e. to the main `blake3`
crate. It's a thin, portable layer on top of the private API above. The Rust
version of this API is memory-safe.
- `degree`
- `compress`
- `hash_chunks`
- `hash_parents`
- This handles most levels of the tree, where we keep hashing SIMD_DEGREE
parents at a time.
- `reduce_parents`
- This uses the same `hash_parents` private API, but it handles the top
levels of the tree where we reduce in-place to the root parent node.
- `xof`
- `xof_xor`
- `universal_hash`

1000
rust/guts/src/lib.rs Normal file
View File

@ -0,0 +1,1000 @@
//! # The BLAKE3 Guts API
//!
//! See `readme.md`.
//!
//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`]
//! that atomically initializes itself the first time you use it.
//!
//! # Example
//!
//! ```rust
//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT};
//!
//! // Hash an input of exactly two chunks.
//! let input = [0u8; 2048];
//! let mut outputs = TransposedVectors::new();
//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs);
//! DETECTED_IMPL.hash_chunks(
//! &input,
//! &IV_BYTES,
//! 0, // counter
//! 0, // flags
//! left_outputs,
//! );
//! let root_node = outputs.extract_parent_node(0);
//! let hash = DETECTED_IMPL.compress(
//! &root_node,
//! 64, // block_len
//! &IV_BYTES,
//! 0, // counter
//! PARENT | ROOT,
//! );
//!
//! // Compute the same hash using the reference implementation.
//! let mut reference_hasher = reference_impl::Hasher::new();
//! reference_hasher.update(&input);
//! let mut expected_hash = [0u8; 32];
//! reference_hasher.finalize(&mut expected_hash);
//!
//! assert_eq!(hash, expected_hash);
//! ```
// Tests always require libstd.
#![cfg_attr(all(not(feature = "std"), not(test)), no_std)]
use core::cmp;
use core::marker::PhantomData;
use core::mem;
use core::ptr;
use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};
pub mod portable;
#[cfg(test)]
mod test;
pub const OUT_LEN: usize = 32;
pub const BLOCK_LEN: usize = 64;
pub const CHUNK_LEN: usize = 1024;
pub const WORD_LEN: usize = 4;
pub const UNIVERSAL_HASH_LEN: usize = 16;
pub const CHUNK_START: u32 = 1 << 0;
pub const CHUNK_END: u32 = 1 << 1;
pub const PARENT: u32 = 1 << 2;
pub const ROOT: u32 = 1 << 3;
pub const KEYED_HASH: u32 = 1 << 4;
pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5;
pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6;
pub const IV: CVWords = [
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV);
pub const MSG_SCHEDULE: [[usize; 16]; 7] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
[3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
[10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
[12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
[9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
[11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];
// never less than 2
pub const MAX_SIMD_DEGREE: usize = 2;
pub type CVBytes = [u8; 32];
pub type CVWords = [u32; 8];
pub type BlockBytes = [u8; 64];
pub type BlockWords = [u32; 16];
pub static DETECTED_IMPL: Implementation = Implementation::new(
degree_init,
compress_init,
hash_chunks_init,
hash_parents_init,
xof_init,
xof_xor_init,
universal_hash_init,
);
fn detect() -> Implementation {
portable::implementation()
}
fn init_detected_impl() {
let detected = detect();
DETECTED_IMPL
.degree_ptr
.store(detected.degree_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.compress_ptr
.store(detected.compress_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.hash_chunks_ptr
.store(detected.hash_chunks_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.hash_parents_ptr
.store(detected.hash_parents_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.xof_ptr
.store(detected.xof_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.xof_xor_ptr
.store(detected.xof_xor_ptr.load(Relaxed), Relaxed);
DETECTED_IMPL
.universal_hash_ptr
.store(detected.universal_hash_ptr.load(Relaxed), Relaxed);
}
pub struct Implementation {
degree_ptr: AtomicPtr<()>,
compress_ptr: AtomicPtr<()>,
hash_chunks_ptr: AtomicPtr<()>,
hash_parents_ptr: AtomicPtr<()>,
xof_ptr: AtomicPtr<()>,
xof_xor_ptr: AtomicPtr<()>,
universal_hash_ptr: AtomicPtr<()>,
}
impl Implementation {
const fn new(
degree_fn: DegreeFn,
compress_fn: CompressFn,
hash_chunks_fn: HashChunksFn,
hash_parents_fn: HashParentsFn,
xof_fn: XofFn,
xof_xor_fn: XofFn,
universal_hash_fn: UniversalHashFn,
) -> Self {
Self {
degree_ptr: AtomicPtr::new(degree_fn as *mut ()),
compress_ptr: AtomicPtr::new(compress_fn as *mut ()),
hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()),
hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()),
xof_ptr: AtomicPtr::new(xof_fn as *mut ()),
xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()),
universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()),
}
}
#[inline]
fn degree_fn(&self) -> DegreeFn {
unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) }
}
#[inline]
pub fn degree(&self) -> usize {
let degree = unsafe { self.degree_fn()() };
debug_assert!(degree >= 2);
debug_assert!(degree <= MAX_SIMD_DEGREE);
debug_assert_eq!(1, degree.count_ones(), "power of 2");
degree
}
#[inline]
pub fn split_transposed_vectors<'v>(
&self,
vectors: &'v mut TransposedVectors,
) -> (TransposedSplit<'v>, TransposedSplit<'v>) {
unsafe { vectors.split(self.degree()) }
}
#[inline]
fn compress_fn(&self) -> CompressFn {
unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) }
}
#[inline]
pub fn compress(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
counter: u64,
flags: u32,
) -> CVBytes {
let mut out = [0u8; 32];
unsafe {
self.compress_fn()(block, block_len, cv, counter, flags, &mut out);
}
out
}
// The contract for HashChunksFn doesn't require the implementation to support single-chunk
// inputs. Instead we handle that case here by calling compress in a loop.
#[inline]
fn hash_one_chunk(
&self,
mut input: &[u8],
key: &CVBytes,
counter: u64,
mut flags: u32,
output: TransposedSplit,
) {
debug_assert!(input.len() <= CHUNK_LEN);
let mut cv = *key;
flags |= CHUNK_START;
while input.len() > BLOCK_LEN {
cv = self.compress(
input[..BLOCK_LEN].try_into().unwrap(),
BLOCK_LEN as u32,
&cv,
counter,
flags,
);
input = &input[BLOCK_LEN..];
flags &= !CHUNK_START;
}
let mut final_block = [0u8; BLOCK_LEN];
final_block[..input.len()].copy_from_slice(input);
cv = self.compress(
&final_block,
input.len() as u32,
&cv,
counter,
flags | CHUNK_END,
);
unsafe {
write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr);
}
}
#[inline]
fn hash_chunks_fn(&self) -> HashChunksFn {
unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) }
}
#[inline]
pub fn hash_chunks(
&self,
input: &[u8],
key: &CVBytes,
counter: u64,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(input.len() <= self.degree() * CHUNK_LEN);
if input.len() <= CHUNK_LEN {
// The underlying hash_chunks_fn isn't required to support this case. Instead we handle
// it by calling compress_fn in a loop. But note that we still don't support root
// finalization or the empty input here.
self.hash_one_chunk(input, key, counter, flags, transposed_output);
return 1;
}
// SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently
// ignore the remainder. This makes it impossible to write out of bounds in a properly
// constructed TransposedSplit.
let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN);
unsafe {
self.hash_chunks_fn()(
input.as_ptr(),
len,
key,
counter,
flags,
transposed_output.ptr,
);
}
if input.len() % CHUNK_LEN == 0 {
input.len() / CHUNK_LEN
} else {
(input.len() / CHUNK_LEN) + 1
}
}
#[inline]
fn hash_parents_fn(&self) -> HashParentsFn {
unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) }
}
#[inline]
pub fn hash_parents(
&self,
transposed_input: &TransposedVectors,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(
transposed_input.as_ptr(),
num_parents,
key,
flags | PARENT,
transposed_output.ptr,
);
}
if num_cvs % 2 == 1 {
unsafe {
write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents));
}
num_parents + 1
} else {
num_parents
}
}
#[inline]
pub fn reduce_parents(
&self,
transposed_in_out: &mut TransposedVectors,
mut num_cvs: usize,
key: &CVBytes,
flags: u32,
) -> usize {
debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE);
// SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses.
num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE);
let in_out_ptr = transposed_in_out.as_mut_ptr();
let mut odd_cv = [0u32; 8];
if num_cvs % 2 == 1 {
unsafe {
odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1));
}
}
let num_parents = num_cvs / 2;
unsafe {
self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr);
}
if num_cvs % 2 == 1 {
unsafe {
write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents));
}
num_parents + 1
} else {
num_parents
}
}
#[inline]
fn xof_fn(&self) -> XofFn {
unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) }
}
#[inline]
pub fn xof(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
mut counter: u64,
flags: u32,
mut out: &mut [u8],
) {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
while !out.is_empty() {
let take = cmp::min(simd_len, out.len());
unsafe {
self.xof_fn()(
block,
block_len,
cv,
counter,
flags | ROOT,
out.as_mut_ptr(),
take,
);
}
out = &mut out[take..];
counter += degree as u64;
}
}
#[inline]
fn xof_xor_fn(&self) -> XofFn {
unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) }
}
#[inline]
pub fn xof_xor(
&self,
block: &BlockBytes,
block_len: u32,
cv: &CVBytes,
mut counter: u64,
flags: u32,
mut out: &mut [u8],
) {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
while !out.is_empty() {
let take = cmp::min(simd_len, out.len());
unsafe {
self.xof_xor_fn()(
block,
block_len,
cv,
counter,
flags | ROOT,
out.as_mut_ptr(),
take,
);
}
out = &mut out[take..];
counter += degree as u64;
}
}
#[inline]
fn universal_hash_fn(&self) -> UniversalHashFn {
unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) }
}
#[inline]
pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] {
let degree = self.degree();
let simd_len = degree * BLOCK_LEN;
let mut ret = [0u8; 16];
while !input.is_empty() {
let take = cmp::min(simd_len, input.len());
let mut output = [0u8; 16];
unsafe {
self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output);
}
input = &input[take..];
counter += degree as u64;
for byte_index in 0..16 {
ret[byte_index] ^= output[byte_index];
}
}
ret
}
}
impl Clone for Implementation {
fn clone(&self) -> Self {
Self {
degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)),
compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)),
hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)),
hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)),
xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)),
xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)),
universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)),
}
}
}
// never less than 2
type DegreeFn = unsafe extern "C" fn() -> usize;
unsafe extern "C" fn degree_init() -> usize {
init_detected_impl();
DETECTED_IMPL.degree_fn()()
}
type CompressFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes, // may overlap the input
);
unsafe extern "C" fn compress_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes,
) {
init_detected_impl();
DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out);
}
type CompressXofFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes, // may overlap the input
);
type HashChunksFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
);
unsafe extern "C" fn hash_chunks_init(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
init_detected_impl();
DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output);
}
type HashParentsFn = unsafe extern "C" fn(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
);
unsafe extern "C" fn hash_parents_init(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32,
) {
init_detected_impl();
DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output);
}
// This signature covers both xof() and xof_xor().
type XofFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
);
unsafe extern "C" fn xof_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
init_detected_impl();
DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len);
}
unsafe extern "C" fn xof_xor_init(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
init_detected_impl();
DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len);
}
type UniversalHashFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
);
unsafe extern "C" fn universal_hash_init(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
init_detected_impl();
DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out);
}
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_chunks_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const CVBytes,
mut counter: u64,
flags: u32,
mut transposed_output: *mut u32,
) {
debug_assert!(input_len > 0);
debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN);
input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN);
while input_len > 0 {
let mut chunk_len = cmp::min(input_len, CHUNK_LEN);
input_len -= chunk_len;
// We only use 8 words of the CV, but compress returns 16.
let mut cv = *key;
let cv_ptr: *mut CVBytes = &mut cv;
let mut chunk_flags = flags | CHUNK_START;
while chunk_len > BLOCK_LEN {
compress(
input as *const BlockBytes,
BLOCK_LEN as u32,
cv_ptr,
counter,
chunk_flags,
cv_ptr,
);
input = input.add(BLOCK_LEN);
chunk_len -= BLOCK_LEN;
chunk_flags &= !CHUNK_START;
}
let mut last_block = [0u8; BLOCK_LEN];
ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len);
input = input.add(chunk_len);
compress(
&last_block,
chunk_len as u32,
cv_ptr,
counter,
chunk_flags | CHUNK_END,
cv_ptr,
);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv_words[word_index]);
}
transposed_output = transposed_output.add(1);
counter += 1;
}
}
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_parents_using_compress(
compress: CompressFn,
mut transposed_input: *const u32,
mut num_parents: usize,
key: *const CVBytes,
flags: u32,
mut transposed_output: *mut u32, // may overlap the input
) {
debug_assert!(num_parents > 0);
debug_assert!(num_parents <= MAX_SIMD_DEGREE);
while num_parents > 0 {
let mut block_bytes = [0u8; 64];
for word_index in 0..8 {
let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read();
block_bytes[WORD_LEN * word_index..][..WORD_LEN]
.copy_from_slice(&left_child_word.to_le_bytes());
let right_child_word = transposed_input
.add(word_index * TRANSPOSED_STRIDE + 1)
.read();
block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN]
.copy_from_slice(&right_child_word.to_le_bytes());
}
let mut cv = [0u8; 32];
compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv_words[word_index]);
}
transposed_input = transposed_input.add(2);
transposed_output = transposed_output.add(1);
num_parents -= 1;
}
}
#[inline(always)]
unsafe fn xof_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN);
while out_len > 0 {
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
ptr::copy_nonoverlapping(block_output.as_ptr(), out, take);
out = out.add(take);
out_len -= take;
counter += 1;
}
}
#[inline(always)]
unsafe fn xof_xor_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN);
while out_len > 0 {
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
for i in 0..take {
*out.add(i) ^= block_output[i];
}
out = out.add(take);
out_len -= take;
counter += 1;
}
}
#[inline(always)]
unsafe fn universal_hash_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const CVBytes,
mut counter: u64,
out: *mut [u8; 16],
) {
let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT;
let mut result = [0u8; 16];
while input_len > 0 {
let block_len = cmp::min(input_len, BLOCK_LEN);
let mut block = [0u8; BLOCK_LEN];
ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len);
let mut block_output = [0u8; 32];
compress(
&block,
block_len as u32,
key,
counter,
flags,
&mut block_output,
);
for i in 0..16 {
result[i] ^= block_output[i];
}
input = input.add(block_len);
input_len -= block_len;
counter += 1;
}
*out = result;
}
// this is in units of *words*, for pointer operations on *const/*mut u32
const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE;
#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]);
impl TransposedVectors {
pub fn new() -> Self {
Self([[0; 2 * MAX_SIMD_DEGREE]; 8])
}
pub fn extract_cv(&self, cv_index: usize) -> CVBytes {
let mut words = [0u32; 8];
for word_index in 0..8 {
words[word_index] = self.0[word_index][cv_index];
}
le_bytes_from_words_32(&words)
}
pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes {
let mut bytes = [0u8; 64];
bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2));
bytes[32..].copy_from_slice(&self.extract_cv(parent_index / 2 + 1));
bytes
}
fn as_ptr(&self) -> *const u32 {
self.0[0].as_ptr()
}
fn as_mut_ptr(&mut self) -> *mut u32 {
self.0[0].as_mut_ptr()
}
// SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not
// necessarily correct) to write up to `degree` words to either side of the split, possibly
// from different threads.
unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) {
debug_assert!(degree > 0);
debug_assert!(degree <= MAX_SIMD_DEGREE);
debug_assert_eq!(degree.count_ones(), 1, "power of 2");
let ptr = self.as_mut_ptr();
let left = TransposedSplit {
ptr,
phantom_data: PhantomData,
};
let right = TransposedSplit {
ptr: ptr.wrapping_add(degree),
phantom_data: PhantomData,
};
(left, right)
}
}
pub struct TransposedSplit<'vectors> {
ptr: *mut u32,
phantom_data: PhantomData<&'vectors mut u32>,
}
unsafe impl<'vectors> Send for TransposedSplit<'vectors> {}
unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {}
unsafe fn read_transposed_cv(src: *const u32) -> CVWords {
let mut cv = [0u32; 8];
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
cv[word_index] = src.add(offset_words).read();
}
cv
}
unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) {
for word_index in 0..8 {
let offset_words = word_index * TRANSPOSED_STRIDE;
dest.add(offset_words).write(cv[word_index]);
}
}
#[inline(always)]
pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
let mut bytes = [0u8; 32];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
#[inline(always)]
pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes {
let mut bytes = [0u8; 64];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
#[inline(always)]
pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
let mut words = [0u32; 8];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[inline(always)]
pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords {
let mut words = [0u32; 16];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[test]
fn test_byte_word_round_trips() {
let cv = *b"This is 32 LE bytes/eight words.";
assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv)));
let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words.";
assert_eq!(
block,
le_bytes_from_words_64(&words_from_le_bytes_64(&block)),
);
}
// The largest power of two less than or equal to `n`, used for left_len()
// immediately below, and also directly in Hasher::update().
pub fn largest_power_of_two_leq(n: usize) -> usize {
((n / 2) + 1).next_power_of_two()
}
#[test]
fn test_largest_power_of_two_leq() {
let input_output = &[
// The zero case is nonsensical, but it does work.
(0, 1),
(1, 1),
(2, 2),
(3, 2),
(4, 4),
(5, 4),
(6, 4),
(7, 4),
(8, 8),
// the largest possible usize
(usize::MAX, (usize::MAX >> 1) + 1),
];
for &(input, output) in input_output {
assert_eq!(
output,
crate::largest_power_of_two_leq(input),
"wrong output for n={}",
input
);
}
}
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
pub fn left_len(content_len: usize) -> usize {
debug_assert!(content_len > CHUNK_LEN);
// Subtract 1 to reserve at least one byte for the right side.
let full_chunks = (content_len - 1) / CHUNK_LEN;
largest_power_of_two_leq(full_chunks) * CHUNK_LEN
}
#[test]
fn test_left_len() {
let input_output = &[
(CHUNK_LEN + 1, CHUNK_LEN),
(2 * CHUNK_LEN - 1, CHUNK_LEN),
(2 * CHUNK_LEN, CHUNK_LEN),
(2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
(4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
(4 * CHUNK_LEN, 2 * CHUNK_LEN),
(4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
];
for &(input, output) in input_output {
assert_eq!(left_len(input), output);
}
}

262
rust/guts/src/portable.rs Normal file
View File

@ -0,0 +1,262 @@
use crate::{
le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64,
BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE,
};
const DEGREE: usize = MAX_SIMD_DEGREE;
unsafe extern "C" fn degree() -> usize {
DEGREE
}
#[inline(always)]
fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
state[d] = (state[d] ^ state[a]).rotate_right(16);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(12);
state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
state[d] = (state[d] ^ state[a]).rotate_right(8);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(7);
}
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) {
// Select the message schedule based on the round.
let schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the diagonals.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
#[inline(always)]
fn compress_inner(
block_words: &BlockWords,
block_len: u32,
cv_words: &CVWords,
counter: u64,
flags: u32,
) -> [u32; 16] {
let mut state = [
cv_words[0],
cv_words[1],
cv_words[2],
cv_words[3],
cv_words[4],
cv_words[5],
cv_words[6],
cv_words[7],
IV[0],
IV[1],
IV[2],
IV[3],
counter as u32,
(counter >> 32) as u32,
block_len as u32,
flags as u32,
];
for round_number in 0..7 {
round(&mut state, &block_words, round_number);
}
state
}
pub(crate) unsafe extern "C" fn compress(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut CVBytes,
) {
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
}
*out = le_bytes_from_words_32(state[..8].try_into().unwrap());
}
pub(crate) unsafe extern "C" fn compress_xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes,
) {
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
state[word_index + 8] ^= cv_words[word_index];
}
*out = le_bytes_from_words_64(&state);
}
pub(crate) unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
) {
crate::hash_chunks_using_compress(
compress,
input,
input_len,
key,
counter,
flags,
transposed_output,
)
}
pub(crate) unsafe extern "C" fn hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
) {
crate::hash_parents_using_compress(
compress,
transposed_input,
num_parents,
key,
flags,
transposed_output,
)
}
pub(crate) unsafe extern "C" fn xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub(crate) unsafe extern "C" fn xof_xor(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_xor_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub(crate) unsafe extern "C" fn universal_hash(
input: *const u8,
input_len: usize,
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
crate::universal_hash_using_compress(compress, input, input_len, key, counter, out)
}
pub fn implementation() -> Implementation {
Implementation::new(
degree,
compress,
hash_chunks,
hash_parents,
xof,
xof_xor,
universal_hash,
)
}
#[cfg(test)]
mod test {
use super::*;
// This is circular but do it anyway.
#[test]
fn test_compress_vs_portable() {
crate::test::test_compress_vs_portable(&implementation());
}
#[test]
fn test_compress_vs_reference() {
crate::test::test_compress_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_hash_chunks_vs_portable() {
crate::test::test_hash_chunks_vs_portable(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_hash_parents_vs_portable() {
crate::test::test_hash_parents_vs_portable(&implementation());
}
#[test]
fn test_chunks_and_parents_vs_reference() {
crate::test::test_chunks_and_parents_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_xof_vs_portable() {
crate::test::test_xof_vs_portable(&implementation());
}
#[test]
fn test_xof_vs_reference() {
crate::test::test_xof_vs_reference(&implementation());
}
// This is circular but do it anyway.
#[test]
fn test_universal_hash_vs_portable() {
crate::test::test_universal_hash_vs_portable(&implementation());
}
#[test]
fn test_universal_hash_vs_reference() {
crate::test::test_universal_hash_vs_reference(&implementation());
}
}

523
rust/guts/src/test.rs Normal file
View File

@ -0,0 +1,523 @@
use crate::*;
pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
// Test a few different initial counter values.
// - 0: The base case.
// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when
// you're supposed to ANDNOT.
// - u32::MAX: The low word of the counter overflows for all inputs except the first.
// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word.
const INITIAL_COUNTERS: [u64; 4] = [
0,
i32::MAX as u64,
u32::MAX as u64,
(42u64 << 32) + u32::MAX as u64,
];
const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64];
pub fn paint_test_input(buf: &mut [u8]) {
for (i, b) in buf.iter_mut().enumerate() {
*b = (i % 251) as u8;
}
}
pub fn test_compress_vs_portable(test_impl: &Implementation) {
for block_len in BLOCK_LENGTHS {
dbg!(block_len);
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len]);
for counter in INITIAL_COUNTERS {
dbg!(counter);
let portable_cv = portable::implementation().compress(
&block,
block_len as u32,
&TEST_KEY,
counter,
KEYED_HASH,
);
let test_cv =
test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH);
assert_eq!(portable_cv, test_cv);
}
}
}
pub fn test_compress_vs_reference(test_impl: &Implementation) {
for block_len in BLOCK_LENGTHS {
dbg!(block_len);
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len]);
let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
ref_hasher.update(&block[..block_len]);
let mut ref_hash = [0u8; 32];
ref_hasher.finalize(&mut ref_hash);
let test_cv = test_impl.compress(
&block,
block_len as u32,
&TEST_KEY,
0,
CHUNK_START | CHUNK_END | ROOT | KEYED_HASH,
);
assert_eq!(ref_hash, test_cv);
}
}
fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) {
if output_a == output_b {
return;
}
for cv_index in 0..2 * MAX_SIMD_DEGREE {
let cv_a = output_a.extract_cv(cv_index);
let cv_b = output_b.extract_cv(cv_index);
if cv_a == [0; 32] && cv_b == [0; 32] {
println!("CV {cv_index:2} empty");
} else if cv_a == cv_b {
println!("CV {cv_index:2} matches");
} else {
println!("CV {cv_index:2} mismatch:");
println!(" {}", hex::encode(cv_a));
println!(" {}", hex::encode(cv_b));
}
}
panic!("transposed outputs are not equal");
}
pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) {
assert!(test_impl.degree() <= MAX_SIMD_DEGREE);
dbg!(test_impl.degree() * CHUNK_LEN);
// Allocate 4 extra bytes of padding so we can make aligned slices.
let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4];
let mut input_slice = &mut input_buf[..];
// Make sure the start of the input is word-aligned.
while input_slice.as_ptr() as usize % 4 != 0 {
input_slice = &mut input_slice[1..];
}
let (aligned_input, mut unaligned_input) =
input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN);
unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN];
assert_eq!(aligned_input.as_ptr() as usize % 4, 0);
assert_eq!(unaligned_input.as_ptr() as usize % 4, 1);
paint_test_input(aligned_input);
paint_test_input(unaligned_input);
// Try just below, equal to, and just above every whole number of chunks.
let mut input_2_lengths = Vec::new();
let mut next_len = 2 * CHUNK_LEN;
loop {
// 95 is one whole block plus one interesting part of another
input_2_lengths.push(next_len - 95);
input_2_lengths.push(next_len);
if next_len == test_impl.degree() * CHUNK_LEN {
break;
}
input_2_lengths.push(next_len + 95);
next_len += CHUNK_LEN;
}
for input_2_len in input_2_lengths {
dbg!(input_2_len);
let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN];
let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len];
let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN];
let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len];
for initial_counter in INITIAL_COUNTERS {
dbg!(initial_counter);
// Make two calls, to test the output_column parameter.
let mut portable_output = TransposedVectors::new();
let (portable_left, portable_right) =
test_impl.split_transposed_vectors(&mut portable_output);
portable::implementation().hash_chunks(
aligned_input1,
&IV_BYTES,
initial_counter,
0,
portable_left,
);
portable::implementation().hash_chunks(
aligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
portable_right,
);
let mut test_output = TransposedVectors::new();
let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output);
test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left);
test_impl.hash_chunks(
aligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
test_right,
);
check_transposed_eq(&portable_output, &test_output);
// Do the same thing with unaligned input.
let mut unaligned_test_output = TransposedVectors::new();
let (unaligned_left, unaligned_right) =
test_impl.split_transposed_vectors(&mut unaligned_test_output);
test_impl.hash_chunks(
unaligned_input1,
&IV_BYTES,
initial_counter,
0,
unaligned_left,
);
test_impl.hash_chunks(
unaligned_input2,
&TEST_KEY,
initial_counter + test_impl.degree() as u64,
KEYED_HASH,
unaligned_right,
);
check_transposed_eq(&portable_output, &unaligned_test_output);
}
}
}
fn painted_transposed_input() -> TransposedVectors {
let mut vectors = TransposedVectors::new();
let mut val = 0;
for col in 0..2 * MAX_SIMD_DEGREE {
for row in 0..8 {
vectors.0[row][col] = val;
val += 1;
}
}
vectors
}
pub fn test_hash_parents_vs_portable(test_impl: &Implementation) {
assert!(test_impl.degree() <= MAX_SIMD_DEGREE);
let input = painted_transposed_input();
for num_parents in 2..=(test_impl.degree() / 2) {
dbg!(num_parents);
let mut portable_output = TransposedVectors::new();
let (portable_left, portable_right) =
test_impl.split_transposed_vectors(&mut portable_output);
portable::implementation().hash_parents(
&input,
2 * num_parents, // num_cvs
&IV_BYTES,
0,
portable_left,
);
portable::implementation().hash_parents(
&input,
2 * num_parents, // num_cvs
&TEST_KEY,
KEYED_HASH,
portable_right,
);
let mut test_output = TransposedVectors::new();
let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output);
test_impl.hash_parents(
&input,
2 * num_parents, // num_cvs
&IV_BYTES,
0,
test_left,
);
test_impl.hash_parents(
&input,
2 * num_parents, // num_cvs
&TEST_KEY,
KEYED_HASH,
test_right,
);
check_transposed_eq(&portable_output, &test_output);
}
}
fn hash_with_chunks_and_parents_recurse(
test_impl: &Implementation,
input: &[u8],
counter: u64,
output: TransposedSplit,
) -> usize {
assert!(input.len() > 0);
if input.len() <= test_impl.degree() * CHUNK_LEN {
return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output);
}
let (left_input, right_input) = input.split_at(left_len(input.len()));
let mut child_output = TransposedVectors::new();
let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output);
let mut children =
hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output);
assert_eq!(children, test_impl.degree());
children += hash_with_chunks_and_parents_recurse(
test_impl,
right_input,
counter + (left_input.len() / CHUNK_LEN) as u64,
right_output,
);
test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output)
}
// Note: This test implementation doesn't support the 1-chunk-or-less case.
fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes {
// TODO: handle the 1-chunk case?
assert!(input.len() > CHUNK_LEN);
let mut cvs = TransposedVectors::new();
// The right half of these vectors are never used.
let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs);
let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left);
while num_cvs > 2 {
num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0);
}
test_impl.compress(
&cvs.extract_parent_node(0),
BLOCK_LEN as u32,
&IV_BYTES,
0,
PARENT | ROOT,
)
}
pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) {
assert_eq!(test_impl.degree().count_ones(), 1, "power of 2");
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN;
let mut input_buf = [0u8; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try just below, equal to, and just above every whole number of chunks, except that
// root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case.
let mut test_lengths = vec![CHUNK_LEN + 1];
let mut next_len = 2 * CHUNK_LEN;
loop {
// 95 is one whole block plus one interesting part of another
test_lengths.push(next_len - 95);
test_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
test_lengths.push(next_len + 95);
next_len += CHUNK_LEN;
}
for test_len in test_lengths {
dbg!(test_len);
let input = &input_buf[..test_len];
let mut ref_hasher = reference_impl::Hasher::new();
ref_hasher.update(&input);
let mut ref_hash = [0u8; 32];
ref_hasher.finalize(&mut ref_hash);
let test_hash = root_hash_with_chunks_and_parents(test_impl, input);
assert_eq!(ref_hash, test_hash);
}
}
pub fn test_xof_vs_portable(test_impl: &Implementation) {
let flags = CHUNK_START | CHUNK_END | KEYED_HASH;
for counter in INITIAL_COUNTERS {
dbg!(counter);
for input_len in [0, 1, BLOCK_LEN] {
dbg!(input_len);
let mut input_block = [0u8; BLOCK_LEN];
for byte_index in 0..input_len {
input_block[byte_index] = byte_index as u8 + 42;
}
// Try equal to and partway through every whole number of output blocks.
const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut output_lengths = Vec::new();
let mut next_len = 0;
loop {
output_lengths.push(next_len);
if next_len == MAX_OUTPUT_LEN {
break;
}
output_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for output_len in output_lengths {
dbg!(output_len);
let mut portable_output = [0xff; MAX_OUTPUT_LEN];
portable::implementation().xof(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut portable_output[..output_len],
);
let mut test_output = [0xff; MAX_OUTPUT_LEN];
test_impl.xof(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert_eq!(portable_output, test_output);
// Double check that the implementation didn't overwrite.
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
// The first XOR cancels out the output.
test_impl.xof_xor(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert!(test_output[..output_len].iter().all(|&b| b == 0));
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
// The second XOR restores out the output.
test_impl.xof_xor(
&input_block,
input_len as u32,
&TEST_KEY,
counter,
flags,
&mut test_output[..output_len],
);
assert_eq!(portable_output, test_output);
assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
}
}
}
}
pub fn test_xof_vs_reference(test_impl: &Implementation) {
let input = b"hello world";
let mut input_block = [0; BLOCK_LEN];
input_block[..input.len()].copy_from_slice(input);
const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut ref_output = [0; MAX_OUTPUT_LEN];
let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
ref_hasher.update(input);
ref_hasher.finalize(&mut ref_output);
// Try equal to and partway through every whole number of output blocks.
let mut output_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
output_lengths.push(next_len);
if next_len == MAX_OUTPUT_LEN {
break;
}
output_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for output_len in output_lengths {
dbg!(output_len);
let mut test_output = [0; MAX_OUTPUT_LEN];
test_impl.xof(
&input_block,
input.len() as u32,
&TEST_KEY,
0,
KEYED_HASH | CHUNK_START | CHUNK_END,
&mut test_output[..output_len],
);
assert_eq!(ref_output[..output_len], test_output[..output_len]);
// Double check that the implementation didn't overwrite.
assert!(test_output[output_len..].iter().all(|&b| b == 0));
// Do it again starting from block 1.
if output_len >= BLOCK_LEN {
test_impl.xof(
&input_block,
input.len() as u32,
&TEST_KEY,
1,
KEYED_HASH | CHUNK_START | CHUNK_END,
&mut test_output[..output_len - BLOCK_LEN],
);
assert_eq!(
ref_output[BLOCK_LEN..output_len],
test_output[..output_len - BLOCK_LEN],
);
}
}
}
pub fn test_universal_hash_vs_portable(test_impl: &Implementation) {
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut input_buf = [0; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try equal to and partway through every whole number of input blocks.
let mut input_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
input_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
input_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for input_len in input_lengths {
dbg!(input_len);
for counter in INITIAL_COUNTERS {
dbg!(counter);
let portable_output = portable::implementation().universal_hash(
&input_buf[..input_len],
&TEST_KEY,
counter,
);
let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter);
assert_eq!(portable_output, test_output);
}
}
}
fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] {
// The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended
// output to seek to a block.
const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE;
assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS);
let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS];
let mut result = [0u8; UNIVERSAL_HASH_LEN];
let mut block_start = 0;
while block_start < input.len() {
let block_len = cmp::min(input.len() - block_start, BLOCK_LEN);
let mut ref_hasher = reference_impl::Hasher::new_keyed(key);
ref_hasher.update(&input[block_start..block_start + block_len]);
ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]);
for byte_index in 0..UNIVERSAL_HASH_LEN {
result[byte_index] ^= output_buffer[block_start + byte_index];
}
block_start += BLOCK_LEN;
}
result
}
pub fn test_universal_hash_vs_reference(test_impl: &Implementation) {
const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
let mut input_buf = [0; MAX_INPUT_LEN];
paint_test_input(&mut input_buf);
// Try equal to and partway through every whole number of input blocks.
let mut input_lengths = vec![0, 1, 31];
let mut next_len = BLOCK_LEN;
loop {
input_lengths.push(next_len);
if next_len == MAX_INPUT_LEN {
break;
}
input_lengths.push(next_len + 31);
next_len += BLOCK_LEN;
}
for input_len in input_lengths {
dbg!(input_len);
let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY);
let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0);
assert_eq!(ref_output, test_output);
}
}

View File

@ -368,6 +368,7 @@ impl Platform {
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
// A testing-only short-circuit.
if cfg!(feature = "no_avx512") {
@ -390,6 +391,7 @@ pub fn avx512_detected() -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
// A testing-only short-circuit.
if cfg!(feature = "no_avx2") {
@ -412,6 +414,7 @@ pub fn avx2_detected() -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
// A testing-only short-circuit.
if cfg!(feature = "no_sse41") {