From 3d921ae703a492d7449228d29990c8bc9e768c40 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Mon, 2 Oct 2023 08:36:29 -0700 Subject: [PATCH 01/18] allow(unreachable_code) in all the *_detected functions Previously we only disabled these warnings for SSE2, which is assumed enabled on x86-64, but it looks like new nightly compilers are also assuming SSE4.1 on macOS. Disabling these warnings across all the detection functions accounts for that, and it also gets rid of some warnings that you'd see if you used RUSTFLAGS='-C target-cpu=native'. --- src/platform.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/platform.rs b/src/platform.rs index 00058b1..ef910aa 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -327,6 +327,7 @@ impl Platform { #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx512_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_avx512") { @@ -349,6 +350,7 @@ pub fn avx512_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx2_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_avx2") { @@ -371,6 +373,7 @@ pub fn avx2_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn sse41_detected() -> bool { // A testing-only short-circuit. if cfg!(feature = "no_sse41") { From dd30dcb00221591db3a983e0215b81d86cff941d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Ga=C3=9Fmann?= Date: Sun, 1 Oct 2023 20:53:47 +0200 Subject: [PATCH 02/18] build(CMake): Apply PP definitions to all sources --- c/CMakeLists.txt | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index c73b9c7..21b24c4 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -59,8 +59,12 @@ macro(BLAKE3_DISABLE_SIMD) set(BLAKE3_SIMD_AMD64_ASM OFF) set(BLAKE3_SIMD_X86_INTRINSICS OFF) set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - set_source_files_properties(blake3_dispatch.c PROPERTIES - COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 ) endmacro() @@ -131,7 +135,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES target_sources(blake3 PRIVATE blake3_neon.c ) - set_source_files_properties(blake3_dispatch.c PROPERTIES COMPILE_DEFINITIONS BLAKE3_USE_NEON=1) + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=1 + ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") From bfd568897a8fc6b6eaf54ad262620536d10e9261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Ga=C3=9Fmann?= Date: Tue, 31 Oct 2023 11:45:26 +0100 Subject: [PATCH 03/18] build(CMake): Provide NEON cflags for ARMv8 32bit ARMv8 CPUs are guaranteed to support NEON instructions. However, for 32bit ARMv8 triplets GCC needs to explicitly be configured to enable NEON intrinsics. 
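As a minimal illustrative sketch (not required by this change): because the NEON flags live in a cache variable, a toolchain file or a -D entry on the command line can still pre-seed a different value, in which case the default introduced here is skipped. File name and flag value below are examples only:

    # hypothetical 32-bit ARMv8 cross toolchain file (names and values are examples)
    set(CMAKE_SYSTEM_NAME Linux)
    set(CMAKE_SYSTEM_PROCESSOR armv8)
    set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
    # a cache entry that already exists wins, so the project's default set() becomes a no-op
    set(BLAKE3_CFLAGS_NEON "-mfpu=neon-vfpv4" CACHE STRING "the compiler flags to enable NEON")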
--- c/CMakeLists.txt | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 21b24c4..5bd7bb4 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -9,6 +9,10 @@ project(libblake3 include(FeatureSummary) include(GNUInstallDirs) +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") @@ -24,11 +28,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # 32-bit ARMv8 needs NEON to be enabled explicitly + set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") + endif() endif() -# architecture lists for which to enable assembly / SIMD sources -set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) -set(BLAKE3_X86_NAMES i686 x86 X86) -set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # library target add_library(blake3 @@ -125,11 +131,11 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ((ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8))) +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE From 3e14f865d30271c74fc68d417af488ea91b66d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Ga=C3=9Fmann?= Date: Tue, 31 Oct 2023 11:51:26 +0100 Subject: [PATCH 04/18] style: Remove trailing whitespace in CMakeLists.txt --- c/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 5bd7bb4..cbeb555 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -47,7 +47,7 @@ add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) - target_compile_definitions(blake3 + target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) @@ -109,7 +109,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) BLAKE3_DISABLE_SIMD() endif() - else() + else() BLAKE3_DISABLE_SIMD() endif() From 3465fe455e6cfd98d94f6d5fe1de9c4e2d566b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Ga=C3=9Fmann?= Date: Tue, 31 Oct 2023 11:53:11 +0100 Subject: [PATCH 05/18] style: Exclude whitespace fixups from git blame --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs 
b/.git-blame-ignore-revs new file mode 100644 index 0000000..6e814e6 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# CMakeLists.txt whitespace fixups +3e14f865d30271c74fc68d417af488ea91b66d48 From e1f851d461324793c1261609f91288d3b868cb93 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Wed, 16 Aug 2023 13:54:45 +0900 Subject: [PATCH 06/18] Fix Windows build with clang-cl clang-cl is LLVM's MSVC-compatible compiler frontend for Windows ABI. If clang-cl is in use, `CMAKE_C_COMPILER_ID` is `Clang` even though it doesn't take Unix-like command line options but MSVC-like options. `if(MSVC)` is the correct predicate to check if we should pass MSVC-ish command line options. --- c/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index cbeb555..34874b3 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -14,7 +14,7 @@ set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) set(BLAKE3_X86_NAMES i686 x86 X86) set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) -if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") +if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") # MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170) set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1") @@ -77,7 +77,7 @@ endmacro() if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) set(BLAKE3_SIMD_AMD64_ASM ON) - if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + if(MSVC) enable_language(ASM_MASM) target_sources(blake3 PRIVATE blake3_avx2_x86-64_windows_msvc.asm From 1930721c50ca788304fc1108a9c5b62d62342223 Mon Sep 17 00:00:00 2001 From: Viacheslav H <30746510+r4nx@users.noreply.github.com> Date: Sun, 5 Nov 2023 19:16:48 +0200 Subject: [PATCH 07/18] Fix CMake target include directories if library is used with add_subdirectory or FetchContent --- c/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 34874b3..49cde64 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -53,7 +53,10 @@ if (BUILD_SHARED_LIBS) ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() -target_include_directories(blake3 PUBLIC $) +target_include_directories(blake3 PUBLIC + $ + $ +) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 From 92e4cd71be48fdf9a79e88ef37b8f415ec5ac210 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 5 Nov 2023 09:18:39 -0800 Subject: [PATCH 08/18] add the compiler name to CMake CI jobs --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2de4fdd..6bfbbea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -314,7 +314,7 @@ jobs: # CMake build test (Library only), current macOS/Linux only. cmake_build: - name: CMake ${{ matrix.os }} + name: CMake ${{ matrix.os }} ${{ matrix.compiler }} runs-on: ${{ matrix.os }} strategy: fail-fast: false From 7ce2aa41e9f01a91c3b309a7bf5e86b4136ed8a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20S=2E=20Ga=C3=9Fmann?= Date: Sat, 2 Dec 2023 15:19:30 +0100 Subject: [PATCH 09/18] build(CMake): Require C99 mode Specify language requirement as a [compile-feature] and force compiler extensions off ensuring portability problems are detected early on. 
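Because the requirement is declared as a PUBLIC compile feature, it propagates to consumers of the target. A minimal sketch with a hypothetical consumer (target and source names are examples only):

    add_executable(example example.c)
    # linking BLAKE3::blake3 pulls in the c_std_99 compile feature, so the
    # consumer is built in at least C99 mode without being pinned to it
    target_link_libraries(example PRIVATE BLAKE3::blake3)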
Note that we do not use the `C_STANDARD` property, because it doesn't propagate to dependent targets and would prohibit users from compiling their code base with consistent flags / language configuations if they were to target a newer C standard. Similarly we do not configure `C_STANDARD_REQUIRED` as [compile-features] do not interact with it--they are enforced regardless. [compile-feature]: https://cmake.org/cmake/help/latest/manual/cmake-compile-features.7.html#compile-feature-requirements --- c/CMakeLists.txt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 49cde64..8f38144 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,4 +1,9 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) + +# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD +if (POLICY CMP0128) + cmake_policy(SET CMP0128 NEW) +endif() project(libblake3 VERSION 1.5.0 @@ -61,7 +66,14 @@ set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden + C_EXTENSIONS OFF ) +target_compile_features(blake3 PUBLIC c_std_99) +# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD +# which may be set by the user or toolchain file +if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) + set_target_properties(blake3 PROPERTIES C_STANDARD 99) +endif() # optional SIMD sources macro(BLAKE3_DISABLE_SIMD) From c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Fri, 8 Dec 2023 10:23:30 -0800 Subject: [PATCH 10/18] add Bazel to the list of users in the readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a63d5f2..0d34db7 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,7 @@ Alternatively, it is licensed under the Apache License 2.0. 
Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) +* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) From 5306464d031f70676194497a2009af1416be41df Mon Sep 17 00:00:00 2001 From: Dirk Stolle Date: Thu, 28 Dec 2023 02:44:32 +0100 Subject: [PATCH 11/18] update actions/checkout in GitHub Actions to v4 --- .github/workflows/ci.yml | 16 ++++++++-------- .github/workflows/tag.yml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6bfbbea..83accaa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} @@ -156,7 +156,7 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} @@ -186,7 +186,7 @@ jobs: - s390x-unknown-linux-gnu steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: toolchain: stable @@ -216,7 +216,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Test the intrinsics-based implementations. - run: make -f Makefile.testing test working-directory: ./c @@ -268,7 +268,7 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 with: toolchain: stable @@ -284,7 +284,7 @@ jobs: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile @@ -301,7 +301,7 @@ jobs: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 @@ -329,7 +329,7 @@ jobs: - os: macOS-latest compiler: msvc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: CMake generation run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - name: CMake build / install diff --git a/.github/workflows/tag.yml b/.github/workflows/tag.yml index 3f7e886..867c421 100644 --- a/.github/workflows/tag.yml +++ b/.github/workflows/tag.yml @@ -23,7 +23,7 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" From 4d32708f511fd85c6b0fb131295cc73224246738 Mon Sep 17 00:00:00 2001 From: Dirk Stolle Date: Sat, 30 Dec 2023 00:40:17 +0100 Subject: [PATCH 12/18] replace unmaintained actions-rs/toolchain action in CI Basically all of the `actions-rs/*` actions are unmaintained. See for more information. Due to their age they generate several warnings in CI runs. To get rid of those warnings the occurrences of `actions-rs/toolchain` are replaced by `dtolnay/rust-toolchain`. 
--- .github/workflows/ci.yml | 19 +++++-------------- .github/workflows/tag.yml | 6 ++---- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83accaa..3ff1199 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,11 +39,9 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet @@ -157,11 +155,9 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Test b3sum. - name: test b3sum run: cargo test @@ -187,10 +183,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true + - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} @@ -269,11 +262,9 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - target: aarch64-apple-darwin - override: true + targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum diff --git a/.github/workflows/tag.yml b/.github/workflows/tag.yml index 867c421..61be4ff 100644 --- a/.github/workflows/tag.yml +++ b/.github/workflows/tag.yml @@ -30,11 +30,9 @@ jobs: - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - profile: minimal - - run: rustup target add ${{ matrix.target.rust-target }} + targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} From 6f3e6fc86c21a04e89791f6b6930a34f0b66bc6c Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 18:25:26 -0800 Subject: [PATCH 13/18] update memmap2 to v0.9 --- Cargo.toml | 2 +- b3sum/Cargo.lock | 263 ++++++++++++++++++++++++++--------------------- b3sum/Cargo.toml | 1 - 3 files changed, 145 insertions(+), 121 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a98cb6a..591a6e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,7 +97,7 @@ arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = "0.3.0" cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } -memmap2 = { version = "0.7.1", optional = true } +memmap2 = { version = "0.9", optional = true } rayon = { version = "1.2.1", optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true } diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock index 763a2e1..10caffb 100644 --- a/b3sum/Cargo.lock +++ b/b3sum/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "anstream" -version = "0.5.0" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c" +checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" dependencies = [ "anstyle", "anstyle-parse", @@ -18,43 +18,43 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "2.1.0" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.75" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" [[package]] name = "arrayref" @@ -68,12 +68,6 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - [[package]] name = "b3sum" version = "1.5.0" @@ -83,7 +77,6 @@ dependencies = [ "clap", "duct", "hex", - "memmap2", "rayon", "tempfile", "wild", @@ -97,9 +90,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "blake3" @@ -131,9 +124,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.4" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136" +checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" dependencies = [ "clap_builder", "clap_derive", @@ -141,9 +134,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.4" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56" +checksum = 
"4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" dependencies = [ "anstream", "anstyle", @@ -154,9 +147,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.4.2" +version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" dependencies = [ "heck", "proc-macro2", @@ -166,9 +159,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" [[package]] name = "colorchoice" @@ -184,42 +177,34 @@ checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "duct" -version = "0.13.6" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e" +checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c" dependencies = [ "libc", "once_cell", @@ -235,30 +220,19 @@ checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "errno" -version = "0.3.3" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "glob" @@ -280,73 +254,64 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "libc" -version = "0.2.148" +version 
= "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "linux-raw-sys" -version = "0.4.7" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "os_pipe" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "proc-macro2" -version = "1.0.67" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -354,9 +319,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -364,32 +329,26 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ "bitflags 1.3.2", ] [[package]] name = "rustix" -version = "0.38.14" +version = "0.38.30" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "shared_child" version = "1.0.0" @@ -408,9 +367,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "2.0.37" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -419,15 +378,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", "fastrand", "redox_syscall", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -437,7 +396,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ "rustix", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -454,9 +413,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "wild" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74" +checksum = "10d01931a94d5a115a53f95292f51d316856b68a035618eb831bbba593a30b67" dependencies = [ "glob", ] @@ -489,7 +448,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", ] [[package]] @@ -498,13 +466,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + 
"windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", ] [[package]] @@ -513,38 +496,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index e3bf820..52acb08 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -18,7 +18,6 @@ anyhow = "1.0.25" blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" -memmap2 = "0.7.0" rayon = "1.2.1" wild = "2.0.3" From fc7522717092a9530691736f1fd89c94a79a7800 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 17:35:29 -0800 Subject: [PATCH 14/18] 
factor out just the portable parts of the guts_api branch --- rust/guts/Cargo.toml | 18 + rust/guts/readme.md | 62 +++ rust/guts/src/lib.rs | 956 ++++++++++++++++++++++++++++++++++++++ rust/guts/src/portable.rs | 262 +++++++++++ rust/guts/src/test.rs | 523 +++++++++++++++++++++ 5 files changed, 1821 insertions(+) create mode 100644 rust/guts/Cargo.toml create mode 100644 rust/guts/readme.md create mode 100644 rust/guts/src/lib.rs create mode 100644 rust/guts/src/portable.rs create mode 100644 rust/guts/src/test.rs diff --git a/rust/guts/Cargo.toml b/rust/guts/Cargo.toml new file mode 100644 index 0000000..ebcf77f --- /dev/null +++ b/rust/guts/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "blake3_guts" +version = "0.0.0" +authors = ["Jack O'Connor ", "Samuel Neves"] +description = "low-level building blocks for the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3_guts" +readme = "readme.md" +edition = "2021" + +[dev-dependencies] +hex = "0.4.3" +reference_impl = { path = "../../reference_impl" } + +[features] +default = ["std"] +std = [] diff --git a/rust/guts/readme.md b/rust/guts/readme.md new file mode 100644 index 0000000..a1adbf1 --- /dev/null +++ b/rust/guts/readme.md @@ -0,0 +1,62 @@ +# The BLAKE3 Guts API + +## Introduction + +This crate contains low-level, high-performance, platform-specific +implementations of the BLAKE3 compression function. This API is complicated and +unsafe, and this crate will never have a stable release. For the standard +BLAKE3 hash function, see the [`blake3`](https://crates.io/crates/blake3) +crate, which depends on this one. + +The most important ingredient in a high-performance implementation of BLAKE3 is +parallelism. The BLAKE3 tree structure lets us hash different parts of the tree +in parallel, and modern computers have a _lot_ of parallelism to offer. +Sometimes that means using multiple threads running on multiple cores, but +multithreading isn't appropriate for all applications, and it's not the usual +default for library APIs. More commonly, BLAKE3 implementations use SIMD +instructions ("Single Instruction Multiple Data") to improve the performance of +a single thread. When we do use multithreading, the performance benefits +multiply. + +The tricky thing about SIMD is that each instruction set works differently. +Instead of writing portable code once and letting the compiler do most of the +optimization work, we need to write platform-specific implementations, and +sometimes more than one per platform. We maintain *four* different +implementations on x86 alone (targeting SSE2, SSE4.1, AVX2, and AVX-512), in +addition to ARM NEON and the RISC-V vector extensions. In the future we might +add ARM SVE2. + +All of that means a lot of duplicated logic and maintenance. So while the main +goal of this API is high performance, it's also important to keep the API as +small and simple as possible. Higher level details like the "CV stack", input +buffering, and multithreading are handled by portable code in the main `blake3` +crate. These are just building blocks. + +## The private API + +This is the API that each platform reimplements. It's completely `unsafe`, +inputs and outputs are allowed to alias, and bounds checking is the caller's +responsibility. + +- `degree` +- `compress` +- `hash_chunks` +- `hash_parents` +- `xof` +- `xof_xor` +- `universal_hash` + +## The public API + +This is the API that this crate exposes to callers, i.e. 
to the main `blake3` +crate. It's a thin, portable layer on top of the private API above. The Rust +version of this API is memory-safe. + +- `degree` +- `compress` +- `hash_chunks` +- `hash_parents` +- `reduce_parents` +- `xof` +- `xof_xor` +- `universal_hash` diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs new file mode 100644 index 0000000..67f7a05 --- /dev/null +++ b/rust/guts/src/lib.rs @@ -0,0 +1,956 @@ +use core::cmp; +use core::marker::PhantomData; +use core::mem; +use core::ptr; +use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; + +pub mod portable; + +#[cfg(test)] +mod test; + +pub const OUT_LEN: usize = 32; +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const WORD_LEN: usize = 4; +pub const UNIVERSAL_HASH_LEN: usize = 16; + +pub const CHUNK_START: u32 = 1 << 0; +pub const CHUNK_END: u32 = 1 << 1; +pub const PARENT: u32 = 1 << 2; +pub const ROOT: u32 = 1 << 3; +pub const KEYED_HASH: u32 = 1 << 4; +pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +pub const IV: CVWords = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; +pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); + +pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// never less than 2 +pub const MAX_SIMD_DEGREE: usize = 2; + +pub type CVBytes = [u8; 32]; +pub type CVWords = [u32; 8]; +pub type BlockBytes = [u8; 64]; +pub type BlockWords = [u32; 16]; + +pub static DETECTED_IMPL: Implementation = Implementation::new( + degree_init, + compress_init, + hash_chunks_init, + hash_parents_init, + xof_init, + xof_xor_init, + universal_hash_init, +); + +fn detect() -> Implementation { + portable::implementation() +} + +fn init_detected_impl() { + let detected = detect(); + + DETECTED_IMPL + .degree_ptr + .store(detected.degree_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .compress_ptr + .store(detected.compress_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_chunks_ptr + .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_parents_ptr + .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_ptr + .store(detected.xof_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_xor_ptr + .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .universal_hash_ptr + .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); +} + +pub struct Implementation { + degree_ptr: AtomicPtr<()>, + compress_ptr: AtomicPtr<()>, + hash_chunks_ptr: AtomicPtr<()>, + hash_parents_ptr: AtomicPtr<()>, + xof_ptr: AtomicPtr<()>, + xof_xor_ptr: AtomicPtr<()>, + universal_hash_ptr: AtomicPtr<()>, +} + +impl Implementation { + const fn new( + degree_fn: DegreeFn, + compress_fn: CompressFn, + hash_chunks_fn: HashChunksFn, + hash_parents_fn: HashParentsFn, + xof_fn: XofFn, + xof_xor_fn: XofFn, + universal_hash_fn: UniversalHashFn, + ) -> Self { + Self { + degree_ptr: AtomicPtr::new(degree_fn as *mut ()), + compress_ptr: AtomicPtr::new(compress_fn as *mut ()), + hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), + hash_parents_ptr: 
AtomicPtr::new(hash_parents_fn as *mut ()), + xof_ptr: AtomicPtr::new(xof_fn as *mut ()), + xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), + universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), + } + } + + #[inline] + fn degree_fn(&self) -> DegreeFn { + unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } + } + + #[inline] + pub fn degree(&self) -> usize { + let degree = unsafe { self.degree_fn()() }; + debug_assert!(degree >= 2); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(1, degree.count_ones(), "power of 2"); + degree + } + + #[inline] + pub fn split_transposed_vectors<'v>( + &self, + vectors: &'v mut TransposedVectors, + ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { + unsafe { vectors.split(self.degree()) } + } + + #[inline] + fn compress_fn(&self) -> CompressFn { + unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } + } + + #[inline] + pub fn compress( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + counter: u64, + flags: u32, + ) -> CVBytes { + let mut out = [0u8; 32]; + unsafe { + self.compress_fn()(block, block_len, cv, counter, flags, &mut out); + } + out + } + + // The contract for HashChunksFn doesn't require the implementation to support single-chunk + // inputs. Instead we handle that case here by calling compress in a loop. + #[inline] + fn hash_one_chunk( + &self, + mut input: &[u8], + key: &CVBytes, + counter: u64, + mut flags: u32, + output: TransposedSplit, + ) { + debug_assert!(input.len() <= CHUNK_LEN); + let mut cv = *key; + flags |= CHUNK_START; + while input.len() > BLOCK_LEN { + cv = self.compress( + input[..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u32, + &cv, + counter, + flags, + ); + input = &input[BLOCK_LEN..]; + flags &= !CHUNK_START; + } + let mut final_block = [0u8; BLOCK_LEN]; + final_block[..input.len()].copy_from_slice(input); + cv = self.compress( + &final_block, + input.len() as u32, + &cv, + counter, + flags | CHUNK_END, + ); + unsafe { + write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); + } + } + + #[inline] + fn hash_chunks_fn(&self) -> HashChunksFn { + unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_chunks( + &self, + input: &[u8], + key: &CVBytes, + counter: u64, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(input.len() <= self.degree() * CHUNK_LEN); + if input.len() <= CHUNK_LEN { + // The underlying hash_chunks_fn isn't required to support this case. Instead we handle + // it by calling compress_fn in a loop. But note that we still don't support root + // finalization or the empty input here. + self.hash_one_chunk(input, key, counter, flags, transposed_output); + return 1; + } + // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently + // ignore the remainder. This makes it impossible to write out of bounds in a properly + // constructed TransposedSplit. 
+ let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); + unsafe { + self.hash_chunks_fn()( + input.as_ptr(), + len, + key, + counter, + flags, + transposed_output.ptr, + ); + } + if input.len() % CHUNK_LEN == 0 { + input.len() / CHUNK_LEN + } else { + (input.len() / CHUNK_LEN) + 1 + } + } + + #[inline] + fn hash_parents_fn(&self) -> HashParentsFn { + unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_parents( + &self, + transposed_input: &TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. + num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()( + transposed_input.as_ptr(), + num_parents, + key, + flags | PARENT, + transposed_output.ptr, + ); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + pub fn reduce_parents( + &self, + transposed_in_out: &mut TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. + num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let in_out_ptr = transposed_in_out.as_mut_ptr(); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + fn xof_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn xof_xor_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof_xor( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_xor_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn universal_hash_fn(&self) -> UniversalHashFn { + unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } + } + + #[inline] + pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { + let degree = 
self.degree(); + let simd_len = degree * BLOCK_LEN; + let mut ret = [0u8; 16]; + while !input.is_empty() { + let take = cmp::min(simd_len, input.len()); + let mut output = [0u8; 16]; + unsafe { + self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); + } + input = &input[take..]; + counter += degree as u64; + for byte_index in 0..16 { + ret[byte_index] ^= output[byte_index]; + } + } + ret + } +} + +impl Clone for Implementation { + fn clone(&self) -> Self { + Self { + degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), + compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), + hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), + hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), + xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), + xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), + universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), + } + } +} + +// never less than 2 +type DegreeFn = unsafe extern "C" fn() -> usize; + +unsafe extern "C" fn degree_init() -> usize { + init_detected_impl(); + DETECTED_IMPL.degree_fn()() +} + +type CompressFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, // may overlap the input +); + +unsafe extern "C" fn compress_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + init_detected_impl(); + DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); +} + +type CompressXofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, // may overlap the input +); + +type HashChunksFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +); + +unsafe extern "C" fn hash_chunks_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); +} + +type HashParentsFn = unsafe extern "C" fn( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +); + +unsafe extern "C" fn hash_parents_init( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); +} + +// This signature covers both xof() and xof_xor(). 
+type XofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +); + +unsafe extern "C" fn xof_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +unsafe extern "C" fn xof_xor_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +type UniversalHashFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +); + +unsafe extern "C" fn universal_hash_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + init_detected_impl(); + DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); +} + +// The implicit degree of this implementation is MAX_SIMD_DEGREE. +#[inline(always)] +unsafe fn hash_chunks_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + flags: u32, + mut transposed_output: *mut u32, +) { + debug_assert!(input_len > 0); + debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); + input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); + while input_len > 0 { + let mut chunk_len = cmp::min(input_len, CHUNK_LEN); + input_len -= chunk_len; + // We only use 8 words of the CV, but compress returns 16. + let mut cv = *key; + let cv_ptr: *mut CVBytes = &mut cv; + let mut chunk_flags = flags | CHUNK_START; + while chunk_len > BLOCK_LEN { + compress( + input as *const BlockBytes, + BLOCK_LEN as u32, + cv_ptr, + counter, + chunk_flags, + cv_ptr, + ); + input = input.add(BLOCK_LEN); + chunk_len -= BLOCK_LEN; + chunk_flags &= !CHUNK_START; + } + let mut last_block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); + input = input.add(chunk_len); + compress( + &last_block, + chunk_len as u32, + cv_ptr, + counter, + chunk_flags | CHUNK_END, + cv_ptr, + ); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_output = transposed_output.add(1); + counter += 1; + } +} + +// The implicit degree of this implementation is MAX_SIMD_DEGREE. 
+#[inline(always)] +unsafe fn hash_parents_using_compress( + compress: CompressFn, + mut transposed_input: *const u32, + mut num_parents: usize, + key: *const CVBytes, + flags: u32, + mut transposed_output: *mut u32, // may overlap the input +) { + debug_assert!(num_parents > 0); + debug_assert!(num_parents <= MAX_SIMD_DEGREE); + while num_parents > 0 { + let mut block_bytes = [0u8; 64]; + for word_index in 0..8 { + let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); + block_bytes[WORD_LEN * word_index..][..WORD_LEN] + .copy_from_slice(&left_child_word.to_le_bytes()); + let right_child_word = transposed_input + .add(word_index * TRANSPOSED_STRIDE + 1) + .read(); + block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] + .copy_from_slice(&right_child_word.to_le_bytes()); + } + let mut cv = [0u8; 32]; + compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_input = transposed_input.add(2); + transposed_output = transposed_output.add(1); + num_parents -= 1; + } +} + +#[inline(always)] +unsafe fn xof_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: *mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn xof_xor_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: *mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + for i in 0..take { + *out.add(i) ^= block_output[i]; + } + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn universal_hash_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + out: *mut [u8; 16], +) { + let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; + let mut result = [0u8; 16]; + while input_len > 0 { + let block_len = cmp::min(input_len, BLOCK_LEN); + let mut block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); + let mut block_output = [0u8; 32]; + compress( + &block, + block_len as u32, + key, + counter, + flags, + &mut block_output, + ); + for i in 0..16 { + result[i] ^= block_output[i]; + } + input = input.add(block_len); + input_len -= block_len; + counter += 1; + } + *out = result; +} + +// this is in units of *words*, for pointer operations on *const/*mut u32 +const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; + +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); + +impl TransposedVectors { + pub fn new() -> Self { + Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) 
+ } + + pub fn extract_cv(&self, cv_index: usize) -> CVBytes { + let mut words = [0u32; 8]; + for word_index in 0..8 { + words[word_index] = self.0[word_index][cv_index]; + } + le_bytes_from_words_32(&words) + } + + pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { + let mut bytes = [0u8; 64]; + bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); + bytes[32..].copy_from_slice(&self.extract_cv(parent_index / 2 + 1)); + bytes + } + + fn as_ptr(&self) -> *const u32 { + self.0[0].as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u32 { + self.0[0].as_mut_ptr() + } + + // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not + // necessarily correct) to write up to `degree` words to either side of the split, possibly + // from different threads. + unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { + debug_assert!(degree > 0); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(degree.count_ones(), 1, "power of 2"); + let ptr = self.as_mut_ptr(); + let left = TransposedSplit { + ptr, + phantom_data: PhantomData, + }; + let right = TransposedSplit { + ptr: ptr.wrapping_add(degree), + phantom_data: PhantomData, + }; + (left, right) + } +} + +pub struct TransposedSplit<'vectors> { + ptr: *mut u32, + phantom_data: PhantomData<&'vectors mut u32>, +} + +unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} +unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} + +unsafe fn read_transposed_cv(src: *const u32) -> CVWords { + let mut cv = [0u32; 8]; + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + cv[word_index] = src.add(offset_words).read(); + } + cv +} + +unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + dest.add(offset_words).write(cv[word_index]); + } +} + +#[inline(always)] +pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { + let mut bytes = [0u8; 32]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { + let mut bytes = [0u8; 64]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { + let mut words = [0u32; 8]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[inline(always)] +pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { + let mut words = [0u32; 16]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[test] +fn test_byte_word_round_trips() { + let cv = *b"This is 32 LE bytes/eight words."; + assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); + let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; + assert_eq!( + block, + le_bytes_from_words_64(&words_from_le_bytes_64(&block)), + ); +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +pub fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. + (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +pub fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. 
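+    // For example, an input of 4 * CHUNK_LEN bytes gives full_chunks = 3 and a left_len of
+    // 2 * CHUNK_LEN, while 4 * CHUNK_LEN + 1 bytes gives full_chunks = 4 and 4 * CHUNK_LEN.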
+ let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(left_len(input), output); + } +} diff --git a/rust/guts/src/portable.rs b/rust/guts/src/portable.rs new file mode 100644 index 0000000..d597644 --- /dev/null +++ b/rust/guts/src/portable.rs @@ -0,0 +1,262 @@ +use crate::{ + le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, + BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, +}; + +const DEGREE: usize = MAX_SIMD_DEGREE; + +unsafe extern "C" fn degree() -> usize { + DEGREE +} + +#[inline(always)] +fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_inner( + block_words: &BlockWords, + block_len: u32, + cv_words: &CVWords, + counter: u64, + flags: u32, +) -> [u32; 16] { + let mut state = [ + cv_words[0], + cv_words[1], + cv_words[2], + cv_words[3], + cv_words[4], + cv_words[5], + cv_words[6], + cv_words[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len as u32, + flags as u32, + ]; + for round_number in 0..7 { + round(&mut state, &block_words, round_number); + } + state +} + +pub(crate) unsafe extern "C" fn compress( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + } + *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); +} + +pub(crate) unsafe extern "C" fn compress_xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + state[word_index + 8] ^= cv_words[word_index]; + } + *out = le_bytes_from_words_64(&state); +} + +pub(crate) unsafe extern "C" fn hash_chunks( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + crate::hash_chunks_using_compress( + compress, + input, + input_len, + key, + counter, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn hash_parents( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +) { + crate::hash_parents_using_compress( + compress, + transposed_input, + num_parents, + key, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn xof_xor( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_xor_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn universal_hash( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) +} + +pub fn implementation() -> Implementation { + Implementation::new( + degree, + compress, + hash_chunks, + hash_parents, + xof, + xof_xor, + universal_hash, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + // This is circular but do it anyway. 
+ #[test] + fn test_compress_vs_portable() { + crate::test::test_compress_vs_portable(&implementation()); + } + + #[test] + fn test_compress_vs_reference() { + crate::test::test_compress_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_chunks_vs_portable() { + crate::test::test_hash_chunks_vs_portable(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_parents_vs_portable() { + crate::test::test_hash_parents_vs_portable(&implementation()); + } + + #[test] + fn test_chunks_and_parents_vs_reference() { + crate::test::test_chunks_and_parents_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_xof_vs_portable() { + crate::test::test_xof_vs_portable(&implementation()); + } + + #[test] + fn test_xof_vs_reference() { + crate::test::test_xof_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_universal_hash_vs_portable() { + crate::test::test_universal_hash_vs_portable(&implementation()); + } + + #[test] + fn test_universal_hash_vs_reference() { + crate::test::test_universal_hash_vs_reference(&implementation()); + } +} diff --git a/rust/guts/src/test.rs b/rust/guts/src/test.rs new file mode 100644 index 0000000..83bd790 --- /dev/null +++ b/rust/guts/src/test.rs @@ -0,0 +1,523 @@ +use crate::*; + +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; + +// Test a few different initial counter values. +// - 0: The base case. +// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when +// you're supposed to ANDNOT. +// - u32::MAX: The low word of the counter overflows for all inputs except the first. +// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. 
+const INITIAL_COUNTERS: [u64; 4] = [ + 0, + i32::MAX as u64, + u32::MAX as u64, + (42u64 << 32) + u32::MAX as u64, +]; + +const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; + +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +pub fn test_compress_vs_portable(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_cv = portable::implementation().compress( + &block, + block_len as u32, + &TEST_KEY, + counter, + KEYED_HASH, + ); + + let test_cv = + test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); + + assert_eq!(portable_cv, test_cv); + } + } +} + +pub fn test_compress_vs_reference(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(&block[..block_len]); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_cv = test_impl.compress( + &block, + block_len as u32, + &TEST_KEY, + 0, + CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, + ); + + assert_eq!(ref_hash, test_cv); + } +} + +fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { + if output_a == output_b { + return; + } + for cv_index in 0..2 * MAX_SIMD_DEGREE { + let cv_a = output_a.extract_cv(cv_index); + let cv_b = output_b.extract_cv(cv_index); + if cv_a == [0; 32] && cv_b == [0; 32] { + println!("CV {cv_index:2} empty"); + } else if cv_a == cv_b { + println!("CV {cv_index:2} matches"); + } else { + println!("CV {cv_index:2} mismatch:"); + println!(" {}", hex::encode(cv_a)); + println!(" {}", hex::encode(cv_b)); + } + } + panic!("transposed outputs are not equal"); +} + +pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + dbg!(test_impl.degree() * CHUNK_LEN); + // Allocate 4 extra bytes of padding so we can make aligned slices. + let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; + let mut input_slice = &mut input_buf[..]; + // Make sure the start of the input is word-aligned. + while input_slice.as_ptr() as usize % 4 != 0 { + input_slice = &mut input_slice[1..]; + } + let (aligned_input, mut unaligned_input) = + input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); + unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; + assert_eq!(aligned_input.as_ptr() as usize % 4, 0); + assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); + paint_test_input(aligned_input); + paint_test_input(unaligned_input); + // Try just below, equal to, and just above every whole number of chunks. 
+ let mut input_2_lengths = Vec::new(); + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + input_2_lengths.push(next_len - 95); + input_2_lengths.push(next_len); + if next_len == test_impl.degree() * CHUNK_LEN { + break; + } + input_2_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for input_2_len in input_2_lengths { + dbg!(input_2_len); + let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; + let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; + let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + for initial_counter in INITIAL_COUNTERS { + dbg!(initial_counter); + // Make two calls, to test the output_column parameter. + let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut portable_output); + portable::implementation().hash_chunks( + aligned_input1, + &IV_BYTES, + initial_counter, + 0, + portable_left, + ); + portable::implementation().hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); + test_impl.hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + test_right, + ); + check_transposed_eq(&portable_output, &test_output); + + // Do the same thing with unaligned input. + let mut unaligned_test_output = TransposedVectors::new(); + let (unaligned_left, unaligned_right) = + test_impl.split_transposed_vectors(&mut unaligned_test_output); + test_impl.hash_chunks( + unaligned_input1, + &IV_BYTES, + initial_counter, + 0, + unaligned_left, + ); + test_impl.hash_chunks( + unaligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + unaligned_right, + ); + check_transposed_eq(&portable_output, &unaligned_test_output); + } + } +} + +fn painted_transposed_input() -> TransposedVectors { + let mut vectors = TransposedVectors::new(); + let mut val = 0; + for col in 0..2 * MAX_SIMD_DEGREE { + for row in 0..8 { + vectors.0[row][col] = val; + val += 1; + } + } + vectors +} + +pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + let input = painted_transposed_input(); + for num_parents in 2..=(test_impl.degree() / 2) { + dbg!(num_parents); + let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut portable_output); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + portable_left, + ); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + test_left, + ); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + test_right, + ); + + check_transposed_eq(&portable_output, &test_output); + } +} + 
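+// Test helper: recursively hash `input`, using hash_chunks for subtrees of at most `degree`
+// chunks and hash_parents above them, writing transposed CVs into `output` and returning the
+// number of CVs written.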
+fn hash_with_chunks_and_parents_recurse( + test_impl: &Implementation, + input: &[u8], + counter: u64, + output: TransposedSplit, +) -> usize { + assert!(input.len() > 0); + if input.len() <= test_impl.degree() * CHUNK_LEN { + return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); + } + let (left_input, right_input) = input.split_at(left_len(input.len())); + let mut child_output = TransposedVectors::new(); + let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); + let mut children = + hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); + assert_eq!(children, test_impl.degree()); + children += hash_with_chunks_and_parents_recurse( + test_impl, + right_input, + counter + (left_input.len() / CHUNK_LEN) as u64, + right_output, + ); + test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) +} + +// Note: This test implementation doesn't support the 1-chunk-or-less case. +fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { + // TODO: handle the 1-chunk case? + assert!(input.len() > CHUNK_LEN); + let mut cvs = TransposedVectors::new(); + // The right half of these vectors are never used. + let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); + let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); + while num_cvs > 2 { + num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); + } + test_impl.compress( + &cvs.extract_parent_node(0), + BLOCK_LEN as u32, + &IV_BYTES, + 0, + PARENT | ROOT, + ) +} + +pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { + assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; + let mut input_buf = [0u8; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try just below, equal to, and just above every whole number of chunks, except that + // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. + let mut test_lengths = vec![CHUNK_LEN + 1]; + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + test_lengths.push(next_len - 95); + test_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + test_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for test_len in test_lengths { + dbg!(test_len); + let input = &input_buf[..test_len]; + + let mut ref_hasher = reference_impl::Hasher::new(); + ref_hasher.update(&input); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_hash = root_hash_with_chunks_and_parents(test_impl, input); + + assert_eq!(ref_hash, test_hash); + } +} + +pub fn test_xof_vs_portable(test_impl: &Implementation) { + let flags = CHUNK_START | CHUNK_END | KEYED_HASH; + for counter in INITIAL_COUNTERS { + dbg!(counter); + for input_len in [0, 1, BLOCK_LEN] { + dbg!(input_len); + let mut input_block = [0u8; BLOCK_LEN]; + for byte_index in 0..input_len { + input_block[byte_index] = byte_index as u8 + 42; + } + // Try equal to and partway through every whole number of output blocks. 
+            const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
+            let mut output_lengths = Vec::new();
+            let mut next_len = 0;
+            loop {
+                output_lengths.push(next_len);
+                if next_len == MAX_OUTPUT_LEN {
+                    break;
+                }
+                output_lengths.push(next_len + 31);
+                next_len += BLOCK_LEN;
+            }
+            for output_len in output_lengths {
+                dbg!(output_len);
+                let mut portable_output = [0xff; MAX_OUTPUT_LEN];
+                portable::implementation().xof(
+                    &input_block,
+                    input_len as u32,
+                    &TEST_KEY,
+                    counter,
+                    flags,
+                    &mut portable_output[..output_len],
+                );
+                let mut test_output = [0xff; MAX_OUTPUT_LEN];
+                test_impl.xof(
+                    &input_block,
+                    input_len as u32,
+                    &TEST_KEY,
+                    counter,
+                    flags,
+                    &mut test_output[..output_len],
+                );
+                assert_eq!(portable_output, test_output);
+
+                // Double check that the implementation didn't overwrite.
+                assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
+
+                // The first XOR cancels out the output.
+                test_impl.xof_xor(
+                    &input_block,
+                    input_len as u32,
+                    &TEST_KEY,
+                    counter,
+                    flags,
+                    &mut test_output[..output_len],
+                );
+                assert!(test_output[..output_len].iter().all(|&b| b == 0));
+                assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
+
+                // The second XOR restores the output.
+                test_impl.xof_xor(
+                    &input_block,
+                    input_len as u32,
+                    &TEST_KEY,
+                    counter,
+                    flags,
+                    &mut test_output[..output_len],
+                );
+                assert_eq!(portable_output, test_output);
+                assert!(test_output[output_len..].iter().all(|&b| b == 0xff));
+            }
+        }
+    }
+}
+
+pub fn test_xof_vs_reference(test_impl: &Implementation) {
+    let input = b"hello world";
+    let mut input_block = [0; BLOCK_LEN];
+    input_block[..input.len()].copy_from_slice(input);
+
+    const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
+    let mut ref_output = [0; MAX_OUTPUT_LEN];
+    let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
+    ref_hasher.update(input);
+    ref_hasher.finalize(&mut ref_output);
+
+    // Try equal to and partway through every whole number of output blocks.
+    let mut output_lengths = vec![0, 1, 31];
+    let mut next_len = BLOCK_LEN;
+    loop {
+        output_lengths.push(next_len);
+        if next_len == MAX_OUTPUT_LEN {
+            break;
+        }
+        output_lengths.push(next_len + 31);
+        next_len += BLOCK_LEN;
+    }
+
+    for output_len in output_lengths {
+        dbg!(output_len);
+        let mut test_output = [0; MAX_OUTPUT_LEN];
+        test_impl.xof(
+            &input_block,
+            input.len() as u32,
+            &TEST_KEY,
+            0,
+            KEYED_HASH | CHUNK_START | CHUNK_END,
+            &mut test_output[..output_len],
+        );
+        assert_eq!(ref_output[..output_len], test_output[..output_len]);
+
+        // Double check that the implementation didn't overwrite.
+        assert!(test_output[output_len..].iter().all(|&b| b == 0));
+
+        // Do it again starting from block 1.
+        if output_len >= BLOCK_LEN {
+            test_impl.xof(
+                &input_block,
+                input.len() as u32,
+                &TEST_KEY,
+                1,
+                KEYED_HASH | CHUNK_START | CHUNK_END,
+                &mut test_output[..output_len - BLOCK_LEN],
+            );
+            assert_eq!(
+                ref_output[BLOCK_LEN..output_len],
+                test_output[..output_len - BLOCK_LEN],
+            );
+        }
+    }
+}
+
+pub fn test_universal_hash_vs_portable(test_impl: &Implementation) {
+    const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN;
+    let mut input_buf = [0; MAX_INPUT_LEN];
+    paint_test_input(&mut input_buf);
+    // Try equal to and partway through every whole number of input blocks.
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_output = portable::implementation().universal_hash( + &input_buf[..input_len], + &TEST_KEY, + counter, + ); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); + assert_eq!(portable_output, test_output); + } + } +} + +fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { + // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended + // output to seek to a block. + const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; + assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); + let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; + let mut result = [0u8; UNIVERSAL_HASH_LEN]; + let mut block_start = 0; + while block_start < input.len() { + let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); + let mut ref_hasher = reference_impl::Hasher::new_keyed(key); + ref_hasher.update(&input[block_start..block_start + block_len]); + ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); + for byte_index in 0..UNIVERSAL_HASH_LEN { + result[byte_index] ^= output_buffer[block_start + byte_index]; + } + block_start += BLOCK_LEN; + } + result +} + +pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. + let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); + assert_eq!(ref_output, test_output); + } +} From 6e519ea6b7e157ef1da89e39b660fe2aab8f6dcf Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 17:54:09 -0800 Subject: [PATCH 15/18] configure no_std for guts, but not for testing --- rust/guts/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs index 67f7a05..3760fa8 100644 --- a/rust/guts/src/lib.rs +++ b/rust/guts/src/lib.rs @@ -1,3 +1,6 @@ +// Tests always require libstd. 
+#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] + use core::cmp; use core::marker::PhantomData; use core::mem; From 1ca383ba9b1aa2c2508b15d89f3048be10cd9def Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 17:56:41 -0800 Subject: [PATCH 16/18] add guts testing to CI --- .github/workflows/ci.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ff1199..8c31d4d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,6 +131,17 @@ jobs: run: cargo test working-directory: ./reference_impl + # the new guts crate + - name: guts test + run: cargo test --all-features + working-directory: ./rust/guts + - name: guts no_std build + run: cargo build --no-default-features + working-directory: ./rust/guts + - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd + run: cargo test --no-default-features + working-directory: ./rust/guts + b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} From 1a6c1e2037c3b0221e57bcba9e02b7bb5f29f067 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 19:43:07 -0800 Subject: [PATCH 17/18] guts readme updates --- rust/guts/readme.md | 80 +++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/rust/guts/readme.md b/rust/guts/readme.md index a1adbf1..4957816 100644 --- a/rust/guts/readme.md +++ b/rust/guts/readme.md @@ -2,51 +2,65 @@ ## Introduction -This crate contains low-level, high-performance, platform-specific -implementations of the BLAKE3 compression function. This API is complicated and -unsafe, and this crate will never have a stable release. For the standard -BLAKE3 hash function, see the [`blake3`](https://crates.io/crates/blake3) -crate, which depends on this one. +This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains +low-level, high-performance, platform-specific implementations of the BLAKE3 +compression function. This API is complicated and unsafe, and this crate will +never have a stable release. Most callers should instead use the +[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend +on this one internally. -The most important ingredient in a high-performance implementation of BLAKE3 is -parallelism. The BLAKE3 tree structure lets us hash different parts of the tree -in parallel, and modern computers have a _lot_ of parallelism to offer. -Sometimes that means using multiple threads running on multiple cores, but -multithreading isn't appropriate for all applications, and it's not the usual -default for library APIs. More commonly, BLAKE3 implementations use SIMD -instructions ("Single Instruction Multiple Data") to improve the performance of -a single thread. When we do use multithreading, the performance benefits -multiply. +The code you see here (as of January 2024) is an early stage of a large planned +refactor. The motivation for this refactor is a couple of missing features in +both the Rust and C implementations: -The tricky thing about SIMD is that each instruction set works differently. -Instead of writing portable code once and letting the compiler do most of the -optimization work, we need to write platform-specific implementations, and -sometimes more than one per platform. 
We maintain *four* different -implementations on x86 alone (targeting SSE2, SSE4.1, AVX2, and AVX-512), in -addition to ARM NEON and the RISC-V vector extensions. In the future we might -add ARM SVE2. +- The output side + ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) + in Rust) doesn't take advantage of the most important SIMD optimizations that + compute multiple blocks in parallel. This blocks any project that wants to + use the BLAKE3 XOF as a stream cipher + ([[1]](https://github.com/oconnor663/bessie), + [[2]](https://github.com/oconnor663/blake3_aead)). +- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need + interior nodes of the tree also don't get those SIMD optimizations. They have + to use a slow, minimalistic, unstable, doc-hidden module [(also called + `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). -All of that means a lot of duplicated logic and maintenance. So while the main -goal of this API is high performance, it's also important to keep the API as -small and simple as possible. Higher level details like the "CV stack", input -buffering, and multithreading are handled by portable code in the main `blake3` -crate. These are just building blocks. +The difficulty with adding those features is that they require changes to all +of our optimized assembly and C intrinsics code. That's a couple dozen +different files that are large, platform-specific, difficult to understand, and +full of duplicated code. The higher-level Rust and C implementations of BLAKE3 +both depend on these files and will need to coordinate changes. -## The private API +At the same time, it won't be long before we add support for more platforms: -This is the API that each platform reimplements. It's completely `unsafe`, -inputs and outputs are allowed to alias, and bounds checking is the caller's -responsibility. +- RISCV vector extensions +- ARM SVE +- WebAssembly SIMD + +It's important to get this refactor done before new platforms make it even +harder to do. + +## The private guts API + +This is the API that each platform reimplements, so we want it to be as simple +as possible apart from the high-performance work it needs to do. It's +completely `unsafe`, and inputs and outputs are raw pointers that are allowed +to alias (this matters for `hash_parents`, see below). - `degree` - `compress` + - The single compression function, for short inputs and odd-length tails. - `hash_chunks` - `hash_parents` - `xof` - `xof_xor` + - As `xof` but XOR'ing the result into the output buffer. - `universal_hash` + - This is a new construction specifically to support + [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some + implementations might just stub it out with portable code. -## The public API +## The public guts API This is the API that this crate exposes to callers, i.e. to the main `blake3` crate. It's a thin, portable layer on top of the private API above. The Rust @@ -56,7 +70,11 @@ version of this API is memory-safe. - `compress` - `hash_chunks` - `hash_parents` + - This handles most levels of the tree, where we keep hashing SIMD_DEGREE + parents at a time. - `reduce_parents` + - This uses the same `hash_parents` private API, but it handles the top + levels of the tree where we reduce in-place to the root parent node. 
- `xof` - `xof_xor` - `universal_hash` From 5558fa46239742720d84c46edb0544732adf4db8 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 20:09:53 -0800 Subject: [PATCH 18/18] add a guts docs example --- rust/guts/src/lib.rs | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs index 3760fa8..e9b4914 100644 --- a/rust/guts/src/lib.rs +++ b/rust/guts/src/lib.rs @@ -1,3 +1,44 @@ +//! # The BLAKE3 Guts API +//! +//! See `readme.md`. +//! +//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] +//! that atomically initializes itself the first time you use it. +//! +//! # Example +//! +//! ```rust +//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; +//! +//! // Hash an input of exactly two chunks. +//! let input = [0u8; 2048]; +//! let mut outputs = TransposedVectors::new(); +//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); +//! DETECTED_IMPL.hash_chunks( +//! &input, +//! &IV_BYTES, +//! 0, // counter +//! 0, // flags +//! left_outputs, +//! ); +//! let root_node = outputs.extract_parent_node(0); +//! let hash = DETECTED_IMPL.compress( +//! &root_node, +//! 64, // block_len +//! &IV_BYTES, +//! 0, // counter +//! PARENT | ROOT, +//! ); +//! +//! // Compute the same hash using the reference implementation. +//! let mut reference_hasher = reference_impl::Hasher::new(); +//! reference_hasher.update(&input); +//! let mut expected_hash = [0u8; 32]; +//! reference_hasher.finalize(&mut expected_hash); +//! +//! assert_eq!(hash, expected_hash); +//! ``` + // Tests always require libstd. #![cfg_attr(all(not(feature = "std"), not(test)), no_std)]
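
The docs example above covers the chunk-hashing side of the public guts API. For the output side, the following sketch is an illustrative companion only, not part of any patch above. It mirrors `test_xof_vs_reference` in `rust/guts/src/test.rs`, and it assumes the crate-root exports that the tests reach via `use crate::*` (`BLOCK_LEN`, `KEYED_HASH`, `CHUNK_START`, `CHUNK_END`) alongside `DETECTED_IMPL`, plus the same `reference_impl` dev-dependency the tests and docs example use.

```rust
use blake3_guts::{BLOCK_LEN, CHUNK_END, CHUNK_START, DETECTED_IMPL, KEYED_HASH};

fn main() {
    // A short keyed input that fits in a single block of a single chunk.
    let key = *b"whats the Elvish word for friend";
    let input = b"hello world";
    let mut input_block = [0u8; BLOCK_LEN];
    input_block[..input.len()].copy_from_slice(input);

    // Stream four blocks of extended output, starting at block counter 0.
    let mut xof_output = [0u8; 4 * BLOCK_LEN];
    DETECTED_IMPL.xof(
        &input_block,
        input.len() as u32, // block_len
        &key,
        0, // counter
        KEYED_HASH | CHUNK_START | CHUNK_END,
        &mut xof_output[..],
    );

    // The reference implementation's keyed XOF produces the same bytes.
    let mut expected = [0u8; 4 * BLOCK_LEN];
    let mut ref_hasher = reference_impl::Hasher::new_keyed(&key);
    ref_hasher.update(input);
    ref_hasher.finalize(&mut expected);
    assert_eq!(&xof_output[..], &expected[..]);

    // universal_hash condenses the input into a 16-byte keyed output.
    let _tag: [u8; 16] = DETECTED_IMPL.universal_hash(input, &key, 0);
}
```

As in `test_xof_vs_reference`, the `CHUNK_START | CHUNK_END` flags frame the one-block input as a whole chunk; elsewhere the tests pass `&IV_BYTES` and omit `KEYED_HASH` for unkeyed hashing.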