From a7579d30ad16c19306cedeeacd919e319aff1089 Mon Sep 17 00:00:00 2001
From: Jack O'Connor <oconnor663@gmail.com>
Date: Wed, 8 Jan 2020 22:51:17 -0500
Subject: [PATCH] merge BLAKE3-c into this repo

This is commit 4476d9da0e370993823e7ad17592b84e905afd76 of
https://github.com/veorq/BLAKE3-c.
---
 c/.github/workflows/ci.yml |   12 +
 c/.gitignore               |    1 +
 c/LICENSE                  |  330 ++++++++++
 c/Makefile                 |   25 +
 c/README.md                |   28 +
 c/blake3.c                 |  307 +++++++++
 c/blake3.h                 |   35 ++
 c/blake3_avx2.c            |  316 ++++++++++
 c/blake3_avx512.c          | 1201 ++++++++++++++++++++++++++++++++++++
 c/blake3_dispatch.c        |  263 ++++++++
 c/blake3_impl.h            |   97 +++
 c/blake3_neon.c            |  346 +++++++++++
 c/blake3_portable.c        |  168 +++++
 c/blake3_sse41.c           |  554 +++++++++++++++++
 c/main.c                   |  147 +++++
 c/test.py                  |   96 +++
 c/test_vectors.json        |  132 ++++
 17 files changed, 4058 insertions(+)
 create mode 100644 c/.github/workflows/ci.yml
 create mode 100644 c/.gitignore
 create mode 100644 c/LICENSE
 create mode 100644 c/Makefile
 create mode 100644 c/README.md
 create mode 100644 c/blake3.c
 create mode 100644 c/blake3.h
 create mode 100644 c/blake3_avx2.c
 create mode 100644 c/blake3_avx512.c
 create mode 100644 c/blake3_dispatch.c
 create mode 100644 c/blake3_impl.h
 create mode 100644 c/blake3_neon.c
 create mode 100644 c/blake3_portable.c
 create mode 100644 c/blake3_sse41.c
 create mode 100644 c/main.c
 create mode 100755 c/test.py
 create mode 100644 c/test_vectors.json

diff --git a/c/.github/workflows/ci.yml b/c/.github/workflows/ci.yml
new file mode 100644
index 0000000..7826402
--- /dev/null
+++ b/c/.github/workflows/ci.yml
@@ -0,0 +1,12 @@
+name: tests
+
+on: [push]
+
+jobs:
+  x86_tests:
+    name: x86 tests
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - run: make test
diff --git a/c/.gitignore b/c/.gitignore
new file mode 100644
index 0000000..00e3bd5
--- /dev/null
+++ b/c/.gitignore
@@ -0,0 +1 @@
+blake3
diff --git a/c/LICENSE b/c/LICENSE
new file mode 100644
index 0000000..4a8008a
--- /dev/null
+++ b/c/LICENSE
@@ -0,0 +1,330 @@
+This work is released into the public domain with CC0 1.0. Alternatively, it is
+licensed under the Apache License 2.0.
+
+-------------------------------------------------------------------------------
+
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
+
+-------------------------------------------------------------------------------
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2019 Jack O'Connor
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/c/Makefile b/c/Makefile
new file mode 100644
index 0000000..f79f0e2
--- /dev/null
+++ b/c/Makefile
@@ -0,0 +1,25 @@
+NAME=blake3
+CC=gcc
+CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic
+
+all: blake3.c blake3_dispatch.c blake3_portable.c main.c blake3_sse41.o blake3_avx2.o blake3_avx512.o
+	$(CC) $(CFLAGS) $^ -o $(NAME)
+
+blake3_sse41.o: blake3_sse41.c
+	$(CC) $(CFLAGS) -c $^ -o $@ -msse4.1 -D BLAKE3_USE_SSE41
+
+blake3_avx2.o: blake3_avx2.c # blake3_sse41.c
+	$(CC) $(CFLAGS) -c $^ -o $@ -mavx2 -D BLAKE3_USE_SSE41 -D BLAKE3_USE_AVX2
+
+blake3_avx512.o: blake3_avx512.c
+	$(CC) $(CFLAGS) -c $^ -o $@ -mavx512f -mavx512vl -D BLAKE3_USE_SSE41 -D BLAKE3_USE_AVX2 -D BLAKE3_USE_AVX512
+
+blake3_neon.o: blake3_neon.c
+	$(CC) $(CFLAGS) -c $^ -o $@ -D BLAKE3_USE_NEON
+
+test: CFLAGS += -DBLAKE3_TESTING
+test: all
+	./test.py
+
+clean: 
+	rm -f $(NAME) *.o
\ No newline at end of file
diff --git a/c/README.md b/c/README.md
new file mode 100644
index 0000000..3c98c9b
--- /dev/null
+++ b/c/README.md
@@ -0,0 +1,28 @@
+# BLAKE3-c [![Actions Status](https://github.com/veorq/BLAKE3-c/workflows/tests/badge.svg)](https://github.com/veorq/BLAKE3-c/actions)
+
+A very rough initial implementation of BLAKE3 in C. SSE4.1, AVX2,
+AVX-512, and NEON are supported, using compile-time feature selection in
+the Makefile.
+
+This implementation is simpler than the [Rust
+implementation](https://github.com/veorq/BLAKE3). It doesn't support
+multithreading, and it doesn't parallelize parent hashes, so throughput
+is lower.
+
+TODO:
+- CI testing for AVX-512 and NEON.
+- Cross-platform build, e.g. Windows.
+- Dynamic CPU feature detection, at least for x86.
+
+Example usage:
+
+```bash
+$ make avx2
+$ head -c 1000000 /dev/urandom | ./blake3
+43f2cae3cfd7678bc3a3ebdbf170608d19d5ebaad23e9d06291dba3269853608
+$ head -c 1000000 /dev/urandom | ./blake3 --length 50
+4fc0ee74a60aa77fb699821997498fd93f1a98bd03eaf2a7969c4b35fb742c233a7a161fd2a431605f6e92dcf4cd7d052102
+$ head -c 1000000 /dev/urandom | ./blake3 --keyed 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef
+8aee87b232fe90b042bf9119591e24409763a268139ff157d20021003e314064
+
+```
diff --git a/c/blake3.c b/c/blake3.c
new file mode 100644
index 0000000..a6561e8
--- /dev/null
+++ b/c/blake3.c
@@ -0,0 +1,307 @@
+// NB: This is only for benchmarking. The guy who wrote this file hasn't
+// touched C since college. Please don't use this code in production.
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "blake3.h"
+#include "blake3_impl.h"
+
+INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
+                             uint8_t flags) {
+  memcpy(self->cv, key, BLAKE3_KEY_LEN);
+  self->chunk_counter = 0;
+  memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+  self->buf_len = 0;
+  self->blocks_compressed = 0;
+  self->flags = flags;
+}
+
+INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
+                              uint64_t chunk_counter) {
+  memcpy(self->cv, key, BLAKE3_KEY_LEN);
+  self->chunk_counter = chunk_counter;
+  self->blocks_compressed = 0;
+  memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+  self->buf_len = 0;
+}
+
+INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
+  return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
+         ((size_t)self->buf_len);
+}
+
+INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
+                                   const uint8_t *input, size_t input_len) {
+  size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
+  if (take > input_len) {
+    take = input_len;
+  }
+  uint8_t *dest = self->buf + ((size_t)self->buf_len);
+  memcpy(dest, input, take);
+  self->buf_len += (uint8_t)take;
+  return take;
+}
+
+INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
+  if (self->blocks_compressed == 0) {
+    return CHUNK_START;
+  } else {
+    return 0;
+  }
+}
+
+typedef struct {
+  uint32_t input_cv[8];
+  uint64_t counter;
+  uint8_t block[BLAKE3_BLOCK_LEN];
+  uint8_t block_len;
+  uint8_t flags;
+} output_t;
+
+INLINE output_t make_output(const uint32_t input_cv[8],
+                            const uint8_t block[BLAKE3_BLOCK_LEN],
+                            uint8_t block_len, uint64_t counter,
+                            uint8_t flags) {
+  output_t ret;
+  memcpy(ret.input_cv, input_cv, 32);
+  memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
+  ret.block_len = block_len;
+  ret.counter = counter;
+  ret.flags = flags;
+  return ret;
+}
+
+// Chaining values within a given chunk (specifically the compress_in_place
+// interface) are represented as words. This avoids unnecessary bytes<->words
+// conversion overhead in the portable implementation. However, the hash_many
+// interface handles both user input and parent node blocks, so it accepts
+// bytes. For that reason, chaining values in the CV stack are represented as
+// bytes.
+INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
+  uint32_t cv_words[8];
+  memcpy(cv_words, self->input_cv, 32);
+  blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter,
+                    self->flags);
+  memcpy(cv, cv_words, 32);
+}
+
+INLINE void output_root_bytes(const output_t *self, uint8_t *out,
+                              size_t out_len) {
+  uint64_t output_block_counter = 0;
+  uint8_t wide_buf[64];
+  while (out_len > 0) {
+    blake3_compress_xof(self->input_cv, self->block, self->block_len,
+                 output_block_counter, self->flags | ROOT, wide_buf);
+    size_t memcpy_len;
+    if (out_len > 64) {
+      memcpy_len = 64;
+    } else {
+      memcpy_len = out_len;
+    }
+    memcpy(out, wide_buf, memcpy_len);
+    out += memcpy_len;
+    out_len -= memcpy_len;
+    output_block_counter += 1;
+  }
+}
+
+INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
+                               size_t input_len) {
+  if (self->buf_len > 0) {
+    size_t take = chunk_state_fill_buf(self, input, input_len);
+    input += take;
+    input_len -= take;
+    if (input_len > 0) {
+      blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN,
+                        self->chunk_counter,
+                        self->flags | chunk_state_maybe_start_flag(self));
+      self->blocks_compressed += 1;
+      self->buf_len = 0;
+      memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+    }
+  }
+
+  while (input_len > BLAKE3_BLOCK_LEN) {
+    blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter,
+                      self->flags | chunk_state_maybe_start_flag(self));
+    self->blocks_compressed += 1;
+    input += BLAKE3_BLOCK_LEN;
+    input_len -= BLAKE3_BLOCK_LEN;
+  }
+
+  size_t take = chunk_state_fill_buf(self, input, input_len);
+  input += take;
+  input_len -= take;
+}
+
+INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
+  uint8_t block_flags =
+      self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
+  return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
+                     block_flags);
+}
+
+INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
+                              const uint32_t key[8], uint8_t flags) {
+  return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
+}
+
+INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
+                             uint8_t flags) {
+  memcpy(self->key, key, BLAKE3_KEY_LEN);
+  chunk_state_init(&self->chunk, key, flags);
+  self->cv_stack_len = 0;
+}
+
+void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
+
+void blake3_hasher_init_keyed(blake3_hasher *self,
+                              const uint8_t key[BLAKE3_KEY_LEN]) {
+  uint32_t key_words[8];
+  load_key_words(key, key_words);
+  hasher_init_base(self, key_words, KEYED_HASH);
+}
+
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
+  blake3_hasher context_hasher;
+  hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
+  blake3_hasher_update(&context_hasher, context, strlen(context));
+  uint8_t context_key[BLAKE3_KEY_LEN];
+  blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
+  uint32_t context_key_words[8];
+  load_key_words(context_key, context_key_words);
+  hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
+}
+
+INLINE bool hasher_needs_merge(const blake3_hasher *self,
+                               uint64_t total_chunks) {
+  return self->cv_stack_len > popcnt(total_chunks);
+}
+
+INLINE void hasher_merge_parent(blake3_hasher *self) {
+  size_t parent_block_start =
+      (((size_t)self->cv_stack_len) - 2) * BLAKE3_OUT_LEN;
+  output_t output = parent_output(&self->cv_stack[parent_block_start],
+                                  self->key, self->chunk.flags);
+  output_chaining_value(&output, &self->cv_stack[parent_block_start]);
+  self->cv_stack_len -= 1;
+}
+
+INLINE void hasher_push_chunk_cv(blake3_hasher *self,
+                                 uint8_t cv[BLAKE3_OUT_LEN],
+                                 uint64_t chunk_counter) {
+  assert(self->cv_stack_len < BLAKE3_MAX_DEPTH);
+  while (hasher_needs_merge(self, chunk_counter)) {
+    hasher_merge_parent(self);
+  }
+  memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], cv,
+         BLAKE3_OUT_LEN);
+  self->cv_stack_len += 1;
+}
+
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+                          size_t input_len) {
+  const uint8_t *input_bytes = (const uint8_t *)input;
+
+  // If we already have a partial chunk, or if this is the very first chunk
+  // (and it could be the root), we need to add bytes to the chunk state.
+  bool is_first_chunk = self->chunk.chunk_counter == 0;
+  bool maybe_root = is_first_chunk && input_len == BLAKE3_CHUNK_LEN;
+  if (maybe_root || chunk_state_len(&self->chunk) > 0) {
+    size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
+    if (take > input_len) {
+      take = input_len;
+    }
+    chunk_state_update(&self->chunk, input_bytes, take);
+    input_bytes += take;
+    input_len -= take;
+    // If we've filled the current chunk and there's more coming, finalize this
+    // chunk and proceed. In this case we know it's not the root.
+    if (input_len > 0) {
+      output_t output = chunk_state_output(&self->chunk);
+      uint8_t chunk_cv[32];
+      output_chaining_value(&output, chunk_cv);
+      hasher_push_chunk_cv(self, chunk_cv, self->chunk.chunk_counter);
+      chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
+    } else {
+      return;
+    }
+  }
+
+  // Hash as many whole chunks as we can, without buffering anything. At this
+  // point we know none of them can be the root.
+  uint8_t out[BLAKE3_OUT_LEN * BLAKE3_MAX_SIMD_DEGREE];
+  const uint8_t *chunks[BLAKE3_MAX_SIMD_DEGREE];
+  size_t num_chunks = 0;
+  while (input_len >= BLAKE3_CHUNK_LEN) {
+    while (input_len >= BLAKE3_CHUNK_LEN &&
+           num_chunks < BLAKE3_MAX_SIMD_DEGREE) {
+      chunks[num_chunks] = input_bytes;
+      input_bytes += BLAKE3_CHUNK_LEN;
+      input_len -= BLAKE3_CHUNK_LEN;
+      num_chunks += 1;
+    }
+    blake3_hash_many(chunks, num_chunks, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
+              self->key, self->chunk.chunk_counter, true, self->chunk.flags,
+              CHUNK_START, CHUNK_END, out);
+    for (size_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) {
+      // The chunk state is empty here, but it stores the counter of the next
+      // chunk hash we need to push. Use that counter, and then move it forward.
+      hasher_push_chunk_cv(self, &out[chunk_index * BLAKE3_OUT_LEN],
+                           self->chunk.chunk_counter);
+      self->chunk.chunk_counter += 1;
+    }
+    num_chunks = 0;
+  }
+
+  // If there's any remaining input less than a full chunk, add it to the chunk
+  // state. In that case, also do a final merge loop to make sure the subtree
+  // stack doesn't contain any unmerged pairs. The remaining input means we
+  // know these merges are non-root. This merge loop isn't strictly necessary
+  // here, because hasher_push_chunk_cv already does its own merge loop, but it
+  // simplifies blake3_hasher_finalize below.
+  if (input_len > 0) {
+    while (hasher_needs_merge(self, self->chunk.chunk_counter)) {
+      hasher_merge_parent(self);
+    }
+    chunk_state_update(&self->chunk, input_bytes, input_len);
+  }
+}
+
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+                            size_t out_len) {
+  // If the subtree stack is empty, then the current chunk is the root.
+  if (self->cv_stack_len == 0) {
+    output_t output = chunk_state_output(&self->chunk);
+    output_root_bytes(&output, out, out_len);
+    return;
+  }
+  // If there are any bytes in the chunk state, finalize that chunk and do a
+  // roll-up merge between that chunk hash and every subtree in the stack. In
+  // this case, the extra merge loop at the end of blake3_hasher_update
+  // guarantees that none of the subtrees in the stack need to be merged with
+  // each other first. Otherwise, if there are no bytes in the chunk state,
+  // then the top of the stack is a chunk hash, and we start the merge from
+  // that.
+  output_t output;
+  size_t cvs_remaining;
+  if (chunk_state_len(&self->chunk) > 0) {
+    cvs_remaining = self->cv_stack_len;
+    output = chunk_state_output(&self->chunk);
+  } else {
+    // There are always at least 2 CVs in the stack in this case.
+    cvs_remaining = self->cv_stack_len - 2;
+    output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
+                           self->chunk.flags);
+  }
+  while (cvs_remaining > 0) {
+    cvs_remaining -= 1;
+    uint8_t parent_block[BLAKE3_BLOCK_LEN];
+    memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
+    output_chaining_value(&output, &parent_block[32]);
+    output = parent_output(parent_block, self->key, self->chunk.flags);
+  }
+  output_root_bytes(&output, out, out_len);
+}
diff --git a/c/blake3.h b/c/blake3.h
new file mode 100644
index 0000000..c3cf6be
--- /dev/null
+++ b/c/blake3.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <stdint.h>
+
+#define BLAKE3_KEY_LEN 32
+#define BLAKE3_OUT_LEN 32
+#define BLAKE3_BLOCK_LEN 64
+#define BLAKE3_CHUNK_LEN 1024
+#define BLAKE3_MAX_DEPTH 54
+#define BLAKE3_MAX_SIMD_DEGREE 16
+
+typedef struct {
+  uint32_t cv[8];
+  uint64_t chunk_counter;
+  uint8_t buf[BLAKE3_BLOCK_LEN];
+  uint8_t buf_len;
+  uint8_t blocks_compressed;
+  uint8_t flags;
+} blake3_chunk_state;
+
+typedef struct {
+  uint32_t key[8];
+  blake3_chunk_state chunk;
+  uint8_t cv_stack_len;
+  uint8_t cv_stack[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN];
+} blake3_hasher;
+
+void blake3_hasher_init(blake3_hasher *self);
+void blake3_hasher_init_keyed(blake3_hasher *self,
+                              const uint8_t key[BLAKE3_KEY_LEN]);
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+                          size_t input_len);
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+                            size_t out_len);
diff --git a/c/blake3_avx2.c b/c/blake3_avx2.c
new file mode 100644
index 0000000..0300505
--- /dev/null
+++ b/c/blake3_avx2.c
@@ -0,0 +1,316 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 8
+
+INLINE __m256i loadu(const uint8_t src[32]) {
+  return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE void storeu(__m256i src, uint8_t dest[16]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
+
+INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
+
+INLINE __m256i set8(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
+                    uint32_t f, uint32_t g, uint32_t h) {
+  return _mm256_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d,
+                           (int32_t)e, (int32_t)f, (int32_t)g, (int32_t)h);
+}
+
+INLINE __m256i rot16(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
+                         13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m256i rot12(__m256i x) {
+  return xorv(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m256i rot8(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
+                         12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m256i rot7(__m256i x) {
+  return xorv(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+}
+
+INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes.
+  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
+  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
+  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m256i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
+  out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
+  out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
+  out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
+  out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
+  out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
+  out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
+  out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
+  out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
+  out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
+  out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
+  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
+  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[8]);
+}
+
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m256i *out_low, __m256i *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set8(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)),
+      counter_low(counter + (mask & 4)), counter_low(counter + (mask & 5)),
+      counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)));
+  *out_high = set8(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)),
+      counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)),
+      counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)));
+}
+
+void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  __m256i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m256i block_flags_vec = set1(block_flags);
+    __m256i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m256i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(h_vecs);
+  storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
+  storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
+  storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
+  storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
+  storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
+  storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
+  storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                         increment_counter, flags, flags_start, flags_end, out);
+}
diff --git a/c/blake3_avx512.c b/c/blake3_avx512.c
new file mode 100644
index 0000000..fc754e2
--- /dev/null
+++ b/c/blake3_avx512.c
@@ -0,0 +1,1201 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu_128(const uint8_t src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE __m256i loadu_256(const uint8_t src[32]) {
+  return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE __m512i loadu_512(const uint8_t src[64]) {
+  return _mm512_loadu_si512((const __m512i *)src);
+}
+
+INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
+
+INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
+
+INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
+
+INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
+
+INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
+
+INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
+
+INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
+
+INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
+
+INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
+
+INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
+
+INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
+
+INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
+
+INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
+
+INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
+
+INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
+
+INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
+
+INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
+
+/*
+ * ----------------------------------------------------------------------------
+ * compress_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = xor_128(*row3, *row0);
+  *row3 = rot16_128(*row3);
+  *row2 = add_128(*row2, *row3);
+  *row1 = xor_128(*row1, *row2);
+  *row1 = rot12_128(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = xor_128(*row3, *row0);
+  *row3 = rot8_128(*row3);
+  *row2 = add_128(*row2, *row3);
+  *row1 = xor_128(*row1, *row2);
+  *row1 = rot7_128(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
+
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  rows[0] = loadu_128((uint8_t *)&cv[0]);
+  rows[1] = loadu_128((uint8_t *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
+
+  __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+                                const uint8_t block[BLAKE3_BLOCK_LEN],
+                                uint8_t block_len, uint64_t counter,
+                                uint8_t flags, uint8_t out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
+  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
+  storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
+  storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
+}
+
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+                                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                                     uint8_t block_len, uint64_t counter,
+                                     uint8_t flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
+  storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(__m128i vecs[4]) {
+  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, __m128i out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           __m128i *out_lo, __m128i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m256i mask_vec = _mm256_set1_epi64x(mask);
+  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
+  deltas = _mm256_and_si256(mask_vec, deltas);
+  __m256i counters =
+      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
+  *out_lo = _mm256_cvtepi64_epi32(counters);
+  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
+}
+
+void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
+                         const uint32_t key[8], uint64_t counter,
+                         bool increment_counter, uint8_t flags,
+                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  __m128i h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1_128(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash8_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_256(v[0], v[4]);
+  v[1] = add_256(v[1], v[5]);
+  v[2] = add_256(v[2], v[6]);
+  v[3] = add_256(v[3], v[7]);
+  v[12] = xor_256(v[12], v[0]);
+  v[13] = xor_256(v[13], v[1]);
+  v[14] = xor_256(v[14], v[2]);
+  v[15] = xor_256(v[15], v[3]);
+  v[12] = rot16_256(v[12]);
+  v[13] = rot16_256(v[13]);
+  v[14] = rot16_256(v[14]);
+  v[15] = rot16_256(v[15]);
+  v[8] = add_256(v[8], v[12]);
+  v[9] = add_256(v[9], v[13]);
+  v[10] = add_256(v[10], v[14]);
+  v[11] = add_256(v[11], v[15]);
+  v[4] = xor_256(v[4], v[8]);
+  v[5] = xor_256(v[5], v[9]);
+  v[6] = xor_256(v[6], v[10]);
+  v[7] = xor_256(v[7], v[11]);
+  v[4] = rot12_256(v[4]);
+  v[5] = rot12_256(v[5]);
+  v[6] = rot12_256(v[6]);
+  v[7] = rot12_256(v[7]);
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_256(v[0], v[4]);
+  v[1] = add_256(v[1], v[5]);
+  v[2] = add_256(v[2], v[6]);
+  v[3] = add_256(v[3], v[7]);
+  v[12] = xor_256(v[12], v[0]);
+  v[13] = xor_256(v[13], v[1]);
+  v[14] = xor_256(v[14], v[2]);
+  v[15] = xor_256(v[15], v[3]);
+  v[12] = rot8_256(v[12]);
+  v[13] = rot8_256(v[13]);
+  v[14] = rot8_256(v[14]);
+  v[15] = rot8_256(v[15]);
+  v[8] = add_256(v[8], v[12]);
+  v[9] = add_256(v[9], v[13]);
+  v[10] = add_256(v[10], v[14]);
+  v[11] = add_256(v[11], v[15]);
+  v[4] = xor_256(v[4], v[8]);
+  v[5] = xor_256(v[5], v[9]);
+  v[6] = xor_256(v[6], v[10]);
+  v[7] = xor_256(v[7], v[11]);
+  v[4] = rot7_256(v[4]);
+  v[5] = rot7_256(v[5]);
+  v[6] = rot7_256(v[6]);
+  v[7] = rot7_256(v[7]);
+
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_256(v[0], v[5]);
+  v[1] = add_256(v[1], v[6]);
+  v[2] = add_256(v[2], v[7]);
+  v[3] = add_256(v[3], v[4]);
+  v[15] = xor_256(v[15], v[0]);
+  v[12] = xor_256(v[12], v[1]);
+  v[13] = xor_256(v[13], v[2]);
+  v[14] = xor_256(v[14], v[3]);
+  v[15] = rot16_256(v[15]);
+  v[12] = rot16_256(v[12]);
+  v[13] = rot16_256(v[13]);
+  v[14] = rot16_256(v[14]);
+  v[10] = add_256(v[10], v[15]);
+  v[11] = add_256(v[11], v[12]);
+  v[8] = add_256(v[8], v[13]);
+  v[9] = add_256(v[9], v[14]);
+  v[5] = xor_256(v[5], v[10]);
+  v[6] = xor_256(v[6], v[11]);
+  v[7] = xor_256(v[7], v[8]);
+  v[4] = xor_256(v[4], v[9]);
+  v[5] = rot12_256(v[5]);
+  v[6] = rot12_256(v[6]);
+  v[7] = rot12_256(v[7]);
+  v[4] = rot12_256(v[4]);
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_256(v[0], v[5]);
+  v[1] = add_256(v[1], v[6]);
+  v[2] = add_256(v[2], v[7]);
+  v[3] = add_256(v[3], v[4]);
+  v[15] = xor_256(v[15], v[0]);
+  v[12] = xor_256(v[12], v[1]);
+  v[13] = xor_256(v[13], v[2]);
+  v[14] = xor_256(v[14], v[3]);
+  v[15] = rot8_256(v[15]);
+  v[12] = rot8_256(v[12]);
+  v[13] = rot8_256(v[13]);
+  v[14] = rot8_256(v[14]);
+  v[10] = add_256(v[10], v[15]);
+  v[11] = add_256(v[11], v[12]);
+  v[8] = add_256(v[8], v[13]);
+  v[9] = add_256(v[9], v[14]);
+  v[5] = xor_256(v[5], v[10]);
+  v[6] = xor_256(v[6], v[11]);
+  v[7] = xor_256(v[7], v[8]);
+  v[4] = xor_256(v[4], v[9]);
+  v[5] = rot7_256(v[5]);
+  v[6] = rot7_256(v[6]);
+  v[7] = rot7_256(v[7]);
+  v[4] = rot7_256(v[4]);
+}
+
+INLINE void transpose_vecs_256(__m256i vecs[8]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes.
+  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
+  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
+  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
+                                size_t block_offset, __m256i out[16]) {
+  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
+  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
+  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
+  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
+  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
+  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
+  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
+  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
+  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
+  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
+  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
+  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
+  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
+  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
+  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
+  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
+  transpose_vecs_256(&out[0]);
+  transpose_vecs_256(&out[8]);
+}
+
+INLINE void load_counters8(uint64_t counter, bool increment_counter,
+                           __m256i *out_lo, __m256i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m512i mask_vec = _mm512_set1_epi64(mask);
+  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  deltas = _mm512_and_si512(mask_vec, deltas);
+  __m512i counters =
+      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
+  *out_lo = _mm512_cvtepi64_epi32(counters);
+  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
+}
+
+void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
+                         const uint32_t key[8], uint64_t counter,
+                         bool increment_counter, uint8_t flags,
+                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  __m256i h_vecs[8] = {
+      set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
+      set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
+  };
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters8(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
+    __m256i block_flags_vec = set1_256(block_flags);
+    __m256i msg_vecs[16];
+    transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m256i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn8(v, msg_vecs, 0);
+    round_fn8(v, msg_vecs, 1);
+    round_fn8(v, msg_vecs, 2);
+    round_fn8(v, msg_vecs, 3);
+    round_fn8(v, msg_vecs, 4);
+    round_fn8(v, msg_vecs, 5);
+    round_fn8(v, msg_vecs, 6);
+    h_vecs[0] = xor_256(v[0], v[8]);
+    h_vecs[1] = xor_256(v[1], v[9]);
+    h_vecs[2] = xor_256(v[2], v[10]);
+    h_vecs[3] = xor_256(v[3], v[11]);
+    h_vecs[4] = xor_256(v[4], v[12]);
+    h_vecs[5] = xor_256(v[5], v[13]);
+    h_vecs[6] = xor_256(v[6], v[14]);
+    h_vecs[7] = xor_256(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_256(h_vecs);
+  storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
+  storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
+  storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
+  storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
+  storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
+  storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
+  storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
+  storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash16_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_512(v[0], v[4]);
+  v[1] = add_512(v[1], v[5]);
+  v[2] = add_512(v[2], v[6]);
+  v[3] = add_512(v[3], v[7]);
+  v[12] = xor_512(v[12], v[0]);
+  v[13] = xor_512(v[13], v[1]);
+  v[14] = xor_512(v[14], v[2]);
+  v[15] = xor_512(v[15], v[3]);
+  v[12] = rot16_512(v[12]);
+  v[13] = rot16_512(v[13]);
+  v[14] = rot16_512(v[14]);
+  v[15] = rot16_512(v[15]);
+  v[8] = add_512(v[8], v[12]);
+  v[9] = add_512(v[9], v[13]);
+  v[10] = add_512(v[10], v[14]);
+  v[11] = add_512(v[11], v[15]);
+  v[4] = xor_512(v[4], v[8]);
+  v[5] = xor_512(v[5], v[9]);
+  v[6] = xor_512(v[6], v[10]);
+  v[7] = xor_512(v[7], v[11]);
+  v[4] = rot12_512(v[4]);
+  v[5] = rot12_512(v[5]);
+  v[6] = rot12_512(v[6]);
+  v[7] = rot12_512(v[7]);
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_512(v[0], v[4]);
+  v[1] = add_512(v[1], v[5]);
+  v[2] = add_512(v[2], v[6]);
+  v[3] = add_512(v[3], v[7]);
+  v[12] = xor_512(v[12], v[0]);
+  v[13] = xor_512(v[13], v[1]);
+  v[14] = xor_512(v[14], v[2]);
+  v[15] = xor_512(v[15], v[3]);
+  v[12] = rot8_512(v[12]);
+  v[13] = rot8_512(v[13]);
+  v[14] = rot8_512(v[14]);
+  v[15] = rot8_512(v[15]);
+  v[8] = add_512(v[8], v[12]);
+  v[9] = add_512(v[9], v[13]);
+  v[10] = add_512(v[10], v[14]);
+  v[11] = add_512(v[11], v[15]);
+  v[4] = xor_512(v[4], v[8]);
+  v[5] = xor_512(v[5], v[9]);
+  v[6] = xor_512(v[6], v[10]);
+  v[7] = xor_512(v[7], v[11]);
+  v[4] = rot7_512(v[4]);
+  v[5] = rot7_512(v[5]);
+  v[6] = rot7_512(v[6]);
+  v[7] = rot7_512(v[7]);
+
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_512(v[0], v[5]);
+  v[1] = add_512(v[1], v[6]);
+  v[2] = add_512(v[2], v[7]);
+  v[3] = add_512(v[3], v[4]);
+  v[15] = xor_512(v[15], v[0]);
+  v[12] = xor_512(v[12], v[1]);
+  v[13] = xor_512(v[13], v[2]);
+  v[14] = xor_512(v[14], v[3]);
+  v[15] = rot16_512(v[15]);
+  v[12] = rot16_512(v[12]);
+  v[13] = rot16_512(v[13]);
+  v[14] = rot16_512(v[14]);
+  v[10] = add_512(v[10], v[15]);
+  v[11] = add_512(v[11], v[12]);
+  v[8] = add_512(v[8], v[13]);
+  v[9] = add_512(v[9], v[14]);
+  v[5] = xor_512(v[5], v[10]);
+  v[6] = xor_512(v[6], v[11]);
+  v[7] = xor_512(v[7], v[8]);
+  v[4] = xor_512(v[4], v[9]);
+  v[5] = rot12_512(v[5]);
+  v[6] = rot12_512(v[6]);
+  v[7] = rot12_512(v[7]);
+  v[4] = rot12_512(v[4]);
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_512(v[0], v[5]);
+  v[1] = add_512(v[1], v[6]);
+  v[2] = add_512(v[2], v[7]);
+  v[3] = add_512(v[3], v[4]);
+  v[15] = xor_512(v[15], v[0]);
+  v[12] = xor_512(v[12], v[1]);
+  v[13] = xor_512(v[13], v[2]);
+  v[14] = xor_512(v[14], v[3]);
+  v[15] = rot8_512(v[15]);
+  v[12] = rot8_512(v[12]);
+  v[13] = rot8_512(v[13]);
+  v[14] = rot8_512(v[14]);
+  v[10] = add_512(v[10], v[15]);
+  v[11] = add_512(v[11], v[12]);
+  v[8] = add_512(v[8], v[13]);
+  v[9] = add_512(v[9], v[14]);
+  v[5] = xor_512(v[5], v[10]);
+  v[6] = xor_512(v[6], v[11]);
+  v[7] = xor_512(v[7], v[8]);
+  v[4] = xor_512(v[4], v[9]);
+  v[5] = rot7_512(v[5]);
+  v[6] = rot7_512(v[6]);
+  v[7] = rot7_512(v[7]);
+  v[4] = rot7_512(v[4]);
+}
+
+// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
+#define LO_IMM8 0x88
+
+INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
+  return _mm512_shuffle_i32x4(a, b, LO_IMM8);
+}
+
+// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
+#define HI_IMM8 0xdd
+
+INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
+  return _mm512_shuffle_i32x4(a, b, HI_IMM8);
+}
+
+INLINE void transpose_vecs_512(__m512i vecs[16]) {
+  // Interleave 32-bit lanes. The _0 unpack is lanes
+  // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
+  // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
+  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
+  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
+  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
+  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
+  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
+  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
+  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
+  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
+  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
+  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
+  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
+  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
+  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
+  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
+  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
+  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
+
+  // Interleave 64-bit lates. The _0 unpack is lanes
+  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
+  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
+  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
+  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
+  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
+  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
+  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
+  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
+  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
+  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
+  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
+  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
+  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
+  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
+  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
+  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
+  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
+  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
+  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
+  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
+
+  // Interleave 128-bit lanes. The _0 unpack is
+  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
+  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
+  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
+  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
+  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
+  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
+  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
+  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
+  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
+  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
+  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
+  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
+  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
+  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
+  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
+  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
+  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
+  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
+
+  // Interleave 128-bit lanes again for the final outputs.
+  vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
+  vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
+  vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
+  vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
+  vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
+  vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
+  vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
+  vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
+  vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
+  vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
+  vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
+  vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
+  vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
+  vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
+  vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
+  vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
+}
+
+INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
+                                 size_t block_offset, __m512i out[16]) {
+  out[0] = loadu_512(&inputs[0][block_offset]);
+  out[1] = loadu_512(&inputs[1][block_offset]);
+  out[2] = loadu_512(&inputs[2][block_offset]);
+  out[3] = loadu_512(&inputs[3][block_offset]);
+  out[4] = loadu_512(&inputs[4][block_offset]);
+  out[5] = loadu_512(&inputs[5][block_offset]);
+  out[6] = loadu_512(&inputs[6][block_offset]);
+  out[7] = loadu_512(&inputs[7][block_offset]);
+  out[8] = loadu_512(&inputs[8][block_offset]);
+  out[9] = loadu_512(&inputs[9][block_offset]);
+  out[10] = loadu_512(&inputs[10][block_offset]);
+  out[11] = loadu_512(&inputs[11][block_offset]);
+  out[12] = loadu_512(&inputs[12][block_offset]);
+  out[13] = loadu_512(&inputs[13][block_offset]);
+  out[14] = loadu_512(&inputs[14][block_offset]);
+  out[15] = loadu_512(&inputs[15][block_offset]);
+  transpose_vecs_512(out);
+}
+
+INLINE void load_counters16(uint64_t counter, bool increment_counter,
+                            __m512i *out_lo, __m512i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m512i mask_vec = _mm512_set1_epi64(mask);
+  __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  deltas_a = _mm512_and_si512(mask_vec, deltas_a);
+  __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+  deltas_b = _mm512_and_si512(mask_vec, deltas_b);
+  __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a);
+  __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b);
+  __m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
+                                         22, 24, 26, 28, 30);
+  __m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
+                                         23, 25, 27, 29, 31);
+  *out_lo = _mm512_permutex2var_epi32(a, lo_indexes, b);
+  *out_hi = _mm512_permutex2var_epi32(a, hi_indexes, b);
+}
+
+void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          bool increment_counter, uint8_t flags,
+                          uint8_t flags_start, uint8_t flags_end,
+                          uint8_t *out) {
+  __m512i h_vecs[8] = {
+      set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
+      set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
+  };
+  __m512i counter_low_vec, counter_high_vec;
+  load_counters16(counter, increment_counter, &counter_low_vec,
+                  &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
+    __m512i block_flags_vec = set1_512(block_flags);
+    __m512i msg_vecs[16];
+    transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m512i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn16(v, msg_vecs, 0);
+    round_fn16(v, msg_vecs, 1);
+    round_fn16(v, msg_vecs, 2);
+    round_fn16(v, msg_vecs, 3);
+    round_fn16(v, msg_vecs, 4);
+    round_fn16(v, msg_vecs, 5);
+    round_fn16(v, msg_vecs, 6);
+    h_vecs[0] = xor_512(v[0], v[8]);
+    h_vecs[1] = xor_512(v[1], v[9]);
+    h_vecs[2] = xor_512(v[2], v[10]);
+    h_vecs[3] = xor_512(v[3], v[11]);
+    h_vecs[4] = xor_512(v[4], v[12]);
+    h_vecs[5] = xor_512(v[5], v[13]);
+    h_vecs[6] = xor_512(v[6], v[14]);
+    h_vecs[7] = xor_512(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
+  // state vectors. Pad the matrix with zeros. After transposition, store the
+  // lower half of each vector.
+  __m512i padded[16] = {
+      h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
+      h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
+      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+  };
+  transpose_vecs_512(padded);
+  storeu_256(_mm512_castsi512_si256(padded[0]), &out[0 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[1]), &out[1 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[2]), &out[2 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[3]), &out[3 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[4]), &out[4 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[5]), &out[5 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[6]), &out[6 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[7]), &out[7 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[8]), &out[8 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[9]), &out[9 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[10]), &out[10 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[11]), &out[11 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[12]), &out[12 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[13]), &out[13 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[14]), &out[14 * sizeof(__m256i)]);
+  storeu_256(_mm512_castsi512_si256(padded[15]), &out[15 * sizeof(__m256i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
+                            const uint32_t key[8], uint64_t counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                    block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 16) {
+    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                         flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 16;
+    }
+    inputs += 16;
+    num_inputs -= 16;
+    out = &out[16 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 8) {
+    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 8;
+    }
+    inputs += 8;
+    num_inputs -= 8;
+    out = &out[8 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 4) {
+    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
+                    flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
new file mode 100644
index 0000000..a896492
--- /dev/null
+++ b/c/blake3_dispatch.c
@@ -0,0 +1,263 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "blake3.h"
+
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
+#define IS_X86 
+#endif
+
+#if defined(__arm__)
+#define IS_ARM
+#endif
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <immintrin.h>
+#else
+#error "Unimplemented!"
+#endif
+#endif
+
+
+// Declarations for implementation-specific functions.
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+
+
+#if defined(IS_X86)
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+                                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                                     uint8_t block_len, uint64_t counter,
+                                     uint8_t flags);
+
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]);
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+                                const uint8_t block[BLAKE3_BLOCK_LEN],
+                                uint8_t block_len, uint64_t counter,
+                                uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out);
+#endif
+
+#if defined(IS_ARM)
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif
+
+#if defined(IS_X86)
+static uint64_t xgetbv()
+{
+#if defined(_MSC_VER)
+    return _xgetbv(0);
+#else
+    uint32_t eax=0, edx=0;
+    __asm__ volatile("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
+    return ((uint64_t)edx << 32) | eax;
+#endif
+}
+
+static void cpuid(uint32_t out[4], uint32_t id) 
+{
+#if defined(_MSC_VER)
+    __cpuid((int*)out, id);
+#else
+#if defined(__i386__) || defined(_M_IX86)
+    __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(out[0]), "=S"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id));
+#else
+    __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id));
+#endif
+#endif
+}
+
+static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) 
+{
+#if defined(_MSC_VER)
+    __cpuidex((int*)out, id, sid);
+#else
+#if defined(__i386__) || defined(_M_IX86)
+    __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" : "=a"(out[0]), "=S"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid));
+#else
+    __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid));
+#endif
+#endif
+}
+
+#endif
+
+enum cpu_feature {
+    SSE2     = 1 << 0,
+    SSSE3    = 1 << 1,
+    SSE41    = 1 << 2,
+    AVX      = 1 << 3,
+    AVX2     = 1 << 4,
+    AVX512F  = 1 << 5,
+    AVX512VL = 1 << 6,
+    /* ... */
+    UNDEFINED = 1 << 30
+};
+
+#if !defined(BLAKE3_TESTING)
+static /* Allow the variable to be controlled manually for testing */
+#endif
+enum cpu_feature g_cpu_features = UNDEFINED;
+
+#if !defined(BLAKE3_TESTING)
+static 
+#endif
+enum cpu_feature get_cpu_features()
+{
+    
+    if( g_cpu_features != UNDEFINED ) {
+        return g_cpu_features;
+    } else {
+#if defined(IS_X86)
+        uint32_t regs[4] = {0};
+        uint32_t * eax = &regs[0], * ebx = &regs[1], * ecx = &regs[2], * edx = &regs[3];
+        (void)edx;
+        enum cpu_feature features = 0;
+        cpuid(regs, 0);
+        const int max_id = *eax;
+        cpuid(regs, 1);
+    #if defined(__amd64__) || defined(_M_X64)
+        features |= SSE2;
+    #else
+        if(*edx & (1UL << 26))
+            features |= SSE2;
+    #endif
+        if(*ecx & (1UL << 0))
+            features |= SSSE3;
+        if(*ecx & (1UL << 19))
+            features |= SSE41;
+
+        if( *ecx & (1UL << 27) ) { // OSXSAVE
+            const uint64_t mask = xgetbv();
+            if( (mask & 6) == 6 ) { // SSE and AVX states
+                if(*ecx & (1UL << 28))
+                    features |= AVX;
+                if(max_id >= 7) {
+                    cpuidex(regs, 7, 0);
+                    if( *ebx & (1UL << 5) )
+                        features |= AVX2;
+                    if( (mask & 224) == 224 ) { // Opmask, ZMM_Hi256, Hi16_Zmm
+                        if( *ebx & (1UL << 31) )
+                            features |= AVX512VL;
+                        if(*ebx & (1UL << 16))
+                            features |= AVX512F;
+                    }
+                }
+            }
+        }
+        g_cpu_features = features;
+        return features;
+#elif defined(IS_ARM)
+        /* How to detect NEON? */
+        return 0;
+#else
+        return 0;
+#endif
+    }
+}
+
+void blake3_compress_in_place(uint32_t cv[8], 
+                              const uint8_t block[BLAKE3_BLOCK_LEN], 
+                              uint8_t block_len, uint64_t counter, 
+                              uint8_t flags)
+{
+    const enum cpu_feature features = get_cpu_features();
+#if defined(IS_X86)
+    if(features & AVX512VL) {
+        blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+        return;
+    } 
+    if(features & SSE41) {
+        blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+        return;
+    }
+#endif
+    blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+}
+
+void blake3_compress_xof(const uint32_t cv[8],
+                        const uint8_t block[BLAKE3_BLOCK_LEN],
+                        uint8_t block_len, uint64_t counter,
+                        uint8_t flags, uint8_t out[64])
+{
+    const enum cpu_feature features = get_cpu_features();
+#if defined(IS_X86)
+    if(features & AVX512VL) {
+        blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+        return;
+    }
+    if(features & SSE41) {
+        blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+        return;
+    }
+#endif
+    blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
+}
+
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out)
+{
+    const enum cpu_feature features = get_cpu_features();
+#if defined(IS_X86)
+    if(features & AVX512F) {
+        blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
+        return;
+    }
+    if(features & AVX2) {
+        blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
+        return;
+    }
+    if(features & SSE41) {
+        blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
+        return;
+    }
+#endif
+    blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
+}
+
diff --git a/c/blake3_impl.h b/c/blake3_impl.h
new file mode 100644
index 0000000..576ccf4
--- /dev/null
+++ b/c/blake3_impl.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if __POPCNT__
+#include <nmmintrin.h>
+#endif
+
+#include "blake3.h"
+
+// internal flags
+#define CHUNK_START 1
+#define CHUNK_END 2
+#define PARENT 4
+#define ROOT 8
+#define KEYED_HASH 16
+#define DERIVE_KEY_CONTEXT 32
+#define DERIVE_KEY_MATERIAL 64
+
+// This C implementation tries to support recent versions of GCC, Clang, and
+// MSVC.
+#if defined(_MSC_VER)
+#define INLINE __forceinline static
+#else
+#define INLINE __attribute__((always_inline)) static inline
+#endif
+
+static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
+                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
+                               0x1F83D9ABUL, 0x5BE0CD19UL};
+
+static const uint8_t MSG_SCHEDULE[7][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+// Count the number of 1 bits.
+INLINE uint8_t popcnt(uint64_t x) {
+#if __POPCNT__
+  return (uint8_t)_mm_popcnt_u64(x);
+#else
+  uint8_t count = 0;
+  while (x > 0) {
+    count += ((uint8_t)x) & 1;
+    x >>= 1;
+  }
+  return count;
+#endif
+}
+
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
+
+INLINE uint32_t counter_high(uint64_t counter) {
+  return (uint32_t)(counter >> 32);
+}
+
+INLINE uint32_t load32(const void *src) {
+  const uint8_t *p = (const uint8_t *)src;
+  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
+         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
+}
+
+INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+                           uint32_t key_words[8]) {
+  key_words[0] = load32(&key[0 * 4]);
+  key_words[1] = load32(&key[1 * 4]);
+  key_words[2] = load32(&key[2 * 4]);
+  key_words[3] = load32(&key[3 * 4]);
+  key_words[4] = load32(&key[4 * 4]);
+  key_words[5] = load32(&key[5 * 4]);
+  key_words[6] = load32(&key[6 * 4]);
+  key_words[7] = load32(&key[7 * 4]);
+}
+
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags);
+
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]);
+
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);
diff --git a/c/blake3_neon.c b/c/blake3_neon.c
new file mode 100644
index 0000000..46691f5
--- /dev/null
+++ b/c/blake3_neon.c
@@ -0,0 +1,346 @@
+#include "blake3_impl.h"
+
+#include <arm_neon.h>
+
+// TODO: This is probably incorrect for big-endian ARM. How should that work?
+INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
+  // vld1q_u32 has alignment requirements. Don't use it.
+  uint32x4_t x;
+  memcpy(&x, src, 16);
+  return x;
+}
+
+INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
+  // vst1q_u32 has alignment requirements. Don't use it.
+  memcpy(dest, &src, 16);
+}
+
+INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
+  return vaddq_u32(a, b);
+}
+
+INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
+  return veorq_u32(a, b);
+}
+
+INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
+
+INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  uint32_t array[4] = {a, b, c, d};
+  return vld1q_u32(array);
+}
+
+INLINE uint32x4_t rot16_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+}
+
+INLINE uint32x4_t rot12_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+}
+
+INLINE uint32x4_t rot8_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+}
+
+INLINE uint32x4_t rot7_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+}
+
+// TODO: compress_neon
+
+// TODO: hash2_neon
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_neon
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
+  // Individually transpose the four 2x2 sub-matrices in each corner.
+  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
+  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
+
+  // Swap the top-right and bottom-left 2x2s (which just got transposed).
+  vecs[0] =
+      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
+  vecs[1] =
+      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
+  vecs[2] =
+      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
+  vecs[3] =
+      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, uint32x4_t out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+}
+
+void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  uint32x4_t h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    uint32x4_t block_flags_vec = set1_128(block_flags);
+    uint32x4_t msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    uint32x4_t v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_neon
+ * ----------------------------------------------------------------------------
+ */
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    // TODO: Implement compress_neon. However note that according to
+    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
+    // compress_neon might not be any faster than compress_portable.
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 4) {
+    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/c/blake3_portable.c b/c/blake3_portable.c
new file mode 100644
index 0000000..9ee2f4a
--- /dev/null
+++ b/c/blake3_portable.c
@@ -0,0 +1,168 @@
+#include "blake3_impl.h"
+#include <string.h>
+
+INLINE void store32(void *dst, uint32_t w) {
+  uint8_t *p = (uint8_t *)dst;
+  p[0] = (uint8_t)(w >> 0);
+  p[1] = (uint8_t)(w >> 8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+}
+
+INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
+  return (w >> c) | (w << (32 - c));
+}
+
+INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+              uint32_t x, uint32_t y) {
+  state[a] = state[a] + state[b] + x;
+  state[d] = rotr32(state[d] ^ state[a], 16);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 12);
+  state[a] = state[a] + state[b] + y;
+  state[d] = rotr32(state[d] ^ state[a], 8);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
+  // Select the message schedule based on the round.
+  const uint8_t *schedule = MSG_SCHEDULE[round];
+
+  // Mix the columns.
+  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+  // Mix the rows.
+  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  uint32_t block_words[16];
+  block_words[0] = load32(block + 4 * 0);
+  block_words[1] = load32(block + 4 * 1);
+  block_words[2] = load32(block + 4 * 2);
+  block_words[3] = load32(block + 4 * 3);
+  block_words[4] = load32(block + 4 * 4);
+  block_words[5] = load32(block + 4 * 5);
+  block_words[6] = load32(block + 4 * 6);
+  block_words[7] = load32(block + 4 * 7);
+  block_words[8] = load32(block + 4 * 8);
+  block_words[9] = load32(block + 4 * 9);
+  block_words[10] = load32(block + 4 * 10);
+  block_words[11] = load32(block + 4 * 11);
+  block_words[12] = load32(block + 4 * 12);
+  block_words[13] = load32(block + 4 * 13);
+  block_words[14] = load32(block + 4 * 14);
+  block_words[15] = load32(block + 4 * 15);
+
+  state[0] = cv[0];
+  state[1] = cv[1];
+  state[2] = cv[2];
+  state[3] = cv[3];
+  state[4] = cv[4];
+  state[5] = cv[5];
+  state[6] = cv[6];
+  state[7] = cv[7];
+  state[8] = IV[0];
+  state[9] = IV[1];
+  state[10] = IV[2];
+  state[11] = IV[3];
+  state[12] = counter_low(counter);
+  state[13] = counter_high(counter);
+  state[14] = (uint32_t)block_len;
+  state[15] = (uint32_t)flags;
+
+  round_fn(state, &block_words[0], 0);
+  round_fn(state, &block_words[0], 1);
+  round_fn(state, &block_words[0], 2);
+  round_fn(state, &block_words[0], 3);
+  round_fn(state, &block_words[0], 4);
+  round_fn(state, &block_words[0], 5);
+  round_fn(state, &block_words[0], 6);
+}
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  cv[0] = state[0] ^ state[8];
+  cv[1] = state[1] ^ state[9];
+  cv[2] = state[2] ^ state[10];
+  cv[3] = state[3] ^ state[11];
+  cv[4] = state[4] ^ state[12];
+  cv[5] = state[5] ^ state[13];
+  cv[6] = state[6] ^ state[14];
+  cv[7] = state[7] ^ state[15];
+}
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  store32(&out[0 * 4], state[0] ^ state[8]);
+  store32(&out[1 * 4], state[1] ^ state[9]);
+  store32(&out[2 * 4], state[2] ^ state[10]);
+  store32(&out[3 * 4], state[3] ^ state[11]);
+  store32(&out[4 * 4], state[4] ^ state[12]);
+  store32(&out[5 * 4], state[5] ^ state[13]);
+  store32(&out[6 * 4], state[6] ^ state[14]);
+  store32(&out[7 * 4], state[7] ^ state[15]);
+  store32(&out[8 * 4], state[8] ^ cv[0]);
+  store32(&out[9 * 4], state[9] ^ cv[1]);
+  store32(&out[10 * 4], state[10] ^ cv[2]);
+  store32(&out[11 * 4], state[11] ^ cv[3]);
+  store32(&out[12 * 4], state[12] ^ cv[4]);
+  store32(&out[13 * 4], state[13] ^ cv[5]);
+  store32(&out[14 * 4], state[14] ^ cv[6]);
+  store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              uint8_t flags, uint8_t flags_start,
+                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, 32);
+}
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out) {
+  while (num_inputs > 0) {
+    hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
+                      flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/c/blake3_sse41.c b/c/blake3_sse41.c
new file mode 100644
index 0000000..3bf281f
--- /dev/null
+++ b/c/blake3_sse41.c
@@ -0,0 +1,554 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const uint8_t src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, uint8_t dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot16(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot12(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot8(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot7(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
+
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  rows[0] = loadu((uint8_t *)&cv[0]);
+  rows[1] = loadu((uint8_t *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
+
+  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
+  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
+}
+
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), &out[0]);
+  storeu(xorv(rows[1], rows[3]), &out[16]);
+  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
+  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
+}
+
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
+
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[4]);
+  transpose_vecs(&out[8]);
+  transpose_vecs(&out[12]);
+}
+
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m128i *out_low, __m128i *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+}
+
+void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
+                        const uint32_t key[8], uint64_t counter,
+                        bool increment_counter, uint8_t flags,
+                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  __m128i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
+                           const uint32_t key[8], uint64_t counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                   block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
+                       flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
+                   flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/c/main.c b/c/main.c
new file mode 100644
index 0000000..6abd4f3
--- /dev/null
+++ b/c/main.c
@@ -0,0 +1,147 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "blake3.h"
+#include "blake3_impl.h"
+
+#define HASH_MODE 0
+#define KEYED_HASH_MODE 1
+#define DERIVE_KEY_MODE 2
+
+void hex_char_value(uint8_t c, uint8_t *value, bool *valid) {
+  if ('0' <= c && c <= '9') {
+    *value = c - '0';
+    *valid = true;
+  } else if ('a' <= c && c <= 'f') {
+    *value = 10 + c - 'a';
+    *valid = true;
+  } else {
+    *valid = false;
+  }
+}
+
+int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) {
+  size_t hex_len = strlen(hex_key);
+  if (hex_len != 64) {
+    fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n",
+            hex_len);
+    return 1;
+  }
+  for (size_t i = 0; i < 64; i++) {
+    uint8_t value;
+    bool valid;
+    hex_char_value(hex_key[i], &value, &valid);
+    if (!valid) {
+      fprintf(stderr, "Invalid hex char.\n");
+      return 1;
+    }
+    if (i % 2 == 0) {
+      out[i / 2] = 0;
+      value <<= 4;
+    }
+    out[i / 2] += value;
+  }
+  return 0;
+}
+
+/* A little repetition here */
+enum cpu_feature {
+    SSE2     = 1 << 0,
+    SSSE3    = 1 << 1,
+    SSE41    = 1 << 2,
+    AVX      = 1 << 3,
+    AVX2     = 1 << 4,
+    AVX512F  = 1 << 5,
+    AVX512VL = 1 << 6,
+    /* ... */
+    UNDEFINED = 1 << 30
+};
+
+extern enum cpu_feature g_cpu_features;
+enum cpu_feature get_cpu_features();
+
+int main(int argc, char **argv) {
+  size_t out_len = BLAKE3_OUT_LEN;
+  uint8_t key[BLAKE3_KEY_LEN];
+  char *context = "";
+  uint8_t mode = HASH_MODE;
+  while (argc > 1) {
+    if (argc <= 2) {
+      fprintf(stderr, "Odd number of arguments.\n");
+      return 1;
+    }
+    if (strcmp("--length", argv[1]) == 0) {
+      char *endptr = NULL;
+      unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10);
+      // TODO: There are so many possible error conditions for parsing a
+      //       non-negative size_t...I probably missed something.
+      if (errno != 0 || out_len > SIZE_MAX || endptr == argv[2] || *endptr != 0) {
+        fprintf(stderr, "Bad length argument.\n");
+        return 1;
+      }
+      // TODO: A more sanitary cast?
+      out_len = (size_t)out_len_ll;
+    } else if (strcmp("--keyed", argv[1]) == 0) {
+      mode = KEYED_HASH_MODE;
+      int ret = parse_key(argv[2], key);
+      if (ret != 0) {
+        return ret;
+      }
+    } else if (strcmp("--derive-key", argv[1]) == 0) {
+      mode = DERIVE_KEY_MODE;
+      context = argv[2];
+    } else {
+      fprintf(stderr, "Unknown flag.\n");
+      return 1;
+    }
+    argc -= 2;
+    argv += 2;
+  }
+
+  uint8_t buf[65536] = {0};
+  size_t n = fread(buf, 1, sizeof(buf), stdin);
+
+  const int mask = get_cpu_features();
+  int feature = 0;
+  do {
+    fprintf(stderr, "Testing 0x%08X\n", feature);
+    g_cpu_features = feature;
+    blake3_hasher hasher;
+    switch(mode) {
+    case HASH_MODE:
+      blake3_hasher_init(&hasher);
+      break;
+    case KEYED_HASH_MODE:
+      blake3_hasher_init_keyed(&hasher, key);
+      break;
+    case DERIVE_KEY_MODE:
+      blake3_hasher_init_derive_key(&hasher, context);
+      break;
+    default:
+      abort();
+    }
+
+    blake3_hasher_update(&hasher, buf, n);
+
+    // TODO: An incremental output reader API to avoid this allocation.
+    uint8_t *out = malloc(out_len);
+    memset(out, 0, out_len);
+    if (out_len > 0 && out == NULL) {
+      fprintf(stderr, "malloc() failed.\n");
+      return 1;
+    }
+    blake3_hasher_finalize(&hasher, out, out_len);
+    for (size_t i = 0; i < out_len; i++) {
+      printf("%02x", out[i]);
+    }
+    printf("\n");
+    free(out);
+    feature = (feature - mask) & mask;
+  } while(feature != 0);
+  return 0;
+}
diff --git a/c/test.py b/c/test.py
new file mode 100755
index 0000000..f046f37
--- /dev/null
+++ b/c/test.py
@@ -0,0 +1,96 @@
+#! /usr/bin/env python3
+
+from binascii import hexlify
+import json
+from os import path
+import subprocess
+
+PROJECT_DIR = path.dirname(__file__)
+TEST_VECTORS = json.load(open(path.join(PROJECT_DIR, "test_vectors.json")))
+TEST_CONTEXT = "BLAKE3 2019-12-27 16:29:52 test vectors context"
+
+
+def run_blake3(args, input):
+    output = subprocess.run([path.join(PROJECT_DIR, "blake3")] + args,
+                            input=input,
+                            stdout=subprocess.PIPE,
+                            check=True)
+    return output.stdout.decode().strip()
+
+
+# Fill the input with a repeating byte pattern. We use a cycle length of 251,
+# because that's the largets prime number less than 256. This makes it unlikely
+# to swapping any two adjacent input blocks or chunks will give the same
+# answer.
+def make_test_input(length):
+    i = 0
+    buf = bytearray()
+    while len(buf) < length:
+        buf.append(i)
+        i = (i + 1) % 251
+    return buf
+
+
+def main():
+    for case in TEST_VECTORS["cases"]:
+        input_len = case["input_len"]
+        input = make_test_input(input_len)
+        hex_key = hexlify(TEST_VECTORS["key"].encode())
+        expected_hash_xof = case["hash"]
+        expected_hash = expected_hash_xof[:64]
+        expected_keyed_hash_xof = case["keyed_hash"]
+        expected_keyed_hash = expected_keyed_hash_xof[:64]
+        expected_derive_key_xof = case["derive_key"]
+        expected_derive_key = expected_derive_key_xof[:64]
+
+        # Test the default hash.
+        test_hash = run_blake3([], input)
+        for line in test_hash.splitlines():
+            assert expected_hash == line, \
+                "hash({}): {} != {}".format(input_len, expected_hash, line)
+
+        # Test the extended hash.
+        xof_len = len(expected_hash_xof) // 2
+        test_hash_xof = run_blake3(["--length", str(xof_len)], input)
+        for line in test_hash_xof.splitlines():
+            assert expected_hash_xof == line, \
+                "hash_xof({}): {} != {}".format(
+                    input_len, expected_hash_xof, line)
+
+        # Test the default keyed hash.
+        test_keyed_hash = run_blake3(["--keyed", hex_key], input)
+        for line in test_keyed_hash.splitlines():
+            assert expected_keyed_hash == line, \
+                "keyed_hash({}): {} != {}".format(
+                    input_len, expected_keyed_hash, line)
+
+        # Test the extended keyed hash.
+        xof_len = len(expected_keyed_hash_xof) // 2
+        test_keyed_hash_xof = run_blake3(
+            ["--keyed", hex_key, "--length",
+             str(xof_len)], input)
+        for line in test_keyed_hash_xof.splitlines():
+            assert expected_keyed_hash_xof == line, \
+                "keyed_hash_xof({}): {} != {}".format(
+                    input_len, expected_keyed_hash_xof, line)
+
+        # Test the default derive key.
+        test_derive_key = run_blake3(["--derive-key", TEST_CONTEXT], input)
+        for line in test_derive_key.splitlines():
+            assert expected_derive_key == line, \
+                "derive_key({}): {} != {}".format(
+                    input_len, expected_derive_key, line)
+
+        # Test the extended derive key.
+        xof_len = len(expected_derive_key_xof) // 2
+        test_derive_key_xof = run_blake3(
+            ["--derive-key", TEST_CONTEXT, "--length",
+             str(xof_len)], input)
+        for line in test_derive_key_xof.splitlines():
+            assert expected_derive_key_xof == line, \
+                "derive_key_xof({}): {} != {}".format(
+                    input_len, expected_derive_key_xof, line)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/c/test_vectors.json b/c/test_vectors.json
new file mode 100644
index 0000000..a628815
--- /dev/null
+++ b/c/test_vectors.json
@@ -0,0 +1,132 @@
+{
+  "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a 251-byte-long repeating pattern: 0, 1, 2, ..., 249, 250, 0, 1, ... The key used with keyed_hash is the 32-byte ASCII string given in the 'key' field below. For derive_key, the test input is used as the input key, and the context string is 'BLAKE3 2019-12-27 16:29:52 example context'. (As good practice for following the security requirements of derive_key, test runners should make that context string a hardcoded constant, and we do not provided it in machine-readable form.) Outputs are encoded as hexadecimal. Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.",
+  "key": "whats the Elvish word for friend",
+  "cases": [
+    {
+      "input_len": 0,
+      "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d",
+      "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f",
+      "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0"
+    },
+    {
+      "input_len": 1,
+      "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5",
+      "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11",
+      "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551"
+    },
+    {
+      "input_len": 1023,
+      "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485",
+      "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10",
+      "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d"
+    },
+    {
+      "input_len": 1024,
+      "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e",
+      "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de",
+      "derive_key": "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad"
+    },
+    {
+      "input_len": 1025,
+      "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a",
+      "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930",
+      "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad"
+    },
+    {
+      "input_len": 2048,
+      "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9",
+      "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe",
+      "derive_key": "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583"
+    },
+    {
+      "input_len": 2049,
+      "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3",
+      "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e",
+      "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6"
+    },
+    {
+      "input_len": 3072,
+      "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11",
+      "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b",
+      "derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0"
+    },
+    {
+      "input_len": 3073,
+      "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf",
+      "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5",
+      "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5"
+    },
+    {
+      "input_len": 4096,
+      "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620",
+      "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de",
+      "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245"
+    },
+    {
+      "input_len": 4097,
+      "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956",
+      "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f",
+      "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad"
+    },
+    {
+      "input_len": 5120,
+      "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059",
+      "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e",
+      "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d"
+    },
+    {
+      "input_len": 5121,
+      "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95",
+      "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d",
+      "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165"
+    },
+    {
+      "input_len": 6144,
+      "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83",
+      "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e",
+      "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef"
+    },
+    {
+      "input_len": 6145,
+      "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022",
+      "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c",
+      "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2"
+    },
+    {
+      "input_len": 7168,
+      "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95",
+      "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52",
+      "derive_key": "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88"
+    },
+    {
+      "input_len": 7169,
+      "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8",
+      "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54",
+      "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd"
+    },
+    {
+      "input_len": 8192,
+      "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf",
+      "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102",
+      "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7"
+    },
+    {
+      "input_len": 8193,
+      "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6",
+      "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57",
+      "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0"
+    },
+    {
+      "input_len": 16384,
+      "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893",
+      "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65",
+      "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57"
+    },
+    {
+      "input_len": 31744,
+      "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f",
+      "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",
+      "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b"
+    }
+  ]
+}