Mirror of https://github.com/BLAKE3-team/BLAKE3

WIP: use bytes instead of words in more places

Jack O'Connor 2023-07-09 12:54:00 -07:00
parent 418f8f18e7
commit 5ee3d75afa
3 changed files with 317 additions and 246 deletions

File 1 of 3

@ -19,9 +19,10 @@ pub const KEYED_HASH: u32 = 1 << 4;
pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5;
pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6;
pub const IV: [u32; 8] = [
pub const IV: CVWords = [
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV);
pub const MSG_SCHEDULE: [[usize; 16]; 7] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -44,6 +45,11 @@ cfg_if::cfg_if! {
}
}
pub type CVBytes = [u8; 32];
pub type CVWords = [u32; 8];
pub type BlockBytes = [u8; 64];
pub type BlockWords = [u32; 16];
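// Editorial sketch (not part of this commit): with the new aliases, a call
// site works entirely in little-endian bytes, with no word conversions.
// Assuming this crate is in scope as blake3_guts:
let block: blake3_guts::BlockBytes = [0x42; 64];
let cv: blake3_guts::CVBytes = blake3_guts::IV_BYTES;
let next_cv: blake3_guts::CVBytes = blake3_guts::compress(&block, 64, &cv, 0, 0);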
#[inline]
pub fn degree() -> usize {
DETECTED_IMPL.degree()
@ -58,30 +64,19 @@ pub fn split_transposed_vectors(
#[inline]
pub fn compress(
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
) -> [u32; 8] {
) -> CVBytes {
DETECTED_IMPL.compress(block, block_len, cv, counter, flags)
}
#[inline]
pub fn compress_xof(
block: &[u8; 64],
block_len: u32,
cv: &[u32; 8],
counter: u64,
flags: u32,
) -> [u8; 64] {
DETECTED_IMPL.compress_xof(block, block_len, cv, counter, flags)
}
#[inline]
pub fn hash_chunks(
input: &[u8],
key: &[u32; 8],
key: &CVBytes,
counter: u64,
flags: u32,
transposed_output: TransposedSplit,
@ -93,7 +88,7 @@ pub fn hash_chunks(
pub fn hash_parents(
transposed_input: &TransposedVectors,
num_cvs: usize,
key: &[u32; 8],
key: &CVBytes,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
@ -104,7 +99,7 @@ pub fn hash_parents(
pub fn reduce_parents(
transposed_in_out: &mut TransposedVectors,
num_cvs: usize,
key: &[u32; 8],
key: &CVBytes,
flags: u32,
) -> usize {
DETECTED_IMPL.reduce_parents(transposed_in_out, num_cvs, key, flags)
@ -112,9 +107,9 @@ pub fn reduce_parents(
#[inline]
pub fn xof(
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
out: &mut [u8],
@ -124,9 +119,9 @@ pub fn xof(
#[inline]
pub fn xof_xor(
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
out: &mut [u8],
@ -135,7 +130,7 @@ pub fn xof_xor(
}
#[inline]
pub fn universal_hash(input: &[u8], key: &[u32; 8], counter: u64) -> [u8; 16] {
pub fn universal_hash(input: &[u8], key: &CVBytes, counter: u64) -> [u8; 16] {
DETECTED_IMPL.universal_hash(input, key, counter)
}
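// Editorial sketch (not part of this commit): universal_hash produces a
// 16-byte tag from the input, a 32-byte key, and a starting counter. The key
// value here is an assumed placeholder.
let key: blake3_guts::CVBytes = [0x07; 32];
let tag: [u8; 16] = blake3_guts::universal_hash(b"one block of input", &key, 0);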
@ -246,38 +241,17 @@ impl Implementation {
#[inline]
pub fn compress(
&self,
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
) -> [u32; 8] {
let mut out = [0u32; 16];
) -> CVBytes {
let mut out = [0u8; 32];
unsafe {
self.compress_fn()(block, block_len, cv, counter, flags, &mut out);
}
out[..8].try_into().unwrap()
}
#[inline]
pub fn compress_xof(
&self,
block: &[u8; 64],
block_len: u32,
cv: &[u32; 8],
counter: u64,
flags: u32,
) -> [u8; 64] {
let mut out_words = [0u32; 16];
unsafe {
self.compress_fn()(block, block_len, cv, counter, flags, &mut out_words);
}
let mut out_bytes = [0u8; 64];
for word_index in 0..16 {
out_bytes[word_index * WORD_LEN..][..WORD_LEN]
.copy_from_slice(&out_words[word_index].to_le_bytes());
}
out_bytes
out
}
#[inline]
@ -289,7 +263,7 @@ impl Implementation {
pub fn hash_chunks(
&self,
input: &[u8],
key: &[u32; 8],
key: &CVBytes,
counter: u64,
flags: u32,
transposed_output: TransposedSplit,
@ -327,7 +301,7 @@ impl Implementation {
&self,
transposed_input: &TransposedVectors,
num_cvs: usize,
key: &[u32; 8],
key: &CVBytes,
flags: u32,
transposed_output: TransposedSplit,
) -> usize {
@ -359,7 +333,7 @@ impl Implementation {
&self,
transposed_in_out: &mut TransposedVectors,
num_cvs: usize,
key: &[u32; 8],
key: &CVBytes,
flags: u32,
) -> usize {
let num_parents = num_cvs / 2;
@ -385,9 +359,9 @@ impl Implementation {
#[inline]
pub fn xof(
&self,
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
out: &mut [u8],
@ -413,9 +387,9 @@ impl Implementation {
#[inline]
pub fn xof_xor(
&self,
block: &[u8; 64],
block: &BlockBytes,
block_len: u32,
cv: &[u32; 8],
cv: &CVBytes,
counter: u64,
flags: u32,
out: &mut [u8],
@ -439,7 +413,7 @@ impl Implementation {
}
#[inline]
pub fn universal_hash(&self, input: &[u8], key: &[u32; 8], counter: u64) -> [u8; 16] {
pub fn universal_hash(&self, input: &[u8], key: &CVBytes, counter: u64) -> [u8; 16] {
let mut out = [0u8; 16];
unsafe {
self.universal_hash_fn()(input.as_ptr(), input.len(), key, counter, &mut out);
@ -471,30 +445,39 @@ fn degree_init() -> usize {
}
type CompressFn = unsafe extern "C" fn(
block: *const [u8; 64], // zero padded to 64 bytes
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut [u32; 16], // may overlap the input
out: *mut CVBytes, // may overlap the input
);
unsafe extern "C" fn compress_init(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut [u32; 16],
out: *mut CVBytes,
) {
init_detected_impl();
DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out);
}
type CompressXofFn = unsafe extern "C" fn(
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes, // may overlap the input
);
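// Editorial note: CompressXofFn mirrors CompressFn but writes all 64 output
// bytes instead of a 32-byte CV. The *_init stubs in this file implement lazy
// dispatch: the function-pointer table initially points at the stubs, and on
// first use each stub calls init_detected_impl() and then forwards the same
// arguments through the freshly detected pointer.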
type HashChunksFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
@ -503,7 +486,7 @@ type HashChunksFn = unsafe extern "C" fn(
unsafe extern "C" fn hash_chunks_init(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
@ -515,7 +498,7 @@ unsafe extern "C" fn hash_chunks_init(
type HashParentsFn = unsafe extern "C" fn(
transposed_input: *const u32,
num_parents: usize,
key: *const [u32; 8],
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
);
@ -523,7 +506,7 @@ type HashParentsFn = unsafe extern "C" fn(
unsafe extern "C" fn hash_parents_init(
transposed_input: *const u32,
num_parents: usize,
key: *const [u32; 8],
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32,
) {
@ -533,9 +516,9 @@ unsafe extern "C" fn hash_parents_init(
// This signature covers both xof() and xof_xor().
type XofFn = unsafe extern "C" fn(
block: *const [u8; 64], // zero padded to 64 bytes
block: *const BlockBytes, // zero padded to 64 bytes
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
@ -543,9 +526,9 @@ type XofFn = unsafe extern "C" fn(
);
unsafe extern "C" fn xof_init(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
@ -556,9 +539,9 @@ unsafe extern "C" fn xof_init(
}
unsafe extern "C" fn xof_xor_init(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
@ -571,7 +554,7 @@ unsafe extern "C" fn xof_xor_init(
type UniversalHashFn = unsafe extern "C" fn(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
);
@ -579,7 +562,7 @@ type UniversalHashFn = unsafe extern "C" fn(
unsafe extern "C" fn universal_hash_init(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {
@ -588,11 +571,12 @@ unsafe extern "C" fn universal_hash_init(
}
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_chunks_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
mut counter: u64,
flags: u32,
mut transposed_output: *mut u32,
@ -603,15 +587,14 @@ unsafe fn hash_chunks_using_compress(
let mut chunk_len = cmp::min(input_len, CHUNK_LEN);
input_len -= chunk_len;
// We only use 8 words of the CV, but compress returns 16.
let mut cv = [0u32; 16];
cv[..8].copy_from_slice(&*key);
let cv_ptr: *mut [u32; 16] = &mut cv;
let mut cv = *key;
let cv_ptr: *mut CVBytes = &mut cv;
let mut chunk_flags = flags | CHUNK_START;
while chunk_len > BLOCK_LEN {
compress(
input as *const [u8; 64],
input as *const BlockBytes,
BLOCK_LEN as u32,
cv_ptr as *const [u32; 8],
cv_ptr,
counter,
chunk_flags,
cv_ptr,
@ -626,15 +609,16 @@ unsafe fn hash_chunks_using_compress(
compress(
&last_block,
chunk_len as u32,
cv_ptr as *const [u32; 8],
cv_ptr,
counter,
chunk_flags | CHUNK_END,
cv_ptr,
);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv[word_index]);
.write(cv_words[word_index]);
}
transposed_output = transposed_output.add(1);
counter += 1;
@ -642,11 +626,12 @@ unsafe fn hash_chunks_using_compress(
}
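// Editorial note: the output above is transposed. Word w of chunk c's CV is
// written at transposed_output.add(w * TRANSPOSED_STRIDE + c), so each of the
// eight CV words forms a contiguous row across chunks, which is the layout
// the SIMD back ends consume.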
// The implicit degree of this implementation is MAX_SIMD_DEGREE.
#[inline(always)]
unsafe fn hash_parents_using_compress(
compress: CompressFn,
mut transposed_input: *const u32,
mut num_parents: usize,
key: *const [u32; 8],
key: *const CVBytes,
flags: u32,
mut transposed_output: *mut u32, // may overlap the input
) {
@ -664,12 +649,13 @@ unsafe fn hash_parents_using_compress(
block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN]
.copy_from_slice(&right_child_word.to_le_bytes());
}
let mut cv = [0u32; 16];
let mut cv = [0u8; 32];
compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv);
let cv_words = words_from_le_bytes_32(&cv);
for word_index in 0..8 {
transposed_output
.add(word_index * TRANSPOSED_STRIDE)
.write(cv[word_index]);
.write(cv_words[word_index]);
}
transposed_input = transposed_input.add(2);
transposed_output = transposed_output.add(1);
@ -677,70 +663,68 @@ unsafe fn hash_parents_using_compress(
}
}
unsafe fn xof_using_compress(
compress: CompressFn,
block: *const [u8; 64],
#[inline(always)]
unsafe fn xof_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
while out_len > 0 {
let mut block_output = [0u32; 16];
compress(block, block_len, cv, counter, flags, &mut block_output);
for output_word in block_output {
let bytes = output_word.to_le_bytes();
let take = cmp::min(bytes.len(), out_len);
ptr::copy_nonoverlapping(bytes.as_ptr(), out, take);
out = out.add(take);
out_len -= take;
}
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
ptr::copy_nonoverlapping(block_output.as_ptr(), out, take);
out = out.add(take);
out_len -= take;
counter += 1;
}
}
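// Editorial note: each counter value yields exactly one 64-byte block, so
// byte i of the XOF stream is byte i % 64 of the block computed with counter
// `counter + i / 64`; a short final read just takes a prefix of its block.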
unsafe fn xof_xor_using_compress(
compress: CompressFn,
block: *const [u8; 64],
#[inline(always)]
unsafe fn xof_xor_using_compress_xof(
compress_xof: CompressXofFn,
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
mut counter: u64,
flags: u32,
mut out: *mut u8,
mut out_len: usize,
) {
while out_len > 0 {
let mut block_output = [0u32; 16];
compress(block, block_len, cv, counter, flags, &mut block_output);
for output_word in block_output {
let bytes = output_word.to_le_bytes();
for i in 0..cmp::min(bytes.len(), out_len) {
*out = *out ^ bytes[i];
out = out.add(1);
out_len -= 1;
}
let mut block_output = [0u8; 64];
compress_xof(block, block_len, cv, counter, flags, &mut block_output);
let take = cmp::min(out_len, BLOCK_LEN);
for i in 0..take {
*out.add(i) ^= block_output[i];
}
out = out.add(take);
out_len -= take;
counter += 1;
}
}
#[inline(always)]
unsafe fn universal_hash_using_compress(
compress: CompressFn,
mut input: *const u8,
mut input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
mut counter: u64,
out: *mut [u8; 16],
) {
let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT;
let mut result = [0u32; 4];
let mut result = [0u8; 16];
while input_len > 0 {
let block_len = cmp::min(input_len, BLOCK_LEN);
let mut block = [0u8; BLOCK_LEN];
ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len);
let mut block_output = [0u32; 16];
let mut block_output = [0u8; 32];
compress(
&block,
BLOCK_LEN as u32,
@ -749,7 +733,7 @@ unsafe fn universal_hash_using_compress(
flags,
&mut block_output,
);
for i in 0..4 {
for i in 0..16 {
result[i] ^= block_output[i];
}
input = input.add(block_len);
@ -769,7 +753,7 @@ const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE;
pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]);
impl TransposedVectors {
pub fn parent_node(&self, parent_index: usize) -> [u8; 64] {
pub fn parent_node(&self, parent_index: usize) -> BlockBytes {
let mut bytes = [0u8; 64];
for word_index in 0..8 {
bytes[word_index * WORD_LEN..][..WORD_LEN]
@ -820,3 +804,84 @@ unsafe fn copy_one_transposed_cv(transposed_src: *const u32, transposed_dest: *m
transposed_dest.add(offset_words).write(word);
}
}
#[inline(always)]
pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
let mut bytes = [0u8; 32];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
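// Editorial note: the manual `while` loops in these helpers exist because
// `for` loops aren't allowed in const fns on stable Rust (they desugar to
// Iterator trait calls, and trait methods can't yet be called in const
// contexts).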
#[inline(always)]
pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes {
let mut bytes = [0u8; 64];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < bytes.len() / WORD_LEN {
let word_bytes = words[word_index].to_le_bytes();
let mut byte_index = 0;
while byte_index < WORD_LEN {
bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index];
byte_index += 1;
}
word_index += 1;
}
bytes
}
#[inline(always)]
pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
let mut words = [0u32; 8];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[inline(always)]
pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords {
let mut words = [0u32; 16];
// This loop is super verbose because currently that's what it takes to be const.
let mut word_index = 0;
while word_index < words.len() {
let mut word_bytes = [0u8; WORD_LEN];
let mut byte_index = 0;
while byte_index < WORD_LEN {
word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index];
byte_index += 1;
}
words[word_index] = u32::from_le_bytes(word_bytes);
word_index += 1;
}
words
}
#[test]
fn test_byte_word_round_trips() {
let cv = *b"This is 32 LE bytes/eight words.";
assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv)));
let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words.";
assert_eq!(
block,
le_bytes_from_words_64(&words_from_le_bytes_64(&block)),
);
}
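// Editorial sketch (not part of this commit): the same round-trip property
// ties the two IV constants together, since IV_BYTES is defined above as
// le_bytes_from_words_32(&IV):
assert_eq!(IV_BYTES, le_bytes_from_words_32(&IV));
assert_eq!(IV, words_from_le_bytes_32(&IV_BYTES));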

File 2 of 3

@ -1,9 +1,12 @@
use crate::{IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, WORD_LEN};
use crate::{
le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64,
BlockBytes, BlockWords, CVBytes, CVWords, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE,
};
pub const DEGREE: usize = MAX_SIMD_DEGREE;
#[inline(always)]
fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
state[d] = (state[d] ^ state[a]).rotate_right(16);
state[c] = state[c].wrapping_add(state[d]);
@ -15,7 +18,7 @@ fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u
}
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) {
// Select the message schedule based on the round.
let schedule = MSG_SCHEDULE[round];
@ -33,30 +36,22 @@ fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
}
#[inline(always)]
fn compress_safe(
block: &[u8; 64],
fn compress_inner(
block_words: &BlockWords,
block_len: u32,
cv: &[u32; 8],
cv_words: &CVWords,
counter: u64,
flags: u32,
) -> [u32; 16] {
let mut block_words = [0u32; 16];
for word_index in 0..16 {
block_words[word_index] = u32::from_le_bytes(
block[WORD_LEN * word_index..][..WORD_LEN]
.try_into()
.unwrap(),
);
}
let mut state = [
cv[0],
cv[1],
cv[2],
cv[3],
cv[4],
cv[5],
cv[6],
cv[7],
cv_words[0],
cv_words[1],
cv_words[2],
cv_words[3],
cv_words[4],
cv_words[5],
cv_words[6],
cv_words[7],
IV[0],
IV[1],
IV[2],
@ -69,28 +64,48 @@ fn compress_safe(
for round_number in 0..7 {
round(&mut state, &block_words, round_number);
}
for i in 0..8 {
state[i] ^= state[i + 8];
state[i + 8] ^= (*cv)[i];
}
state
}
pub unsafe extern "C" fn compress(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut [u32; 16],
out: *mut CVBytes,
) {
*out = compress_safe(&*block, block_len, &*cv, counter, flags);
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
}
*out = le_bytes_from_words_32(state[..8].try_into().unwrap());
}
pub unsafe extern "C" fn compress_xof(
block: *const BlockBytes,
block_len: u32,
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut BlockBytes,
) {
let block_words = words_from_le_bytes_64(&*block);
let cv_words = words_from_le_bytes_32(&*cv);
let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags);
for word_index in 0..8 {
state[word_index] ^= state[word_index + 8];
state[word_index + 8] ^= cv_words[word_index];
}
*out = le_bytes_from_words_64(&state);
}
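// Editorial sketch (not part of this commit): compress() and compress_xof()
// apply the same feed-forward to the lower half of the state, so the first 32
// bytes of an XOF block always equal the compressed CV for the same inputs:
let block = [0u8; 64];
let cv = crate::IV_BYTES;
let (mut cv_out, mut xof_out) = ([0u8; 32], [0u8; 64]);
unsafe {
    compress(&block, 64, &cv, 0, 0, &mut cv_out);
    compress_xof(&block, 64, &cv, 0, 0, &mut xof_out);
}
assert_eq!(&cv_out[..], &xof_out[..32]);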
pub unsafe extern "C" fn hash_chunks(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
flags: u32,
transposed_output: *mut u32,
@ -109,7 +124,7 @@ pub unsafe extern "C" fn hash_chunks(
pub unsafe extern "C" fn hash_parents(
transposed_input: *const u32,
num_parents: usize,
key: *const [u32; 8],
key: *const CVBytes,
flags: u32,
transposed_output: *mut u32, // may overlap the input
) {
@ -124,33 +139,51 @@ pub unsafe extern "C" fn hash_parents(
}
pub unsafe extern "C" fn xof(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_using_compress(compress, block, block_len, cv, counter, flags, out, out_len)
crate::xof_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub unsafe extern "C" fn xof_xor(
block: *const [u8; 64],
block: *const BlockBytes,
block_len: u32,
cv: *const [u32; 8],
cv: *const CVBytes,
counter: u64,
flags: u32,
out: *mut u8,
out_len: usize,
) {
crate::xof_xor_using_compress(compress, block, block_len, cv, counter, flags, out, out_len)
crate::xof_xor_using_compress_xof(
compress_xof,
block,
block_len,
cv,
counter,
flags,
out,
out_len,
)
}
pub unsafe extern "C" fn universal_hash(
input: *const u8,
input_len: usize,
key: *const [u32; 8],
key: *const CVBytes,
counter: u64,
out: *mut [u8; 16],
) {

File 3 of 3

@ -84,8 +84,8 @@ use core::fmt;
use blake3_guts as guts;
use guts::{
BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT, DERIVE_KEY_MATERIAL, IV,
KEYED_HASH, PARENT, ROOT, WORD_LEN,
BlockBytes, CVBytes, BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT,
DERIVE_KEY_MATERIAL, IV_BYTES, KEYED_HASH, PARENT, ROOT,
};
/// The number of bytes in a [`Hash`](struct.Hash.html), 32
@ -122,19 +122,19 @@ const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
/// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[derive(Clone, Copy, Hash)]
pub struct Hash([u8; OUT_LEN]);
pub struct Hash(CVBytes);
impl Hash {
/// The raw bytes of the `Hash`. Note that byte arrays don't provide
/// constant-time equality checking, so if you need to compare hashes,
/// prefer the `Hash` type.
#[inline]
pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
pub const fn as_bytes(&self) -> &CVBytes {
&self.0
}
/// Create a `Hash` from its raw bytes representation.
pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
pub const fn from_bytes(bytes: CVBytes) -> Self {
Self(bytes)
}
@ -178,7 +178,7 @@ impl Hash {
if hex_bytes.len() != OUT_LEN * 2 {
return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
}
let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN];
let mut hash_bytes: CVBytes = [0; OUT_LEN];
for i in 0..OUT_LEN {
hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?;
}
@ -186,14 +186,14 @@ impl Hash {
}
}
impl From<[u8; OUT_LEN]> for Hash {
impl From<CVBytes> for Hash {
#[inline]
fn from(bytes: [u8; OUT_LEN]) -> Self {
fn from(bytes: CVBytes) -> Self {
Self::from_bytes(bytes)
}
}
impl From<Hash> for [u8; OUT_LEN] {
impl From<Hash> for CVBytes {
#[inline]
fn from(hash: Hash) -> Self {
hash.0
@ -217,9 +217,9 @@ impl PartialEq for Hash {
}
/// This implementation is constant-time.
impl PartialEq<[u8; OUT_LEN]> for Hash {
impl PartialEq<CVBytes> for Hash {
#[inline]
fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
fn eq(&self, other: &CVBytes) -> bool {
constant_time_eq::constant_time_eq_32(&self.0, other)
}
}
@ -298,66 +298,49 @@ impl std::error::Error for HexError {}
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[derive(Clone)]
struct Output {
input_chaining_value: [u32; 8],
block: [u8; 64],
input_chaining_value: CVBytes,
block: BlockBytes,
block_len: u8,
counter: u64,
flags: u8,
}
impl Output {
fn chaining_value(&self) -> [u8; 32] {
let words = guts::compress(
fn chaining_value(&self) -> CVBytes {
guts::compress(
&self.block,
self.block_len as u32,
&self.input_chaining_value,
self.counter,
self.flags as u32,
);
let mut bytes = [0u8; 32];
for word_index in 0..8 {
bytes[word_index * WORD_LEN..][..WORD_LEN]
.copy_from_slice(&words[word_index].to_le_bytes());
}
bytes
)
}
fn root_hash(&self) -> Hash {
debug_assert_eq!(self.counter, 0);
let out_bytes = guts::compress_xof(
Hash(guts::compress(
&self.block,
self.block_len as u32,
&self.input_chaining_value,
0,
self.flags as u32 | ROOT,
);
Hash(out_bytes[..OUT_LEN].try_into().unwrap())
}
fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
guts::compress_xof(
&self.block,
self.block_len as u32,
&self.input_chaining_value,
self.counter,
self.flags as u32 | ROOT,
)
))
}
}
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
struct ChunkState {
cv: [u32; 8],
cv: CVBytes,
chunk_counter: u64,
buf: [u8; BLOCK_LEN],
buf: BlockBytes,
buf_len: u8,
blocks_compressed: u8,
flags: u8,
}
impl ChunkState {
fn new(key: &[u32; 8], chunk_counter: u64, flags: u32) -> Self {
fn new(key: &CVBytes, chunk_counter: u64, flags: u32) -> Self {
Self {
cv: *key,
chunk_counter,
@ -499,7 +482,7 @@ fn left_len(content_len: usize) -> usize {
// multithreading parallelism for that update().
fn compress_subtree_wide<J: join::Join>(
input: &[u8],
key: &[u32; 8],
key: &CVBytes,
chunk_counter: u64,
flags: u32,
out: guts::TransposedSplit,
@ -547,10 +530,10 @@ fn compress_subtree_wide<J: join::Join>(
// chunk or less. That's a different codepath.
fn compress_subtree_to_parent_node<J: join::Join>(
input: &[u8],
key: &[u32; 8],
key: &CVBytes,
chunk_counter: u64,
flags: u32,
) -> [u8; 64] {
) -> BlockBytes {
debug_assert!(input.len() > CHUNK_LEN);
let mut transposed_cvs = guts::TransposedVectors::default();
let (left_cvs, _) = guts::split_transposed_vectors(&mut transposed_cvs);
@ -568,7 +551,7 @@ fn compress_subtree_to_parent_node<J: join::Join>(
// Hash a complete input all at once. Unlike compress_subtree_wide() and
// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
fn hash_all_at_once<J: join::Join>(input: &[u8], key: &[u32; 8], flags: u32) -> Output {
fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVBytes, flags: u32) -> Output {
// If the whole subtree is one chunk, hash it directly with a ChunkState.
if input.len() <= CHUNK_LEN {
return ChunkState::new(key, 0, flags).update(input).output();
@ -596,20 +579,7 @@ fn hash_all_at_once<J: join::Join>(input: &[u8], key: &[u32; 8], flags: u32) ->
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn hash(input: &[u8]) -> Hash {
hash_all_at_once::<join::SerialJoin>(input, &IV, 0).root_hash()
}
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
let mut out = [0; 8];
for word_index in 0..8 {
out[word_index] = u32::from_le_bytes(
bytes[word_index * WORD_LEN..][..WORD_LEN]
.try_into()
.unwrap(),
);
}
out
hash_all_at_once::<join::SerialJoin>(input, &IV_BYTES, 0).root_hash()
}
/// The keyed hash function.
@ -626,9 +596,8 @@ pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::new_keyed`] and
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
let key_words = words_from_le_bytes_32(key);
hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
pub fn keyed_hash(key: &CVBytes, input: &[u8]) -> Hash {
hash_all_at_once::<join::SerialJoin>(input, key, KEYED_HASH).root_hash()
}
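// Editorial sketch (not part of this commit): the key is now passed as plain
// bytes, with an assumed placeholder value here.
let key = [0x42u8; 32];
let tag = keyed_hash(&key, b"some message");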
/// The key derivation function.
@ -666,20 +635,19 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
pub fn derive_key(context: &str, key_material: &[u8]) -> CVBytes {
let context_key =
hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV, DERIVE_KEY_CONTEXT)
hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
.root_hash();
let context_key_words = words_from_le_bytes_32(context_key.as_bytes());
hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
hash_all_at_once::<join::SerialJoin>(key_material, context_key.as_bytes(), DERIVE_KEY_MATERIAL)
.root_hash()
.0
}
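// Editorial sketch (not part of this commit), with a hypothetical context
// string; contexts should be hardcoded, application-specific, and unique.
let key: [u8; 32] = derive_key("example.com 2023-07-09 session key", b"input key material");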
fn parent_node_output(
left_child: &[u8; 32],
right_child: &[u8; 32],
key: &[u32; 8],
left_child: &CVBytes,
right_child: &CVBytes,
key: &CVBytes,
flags: u32,
) -> Output {
let mut block = [0; BLOCK_LEN];
@ -737,18 +705,18 @@ fn parent_node_output(
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
pub struct Hasher {
key: [u32; 8],
key: CVBytes,
chunk_state: ChunkState,
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because
// we don't know whether more input is coming. This is different from how
// the reference implementation does things.
cv_stack: ArrayVec<[u8; 32], { MAX_DEPTH + 1 }>,
cv_stack: ArrayVec<CVBytes, { MAX_DEPTH + 1 }>,
}
impl Hasher {
fn new_internal(key: &[u32; 8], flags: u32) -> Self {
fn new_internal(key: &CVBytes, flags: u32) -> Self {
Self {
key: *key,
chunk_state: ChunkState::new(key, 0, flags),
@ -758,16 +726,15 @@ impl Hasher {
/// Construct a new `Hasher` for the regular hash function.
pub fn new() -> Self {
Self::new_internal(&IV, 0)
Self::new_internal(&IV_BYTES, 0)
}
/// Construct a new `Hasher` for the keyed hash function. See
/// [`keyed_hash`].
///
/// [`keyed_hash`]: fn.keyed_hash.html
pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
let key_words = words_from_le_bytes_32(key);
Self::new_internal(&key_words, KEYED_HASH)
pub fn new_keyed(key: &CVBytes) -> Self {
Self::new_internal(key, KEYED_HASH)
}
/// Construct a new `Hasher` for the key derivation function. See
@ -777,10 +744,9 @@ impl Hasher {
/// [`derive_key`]: fn.derive_key.html
pub fn new_derive_key(context: &str) -> Self {
let context_key =
hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV, DERIVE_KEY_CONTEXT)
hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
.root_hash();
let context_key_words = words_from_le_bytes_32(context_key.as_bytes());
Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
Self::new_internal(context_key.as_bytes(), DERIVE_KEY_MATERIAL)
}
/// Reset the `Hasher` to its initial state.
@ -852,7 +818,7 @@ impl Hasher {
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when
// we're hashing an input all-at-once.)
fn push_cv(&mut self, new_cv: &[u8; 32], chunk_counter: u64) {
fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
self.merge_cv_stack(chunk_counter);
self.cv_stack.push(*new_cv);
}
@ -1178,19 +1144,26 @@ impl OutputReader {
/// reading further, the behavior is unspecified.
///
/// [`Read::read`]: #method.read
pub fn fill(&mut self, mut buf: &mut [u8]) {
while !buf.is_empty() {
let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
let output_bytes = &block[self.position_within_block as usize..];
let take = cmp::min(buf.len(), output_bytes.len());
buf[..take].copy_from_slice(&output_bytes[..take]);
buf = &mut buf[take..];
self.position_within_block += take as u8;
if self.position_within_block == BLOCK_LEN as u8 {
self.inner.counter += 1;
self.position_within_block = 0;
}
}
pub fn fill(&mut self, buf: &mut [u8]) {
guts::xof(
&self.inner.block,
self.inner.block_len as u32,
&self.inner.input_chaining_value,
self.inner.counter,
self.inner.flags as u32,
buf,
);
}
pub fn fill_xor(&mut self, buf: &mut [u8]) {
guts::xof_xor(
&self.inner.block,
self.inner.block_len as u32,
&self.inner.input_chaining_value,
self.inner.counter,
self.inner.flags as u32,
buf,
);
}
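// Editorial note: fill() now streams straight from guts::xof(), and the new
// fill_xor() XORs the stream into buf instead of overwriting it. As of this
// WIP commit, neither method advances counter or position_within_block, so
// consecutive reads would repeat bytes. A single-call sketch:
let mut reader = Hasher::new_keyed(&[0x11; 32]).update(b"nonce").finalize_xof();
let mut buf = *b"sixteen byte msg";
reader.fill_xor(&mut buf); // buf now holds message XOR keystream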
/// Return the current read position in the output stream. This is