init

2024-06-01 22:06:12 +02:00 · 2015-10-16 10:52:09 +02:00 · 2015-10-16 10:52:09 +02:00 · 31fc57d45a
parent e7a44c1dab
commit 31fc57d45a
43 changed files with 55698 additions and 2 deletions
--- a/31
+++ b/31
--- a/101
+++ b/101
@ -0,0 +1,101 @@
+#
+# Argon2 source code package
+# 
+# This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+# 
+# You should have received a copy of the CC0 Public Domain Dedication along with
+# this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+#
+
+# Compiler. Override on the command line, e.g. `make CC=clang`.
+CC = gcc
+# Flags for the reference build (default) and for the optimized build
+# (selected with OPT=TRUE; -m64 -mavx are passed to the compiler as-is).
+REF_CFLAGS = -std=c99 -pthread -O3 -Wall
+OPT_CFLAGS = -std=c99 -pthread -O3 -m64 -mavx -Wall
+
+# Directory layout, relative to the directory make is run from.
+# No trailing slashes: code that builds file paths appends "/" itself, so a
+# trailing slash here would produce "./src//file.c".
+ARGON2_DIR = ./src
+BLAKE2_DIR = ./src/blake2
+BUILD_DIR = ./build
+
+# Source file names, relative to ARGON2_DIR / BLAKE2_DIR.
+ARGON2_SOURCES = argon2.c argon2-core.c kat.c
+BLAKE2_SOURCES = blake2b-ref.c
+TEST_SOURCES = argon2-test.c
+
+# Exactly one of these is added to the build, depending on OPT.
+REF_SOURCES = argon2-ref-core.c
+OPT_SOURCES = argon2-opt-core.c
+
+# Base name of the shared library: lib$(LIB_NAME).$(LIB_EXT)
+LIB_NAME=argon2
+
+# Location of check_test_vectors.sh, used by the `test` target.
+SCRIPTS_DIR = ./../../Scripts
+
+# Source lists with their directory prefixes applied.
+ARGON2_BUILD_SOURCES = $(addprefix $(ARGON2_DIR)/,$(ARGON2_SOURCES))
+BLAKE2_BUILD_SOURCES = $(addprefix $(BLAKE2_DIR)/,$(BLAKE2_SOURCES))
+TEST_BUILD_SOURCES = $(addprefix $(ARGON2_DIR)/,$(TEST_SOURCES))
+
+
+# Implementation switch: `make OPT=TRUE` compiles $(OPT_SOURCES) with
+# $(OPT_CFLAGS); any other value (or none) compiles $(REF_SOURCES) with
+# $(REF_CFLAGS).
+# The bodies are indented with spaces on purpose: a leading tab would turn
+# them into recipe text if this block ever ended up after a rule.
+ifeq ($(strip $(OPT)),TRUE)
+  CFLAGS=$(OPT_CFLAGS)
+  ARGON2_BUILD_SOURCES += $(addprefix $(ARGON2_DIR)/,$(OPT_SOURCES))
+else
+  CFLAGS=$(REF_CFLAGS)
+  ARGON2_BUILD_SOURCES += $(addprefix $(ARGON2_DIR)/,$(REF_SOURCES))
+endif
+
+
+# Absolute directory of this Makefile (keeps the trailing slash added by $(dir)).
+SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Absolute path of the build directory, embedded in LIB_PATH as an rpath.
+BUILD_DIR_PATH := $(shell pwd)/$(BUILD_DIR)
+
+SYSTEM_KERNEL_NAME := $(shell uname -s)
+
+# Shared-library settings.
+# The defaults are the ELF conventions previously applied only on Linux; they
+# now also cover kernels that are not special-cased below, which used to be
+# left with an empty LIB_EXT/LIB_CFLAGS (output "libargon2." and no -shared).
+# NOTE(review): the -Wl,-rpath= spelling is assumed to be accepted by the
+# linker on those other platforms -- confirm where it matters.
+LIB_EXT := so
+LIB_CFLAGS := -shared -fPIC
+LIB_PATH := -Wl,-rpath=$(BUILD_DIR_PATH)
+
+ifeq ($(SYSTEM_KERNEL_NAME),Darwin)
+  # LIB_EXT must be assigned first: LIB_CFLAGS expands it immediately (:=).
+  LIB_EXT := dylib
+  LIB_CFLAGS := -dynamiclib -install_name @rpath/lib$(LIB_NAME).$(LIB_EXT)
+  LIB_PATH := -Xlinker -rpath -Xlinker $(BUILD_DIR_PATH)
+endif
+
+
+# Every target below is a command, not a file: the recipes write their
+# outputs into $(BUILD_DIR), never to a file named after the target. Without
+# the .PHONY entry, a stray file or directory called e.g. "argon2" or "all"
+# would make the target look up to date and silently skip the build.
+.PHONY: all clean argon2 argon2-genkat argon2-lib test
+
+# Default goal: build the executable, the KAT generator and the shared library.
+all: argon2 argon2-genkat argon2-lib
+
+# Compile the Argon2, Blake2 and test sources in one step and write the
+# resulting executable to $(BUILD_DIR)/argon2.
+argon2:
+	mkdir -p $(BUILD_DIR)
+	$(CC) $(CFLAGS) -I$(ARGON2_DIR) -I$(BLAKE2_DIR) \
+		$(ARGON2_BUILD_SOURCES) $(BLAKE2_BUILD_SOURCES) $(TEST_BUILD_SOURCES) \
+		-o $(BUILD_DIR)/$@
+
+# Same sources as `argon2`, compiled with ARGON2_KAT and ARGON2_KAT_INTERNAL
+# defined; the executable is written to $(BUILD_DIR)/argon2-genkat.
+argon2-genkat:
+	mkdir -p $(BUILD_DIR)
+	$(CC) $(CFLAGS) -DARGON2_KAT -DARGON2_KAT_INTERNAL \
+		-I$(ARGON2_DIR) -I$(BLAKE2_DIR) \
+		$(ARGON2_BUILD_SOURCES) $(BLAKE2_BUILD_SOURCES) $(TEST_BUILD_SOURCES) \
+		-o $(BUILD_DIR)/$@
+
+# Shared library from the Argon2 and Blake2 sources (test sources excluded),
+# written to $(BUILD_DIR)/lib$(LIB_NAME).$(LIB_EXT) using the platform's
+# $(LIB_CFLAGS).
+argon2-lib:
+	mkdir -p $(BUILD_DIR)
+	$(CC) $(CFLAGS) $(LIB_CFLAGS) -I$(ARGON2_DIR) -I$(BLAKE2_DIR) \
+		$(ARGON2_BUILD_SOURCES) $(BLAKE2_BUILD_SOURCES) \
+		-o $(BUILD_DIR)/lib$(LIB_NAME).$(LIB_EXT)
+
+# Build argon2-genkat first, then run check_test_vectors.sh from
+# $(SCRIPTS_DIR), handing it the directory of this Makefile via -src=.
+test:   argon2-genkat
+	$(SCRIPTS_DIR)/check_test_vectors.sh -src=$(SRC_DIR)
+
+# Remove the build directory and everything in it.
+# The guard stops `make clean BUILD_DIR=` from expanding to `rm -rf /`.
+clean:
+	$(if $(strip $(BUILD_DIR)),,$(error BUILD_DIR is empty; refusing to run rm -rf on /))
+	rm -rf $(BUILD_DIR)/
--- a/README.md
+++ b/README.md
@ -1,3 +1,3 @@
-# PHC winner: Argon2
+# Argon2

-Soon
+Work in progress, do not use 
--- a/check_test_vectors.sh
+++ b/check_test_vectors.sh
@ -0,0 +1,109 @@
+#!/bin/bash
+#
+# Argon2 source code package
+# 
+# This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+# 
+# You should have received a copy of the CC0 Public Domain Dedication along with
+# this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+#
+
+
+# Get current script path
+script_path=$(dirname $0)
+
+
+# Change current directory to root directory
+if [ '.' != $script_path ] ; then
+	cd $script_path/../
+fi
+
+
+ARGON2_TYPES=(Argon2d Argon2i Argon2id Argon2ds)
+ARGON2_IMPLEMENTATIONS=(REF OPT)
+
+OUTPUT_PATH=./../../Output/
+TEST_VECTORS_PATH=./../../TestVectors/
+
+KAT_REF=kat-argon2-ref.log
+KAT_OPT=kat-argon2-opt.log
+
+
+# Parse script arguments
+for i in "$@"
+do
+	case $i in
+		-s=*|-src=*|--source=*)
+			SOURCE_DIR="${i#*=}"
+			shift
+			;;
+		*)
+			# Unknown option
+			;;
+	esac
+done
+
+
+# Change current directory to source directory
+cd $SOURCE_DIR
+
+
+# Build each implementation, generate the vectors for every Argon2 type and
+# compare them with the reference files.
+# Failures are counted and turned into a non-zero exit status at the end, so
+# callers such as `make test` can tell that something went wrong.
+failures=0
+
+for implementation in "${ARGON2_IMPLEMENTATIONS[@]}"
+do
+	echo "Test for $implementation"
+
+	make_log=$OUTPUT_PATH"make_"$implementation".log"
+	rm -f "$make_log"
+
+	flags=""
+	if [ "OPT" == "$implementation" ] ; then
+		flags="OPT=TRUE"
+	fi
+
+	# ${flags:+...} passes no argument at all when flags is empty
+	if ! make ${flags:+"$flags"} &> "$make_log" ; then
+		echo -e "\t\t -> Wrong! Make error! See $make_log for details!"
+		failures=$((failures + 1))
+		continue
+	fi
+	# The build log is only kept when the build failed
+	rm -f "$make_log"
+
+
+	for type in "${ARGON2_TYPES[@]}"
+	do
+		echo -e "\t Test for $type"
+
+		# Indirect lookup: KAT_REF or KAT_OPT, depending on the implementation
+		kat_file_name="KAT_"$implementation
+		kat_file=${!kat_file_name}
+		rm -f "$kat_file"
+
+		run_log=$OUTPUT_PATH"run_"$type"_"$implementation".log"
+		if ! ./../../Build/argon2-tv -gen-tv -type "$type" > "$run_log" ; then
+			echo -e "\t\t -> Wrong! Run error! See $run_log for details!"
+			failures=$((failures + 1))
+			continue
+		fi
+		# The run log is only kept when the run failed
+		rm -f "$run_log"
+
+
+		# Move the KAT log to the output directory, with "argon2" in its name
+		# replaced by the type under test
+		kat_file_copy=$OUTPUT_PATH/${kat_file/"argon2"/$type}
+		cp "$kat_file" "$kat_file_copy"
+		rm -f "$kat_file"
+
+		test_vectors_file=$TEST_VECTORS_PATH$type".txt"
+
+		diff_file=$OUTPUT_PATH"diff_"$type"_"$implementation
+		rm -f "$diff_file"
+
+
+		if diff -Naur "$kat_file_copy" "$test_vectors_file" > "$diff_file" ; then
+			echo -e "\t\t -> OK!"
+			rm -f "$kat_file_copy"
+			rm -f "$diff_file"
+		else
+			echo -e "\t\t -> Wrong! See $diff_file for details!"
+			failures=$((failures + 1))
+		fi
+	done
+done
+
+# Report the overall result to the caller
+if [ "$failures" -ne 0 ] ; then
+	exit 1
+fi
--- a/doc/Argon2.pdf
+++ b/doc/Argon2.pdf
--- a/doc/LaTeX/argon2-spec.tex
+++ b/doc/LaTeX/argon2-spec.tex
@ -0,0 +1,863 @@
+
+\documentclass[a4paper]{article}
+\usepackage[hmargin=2cm,vmargin=2cm]{geometry}
+
+
+\pagestyle{plain}
+
+\usepackage{amssymb,amsthm,amsfonts,longtable, comment,array, ifpdf, hyperref,cite,url}
+\usepackage{graphicx}
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+\newcommand{\Tag}{\mathrm{Tag}}
+
+
+% *** MATH PACKAGES ***
+%
+\usepackage[cmex10]{amsmath}
+
+
+
+
+% *** SPECIALIZED LIST PACKAGES ***
+%
+\usepackage{algorithmic}
+
+
+
+
+
+\begin{document}
+
+%FINISHED
+
+\title{\textsf{Argon2: the memory-hard function for password hashing and other applications}}
+
+\author{Designers: Alex Biryukov, Daniel Dinu,  and Dmitry Khovratovich\\University of Luxembourg, Luxembourg
+\\[10pt]
+%Submitters: Alex Biryukov and Dmitry Khovratovich
+%\\
+{\tt alex.biryukov@uni.lu, dumitru-daniel.dinu@uni.lu, khovratovich@gmail.com}\\[10 pt]
+\url{https://www.cryptolux.org/index.php/Argon2}\\
+\url{https://github.com/khovratovich/Argon2}\\[10pt]
+Version 1.2.1 of Argon2}
+
+\maketitle
+
+\section{Introduction}
+
+Passwords, despite all their drawbacks, remain the primary form of authentication on various web-services. Passwords are usually stored in a hashed form in a server's database. These databases are quite often captured by the adversaries,  who then apply dictionary attacks since passwords tend to have low entropy. Protocol designers use a number of tricks to mitigate these issues. Starting from the late 70's, a password is hashed together with a random \emph{salt} value to prevent detection of identical passwords across different users and services. The hash function computations, which became faster and faster due to Moore's law have been called multiple times to increase the cost of password trial for the attacker.
+
+ In the meanwhile, the password crackers migrated to new architectures, such as FPGAs, multiple-core GPUs and dedicated ASIC modules, where the amortized cost of a multiple-iterated hash function is much lower. It was quickly noted that these new environments are great when the computation is almost memoryless, but they experience difficulties when  operating on a large amount of memory. The defenders responded by designing \emph{memory-hard} functions, which require a large amount of memory to be computed, and impose computational penalties if less memory is used. The password hashing scheme \textsf{scrypt}~\cite{percival2009stronger} is an instance of such function.
+
+Memory-hard schemes also have other applications. They can be used for key derivation from low-entropy sources. Memory-hard schemes are also welcome in cryptocurrency designs~\cite{litecoin} if a creator wants to demotivate the use of GPUs and ASICs for mining and promote the use of standard desktops.
+
+\paragraph{Problems of existing schemes} A trivial solution for password hashing is a keyed hash function such as HMAC. If the protocol designer prefers hashing without secret keys to avoid all the problems with key generation, storage, and update, then he has few alternatives: the generic mode PBKDF2, the Blowfish-based \textsf{bcrypt}, and \textsf{scrypt}. Among those, only 
+\textsf{scrypt} aims for high memory, but the existence of a trivial time-memory tradeoff~\cite{ForlerLW14} allows compact implementations with the same energy cost.
+
+Design of a memory-hard function proved to be a tough problem. Since early 80's it has been known that many cryptographic problems that seemingly require large memory actually allow for a time-memory tradeoff~\cite{hellman1980cryptanalytic}, where the adversary can trade memory for time and do his job on fast hardware with low memory. In application
+to password-hashing schemes, this means that the password crackers can still be implemented on a dedicated hardware even though at some additional cost. 
+
+Another problem with the existing schemes is their complexity. The same \textsf{scrypt} calls a stack of subprocedures, whose design rationale has not been fully motivated (e.g., \textsf{scrypt} calls SMix, which calls ROMix, which calls BlockMix, which calls Salsa20/8 etc.).  It is hard to analyze and, moreover, hard to achieve confidence. Finally, it is not flexible in separating time and memory costs. 
+At the same time, the story of cryptographic competitions~\cite{robshaw2008new,sha3} has demonstrated that
+the most secure designs come with simplicity, where every element is well motivated and a cryptanalyst has as few entry points as possible.
+
+The  Password Hashing Competition, which started in 2014,  highlighted the following problems:
+\begin{itemize}
+  \item Should the memory addressing (indexing functions) be input-independent or input-dependent, or hybrid? The first type of schemes, where the memory read locations are known in advance, is immediately vulnerable to time-space tradeoff attacks,
+ since an adversary can precompute the missing block by the time it is needed~\cite{trade-att}. In turn, the input-dependent schemes are vulnerable to side-channel attacks~\cite{RistenpartTSS09}, as the timing information allows for much faster password search.
+  \item Is it better to fill more memory but suffer from time-space tradeoffs, or make more passes over the memory to be more robust? This question was quite difficult to answer due to absence of generic tradeoff tools, which would analyze the security against tradeoff attacks, and the absence of unified metric to measure adversary's costs.
+          \item How should the input-independent addresses be computed? Several seemingly secure options have been attacked~\cite{trade-att}.
+  \item How large should a single memory block be? Reading smaller randomly placed blocks is slower (in cycles per byte) due to the spatial locality principle of the CPU cache. In turn, larger
+  blocks are difficult to process due to the limited number of long registers.
+  \item If the block is large, how to choose the internal compression function? Should it be cryptographically secure or more lightweight, providing only basic mixing of the inputs? Many candidates simply proposed an iterative construction  and argued against cryptographically strong transformations.
+
+      \item How to exploit multiple cores of modern CPUs, when they are available? Parallelizing calls to the hashing function without any interaction is subject  to simple tradeoff attacks.
+\end{itemize}
+
+\paragraph{Our solution} We offer a hashing scheme called  \textsf{Argon2}.
+ \textsf{Argon2} summarizes the state of the art in the design of memory-hard functions. It is a streamlined and simple design. It aims at the highest memory filling rate and effective use of multiple computing units, while still
+providing defense against tradeoff attacks. \textsf{Argon2} is optimized for the x86 architecture and exploits the cache and memory organization of the recent Intel and AMD processors. \textsf{Argon2} has two variants: \textsf{Argon2d} and \textsf{Argon2i}. \textsf{Argon2d} is faster and uses data-dependent memory access, which makes it suitable for cryptocurrencies and applications with no threats from side-channel timing attacks. \textsf{Argon2i} uses data-independent memory access, which is preferred for password hashing and password-based key derivation. \textsf{Argon2i} is slower as it makes more passes over the memory to protect from tradeoff attacks.
+
+
+We recommend \textsf{Argon2} for the applications that aim for high performance. Both versions of \textsf{Argon2} can fill 1 GB of RAM in a fraction of a second, and smaller amounts even faster. It scales easily to an arbitrary number of parallel computing units. Its design is also optimized for clarity to ease analysis and implementation.
+
+Our scheme provides more features and better tradeoff resilience than pre-PHC designs and equals in performance with the  PHC finalists~\cite{broz15}. 
+
+\section{Definitions}
+
+\subsection{Motivation}\label{sec:costs}
+We aim to maximize the cost of password cracking on ASICs. There can be different approaches to measure this cost, but we turn to one of the most popular -- the time-area product~\cite{Thompson79,BernsteinL13}. We assume that the password $P$ is hashed with salt $S$ but without secret keys, and the hashes may leak to the adversaries together with salts:
+$$
+\begin{aligned}
+\mathrm{Tag} &\leftarrow \mathcal{H}(P,S);\\
+\mathrm{Cracker} &\leftarrow \{(\mathrm{Tag}_i, S_i)\}.
+\end{aligned}
+$$
+
+In the case of the password hashing, we suppose that the defender allocates certain amount of time (e.g., 1 second) per password and a certain number of CPU cores (e.g., 4 cores). Then he hashes the password using the maximum amount $M$ of memory. This memory size translates to certain ASIC area $A$. The running ASIC time $T$ is determined by the length of the longest computational chain and by the ASIC memory latency.
+Therefore, we maximize the value $AT$. The other use cases follow a similar procedure. 
+
+Suppose that an ASIC designer that wants to reduce the memory and thus the area wants to compute $\mathcal{H}$ using $\alpha M$ memory only for some $\alpha<1$. Using some tradeoff specific to $\mathcal{H}$, he has to spend $C(\alpha)$ times as much computation and his running time increases by at least the factor $D(\alpha)$. Therefore, the maximum possible gain $\mathcal{E}$ in the time-area product is 
+$$
+\mathcal{E}_{max}= \max_{\alpha}\frac{1}{\alpha D(\alpha)}.
+$$
+The hash function is called \emph{memory-hard} if $D(\alpha) >1/\alpha$ as $\alpha\rightarrow 0$. Clearly, in this case the time-area product does not decrease. Moreover, the following aspects may further increase it:
+\begin{itemize}
+\item Computing cores needed to implement the $C(\alpha)$ penalty may occupy significant area.
+\item If the tradeoff requires significant communication between the computing cores, the memory bandwidth limits may impose additional restrictions on the running time.
+\end{itemize}
+
+In the following text, we will not attempt to  estimate  time and area with large precision. However, an interested reader may use the following implementations as reference:
+ \begin{itemize}
+   \item The 50-nm DRAM implementation~\cite{giridhar2013dram} takes 550 mm${}^2$ per GByte;
+   \item The Blake2b implementation in the 65-nm process should take about 0.1 mm${}^2$ (using Blake-512 implementation in~\cite{gurkaynak2012sha3});
+   \item The maximum memory bandwidth achieved by modern GPUs is around 400 GB/sec.
+ \end{itemize}
+ 
+ \subsection{Model for memory-hard functions}
+ 
+ The memory-hard functions that we explore use the following mode of operation. The memory array $B[]$ is filled with the   compression function $G$:
+
+ \begin{equation}\label{eq:class}
+\begin{array}{rl}
+B[0] &= H(P,S);\\
+\text{for $j$ }&\text{from 1  to } t\\
+  &B[j] = G \bigl(B[\phi_1(j)] ,  B[\phi_2(j)] ,\cdots , B[\phi_k(j)]\bigr),
+\end{array}
+\end{equation}
+where $\phi_i()$ are some \emph{indexing functions}.
+
+
+We distinguish two types of indexing functions:
+\begin{itemize}
+  \item Independent of the password and salt, but possibly dependent on other public parameters (\emph{data-independent}). Thus the addresses can be calculated by the adversaries. We suppose that the dedicated hardware can handle parallel memory access, so that the  cracker can prefetch the data from the memory. Moreover, if she implements a time-space tradeoff, then the missing blocks can be also precomputed without losing time. Let the single $G$ core occupy the area equivalent to the $\beta$ of the entire memory. Then if we use $\alpha M$ memory, then the  gain in the time-area product is
+  $$
+  \mathcal{E}(\alpha) = \frac{1}{\alpha + C(\alpha)\beta}.
+  $$
+  \item Dependent on the password (\emph{data-dependent}), in our case: $\phi(j) = g(B[j-1])$. This choice prevents the adversary from prefetching and precomputing missing data. The adversary figures out what he has to recompute only at the time the element is needed. If an element is recomputed as a tree of $F$ calls of average depth $D$, then the total processing time is multiplied by $D$. The   gain in the time-area product is
+  $$
+  \mathcal{E}(\alpha) = \frac{1}{(\alpha + C(\alpha)\beta)D(\alpha)}.
+  $$
+\end{itemize}
+
+
+ The maximum bandwidth $Bw_{max}$ is a hypothetical upper bound on the memory bandwidth on the adversary's architecture. Suppose that for each call to $G$ an adversary has to load
+  $R(\alpha)$ blocks from the memory on average. Therefore,  the adversary can keep the execution time the same as long as
+ $$
+ R(\alpha) Bw \leq Bw_{max},
+ $$
+ where $Bw$ is the bandwidth achieved by a full-space implementation. In the tradeoff attacks that we apply the following holds:
+   $$
+   R(\alpha)  = C(\alpha).
+   $$
+
+
+
+\section{Specification of Argon2}
+
+There are two flavors of \textsf{Argon2}\ -- \textsf{Argon2d} and \textsf{Argon2i}. The former one uses data-dependent memory access to thwart tradeoff attacks. However, this makes it vulnerable to side-channel attacks, so \textsf{Argon2d} is recommended primarily for cryptocurrencies and backend servers. \textsf{Argon2i} uses data-independent memory access, which is recommended for password hashing and password-based key derivation.
+
+\subsection{Inputs}
+
+\textsf{Argon2}\ has two types of inputs: primary inputs and secondary inputs, or parameters. Primary inputs are message $P$ and nonce $S$, which are password and salt, respectively, for the password hashing. Primary inputs must always be given by the user such that
+\begin{itemize}
+  \item Message $P$ may have any length from $0$ to $2^{32}-1$ bytes;
+  \item Nonce $S$ may have any length from $8$ to $2^{32}-1$ bytes (16 bytes is recommended for password hashing).
+\end{itemize}
+ Secondary inputs have the following restrictions:
+\begin{itemize}
+  \item Degree of parallelism $p$ determines how many independent (but synchronizing) computational chains can be run.  It may take any integer value from 1 to 255.
+  \item Tag length $\tau$ may be any integer number of bytes from 4 to $2^{32}-1$.
+  \item Memory size $m$ can be any integer number of kilobytes from $8p$ to $2^{32}-1$. The actual number of blocks is $m'$, which is $m$ rounded down to the nearest multiple of $4p$. 
+  \item Number of iterations $t$ (used to tune the running time independently of the memory size) can be any integer number from 1 to $2^{32}-1$;
+  \item Version number $v$ is one byte $0x10$;
+  \item Secret value $K$ (serves as key if necessary, but we do not assume any key use by default) may have any length from $0$ to $32$ bytes.
+  \item Associated data $X$ may have any length from $0$ to $2^{32}-1$ bytes.
+  \item Type $y$ of \textsf{Argon2}: 0 for  \textsf{Argon2d}, 1 for \textsf{Argon2i}.
+\end{itemize}
+
+\textsf{Argon2}\ uses internal compression function ${G}$ with two 1024-byte inputs and a 1024-byte output, and internal hash function ${H}$. Here ${H}$ is the Blake2b hash function, and ${G}$ is based on  its internal permutation. The mode of operation of \textsf{Argon2} is quite simple when no parallelism is used: function ${G}$ is iterated $m$ times. At step $i$ a block with index $\phi(i)<i$ is taken from the memory (Figure~\ref{fig:generic}), where $\phi(i)$ is either determined by the previous block in \textsf{Argon2d}, or is a fixed value in \textsf{Argon2i}.
+
+
+
+\begin{figure}[ht]
+  \ifpdf
+\begin{center}
+  \includegraphics[scale=0.6]{pics/generic.pdf}
+  \caption{Argon2 mode of operation with no parallelism. }\label{fig:generic}
+\end{center}
+\fi
+  \end{figure}
+
+\subsection{Operation}
+
+\textsf{Argon2}\ follows the extract-then-expand concept. First, it extracts entropy from message and nonce by hashing it.  All the other parameters are also added to the input. The variable length inputs $P,S,K,X$  are prepended with their lengths:
+$$
+H_0 = \mathcal{H}(p,\tau,m,t,v,y,\langle P \rangle,P,\langle S \rangle,S,\langle K \rangle,K, \langle X \rangle,X).
+$$
+Here $H_0$ is 64-byte value, and the parameters $p,\tau,m,t,v,y,
+\langle P \rangle,\langle S \rangle, \langle K \rangle,\langle X \rangle$ are treated as little-endian 32-bit integers.
+
+\textsf{Argon2}\ then fills the memory with $m$ 1024-byte blocks. For tunable parallelism with $p$ threads, the memory is organized in a matrix $B[i][j]$ of blocks
+with $p$ rows (\emph{lanes}) and $q=\lfloor m/p \rfloor$ columns. Blocks are computed as follows:
+\begin{align*}
+B[i][0] &= H'(H_0||\underbrace{i}_{\text{4 bytes}}||\underbrace{0}_{\text{4 bytes}}),\quad 0 \leq i < p; \\
+B[i][1] &= H'(H_0||\underbrace{i}_{\text{4 bytes}}||\underbrace{1}_{\text{4 bytes}}),\quad 0 \leq i < p;\\
+B[i][j] &= G(B[i][j-1], B[i'][j']),\quad 0 \leq i < p,\; 2\leq j <q.
+\end{align*}
+where block index $[i'][j']$ is determined differently for \textsf{Argon2d/2ds} and \textsf{Argon2i},  $G$ is the compression function, and $H'$ is a variable-length hash function built upon $H$. Both $G$ and $H'$ will be fully defined in the further text. 
+
+If $t>1$, we repeat the procedure; however the first two blocks of a lane are now computed in the same way:
+\begin{align*}
+B[i][0] &=G(B[i][q-1], B[i'][j']);\\
+B[i][j] &= G(B[i][j-1], B[i'][j']).
+\end{align*}
+
+When we have done  $t$ iterations over the memory, we compute the final block $B_m$ as the XOR of the last column:
+$$
+B_m = B[0][q-1] \oplus B[1][q-1]\oplus \cdots\oplus B[p-1][q-1].
+$$
+Then we apply $H'$ to $B_m$ to get the output tag.
+
+\paragraph{Variable-length hash function.} Let $H_x$ be a hash function with $x$-byte output (in our case $H_x$ is Blake2b, which supports $1\leq x \leq 64$). We define $H'$ as follows. Let $V_i$ be a 64-byte block, and  $A_i$ be its first 32 bytes,
+and $\tau<2^{32}$ be the tag length in bytes. 
+Then we define
+\begin{align*}
+H'(X):\quad V_0 &\leftarrow\tau||X;\\
+V_1 &\leftarrow H_{64}(V_0);\\
+V_2 &\leftarrow H_{64}(V_1);\\
+&\cdots\\
+V_m & \leftarrow H_{64}(V_{m-1}),\quad m = \lfloor\tau/32\rfloor-1;\\
+V_{m+1} &\leftarrow H_{\tau\hspace{-10pt} \pmod{64}}(V_{m-1})\quad \text{(absent if  64 divides }\tau).\\
+\text{Tag} & \leftarrow A_1||A_2||\ldots A_m||V_{m+1}.
+\end{align*}
+
+ \begin{figure}[ht]
+  \ifpdf
+\begin{center}
+  \includegraphics[scale=0.3]{pics/argon2-par.pdf}
+  \caption{Single-pass \textsf{Argon2} with $p$ lanes and 4 slices. }\label{fig:argon2}
+\end{center}
+\fi
+  \end{figure}
+
+\subsection{Indexing}\label{sec:index}
+
+To enable parallel block computation, we further partition the memory matrix into  $S=4$ vertical \emph{slices}. Intersection of a slice and a lane is \emph{segment} of length $q/S$.  Segments of the same slice
+  are computed in parallel, and may not reference blocks from each other. All other blocks can be referenced, and now we explain the procedure in detail.
+
+\paragraph{Getting two 32-bit values.} In Argon2d we select  the first 32 bits of  block $B[i][j-1]$ and denote this value by $J_1$. Then we take the next 32 bits of $B[i][j-1]$ and denote this value by $J_2$. In \textsf{Argon2i} we run $G^2$ --- the 2-round compression function $G$ -- in the counter mode, where the first input is all-zero block, and the second input is constructed as 
+  $$
+  (\underbrace{r}_{\text{8 bytes}}||\underbrace{l}_{\text{8 bytes}}||\underbrace{s}_{\text{8 bytes}}||\underbrace{m'}_{\text{8 bytes}}||\underbrace{t}_{\text{8 bytes}}||\underbrace{x}_{\text{8 bytes}}||\underbrace{i}_{\text{8 bytes}}||\underbrace{0}_{\text{968 bytes}}),
+  $$ where
+  \begin{itemize}
+  \item $r$ is the pass number;
+  \item $l$ is the lane number;
+  \item $s$ is the slice number;
+  \item $m'$ is the total number of memory blocks;
+  \item $t$ is the total number of passes;
+  \item $x$ is the type of the Argon function (equals $1$ for \textsf{Argon2i});
+  \item $i$ is  the counter starting in each segment from 1.
+  \end{itemize} All the numbers are put as little-endian. We increase the counter so that each application of $G^2$ gives 128 64-bit values $J_1||J_2$.
+  
+ \paragraph{Mapping $J_1,J_2$ to the reference block index} The value $l = J_2 \bmod p$ determines the index of the lane from which the block will be taken. If we work with the first slice and the first pass ($r=s=0$), then $l$ is set to the current lane index.  
+ 
+ Then we determine the set of indices $\mathcal{R}$ that can be referenced for given $[i][j]$ according to the following rules:
+\begin{enumerate}
+    \item If $l$ is the current lane, then $\mathcal{R}$ includes all blocks computed in this lane, which are not overwritten yet, excluding $B[i][j-1]$.
+    \item If $l$ is not the current lane, then $\mathcal{R}$ includes all blocks in the last $S-1=3$ segments computed and finished in lane $l$. If $B[i][j]$ is the first block of a segment, then the very last block from $\mathcal{R}$ is excluded.
+\end{enumerate}
+  We are going to take a block from $\mathcal{R}$ with a non-uniform distribution over $[0..|\mathcal{R}|)$:
+  $$
+  J_1\in [0..2^{32}) \rightarrow |\mathcal{R}|(1-\frac{(J_1)^2}{2^{64}}).
+  $$ To avoid floating-point computation, we use the following integer approximation:
+\begin{align*}
+x &= (J_1)^2/2^{32};\\
+y &= (|\mathcal{R}|*x)/2^{32};\\
+z & = |\mathcal{R}|-1-y.
+\end{align*}
+Then we enumerate the blocks in $\mathcal{R}$ in the order of construction and select $z$-th block from it as the reference block.
+
+
+
+\subsection{Compression function $G$}\label{sec:compr}
+
+Compression function $G$ is built upon  the Blake2b round function $\mathcal{P}$ (fully defined in Section~\ref{sec:blakeround}). $\mathcal{P}$ operates on the 128-byte input, which can be viewed as 8 16-byte registers (see details below):
+$$
+\mathcal{P}(A_0,A_1,\ldots, A_7) = (B_0,B_1,\ldots, B_7).
+$$
+
+Compression function ${G}(X,Y)$ operates on two 1024-byte blocks $X$ and $Y$. It first computes $R=X\oplus Y$. Then $R$ is viewed as a $8\times 8$-matrix of 16-byte registers $R_0, R_1,\ldots, R_{63}.$ Then
+$\mathcal{P}$ is first applied rowwise, and then columnwise to get $Z$:
+\begin{align*}
+    (Q_0,Q_1,\ldots,Q_7) &\leftarrow \mathcal{P}(R_0,R_1,\ldots,R_7);\\
+        (Q_8,Q_9,\ldots,Q_{15})&\leftarrow \mathcal{P}(R_8,R_9,\ldots,R_{15});\\
+        \ldots&\\
+        (Q_{56},Q_{57},\ldots,Q_{63})&\leftarrow \mathcal{P}(R_{56},R_{57},\ldots,R_{63});\\[10pt]
+        (Z_0,Z_8,Z_{16},\ldots,Z_{56})&\leftarrow \mathcal{P}(Q_0,Q_8,Q_{16},\ldots,Q_{56});\\
+        (Z_1,Z_9,Z_{17},\ldots,Z_{57})&\leftarrow     \mathcal{P}(Q_1,Q_9,Q_{17},\ldots,Q_{57});\\
+        \ldots&\\
+        (Z_7,Z_{15},Z_{23},\ldots,Z_{63})&\leftarrow \mathcal{P}(Q_7,Q_{15},Q_{23},\ldots,Q_{63}).
+  \end{align*}
+  Finally, $G$ outputs $Z\oplus R$:
+  $$
+  G:\quad (X,Y)\; \rightarrow\; R = X\oplus Y\; \xrightarrow{\mathcal{P}}\;Q\;\xrightarrow{\mathcal{P}}\;Z\;
+  \rightarrow \;Z\oplus R.
+  $$
+  
+  \begin{figure}[ht]
+  \ifpdf
+\begin{center}
+  \includegraphics[scale=0.6]{pics/compression.pdf}
+  \caption{Argon2 compression function $G$. }\label{fig:compression}
+\end{center}
+\fi
+  \end{figure}
+  
+  \section{Features}
+
+\textsf{Argon2} is a multi-purpose family of  hashing schemes, which is suitable for password hashing, key derivation, cryptocurrencies and other applications that require provably high memory use. \textsf{Argon2} is optimized for the x86 architecture, but it does not slow down much on older processors. The key feature of \textsf{Argon2} is its performance and the ability to use multiple computational cores in a way that prohibits time-memory tradeoffs. Several features are not included into this version, but can be easily added later.
+
+\subsection{Available features}
+Now we provide an extensive list of features of Argon2.
+
+\textbf{Performance}. \textsf{Argon2} fills memory very fast, thus increasing the area multiplier in the time-area product for ASIC-equipped adversaries. Data-independent version \textsf{Argon2i} securely fills the memory spending about 2 CPU cycles per byte, and \textsf{Argon2d} is three times as fast. This makes it suitable for applications that need memory-hardness but can not allow much CPU time, like cryptocurrency peer software.
+
+\textbf{Tradeoff resilience}.  Despite high performance, \textsf{Argon2} provides reasonable level of tradeoff resilience. Our tradeoff attacks previously applied to Catena and Lyra2 show the following. With default number of passes over memory (1 for \textsf{Argon2d}, 3 for \textsf{Argon2i}), an ASIC-equipped adversary can not decrease the time-area product if the memory is reduced by the factor of 4 or more. Much higher penalties apply if more passes over the memory are made. 
+
+\textbf{Scalability}.  \textsf{Argon2} is scalable both in time and memory dimensions.  Both parameters can be changed independently provided that a certain amount of time is always needed to fill the memory. 
+
+\textbf{Parallelism}. \textsf{Argon2}  may use up to 64 threads in parallel,  although in our experiments 8 threads already exhaust the available bandwidth and computing power of the machine.
+
+\textbf{GPU/FPGA/ASIC-unfriendly}. \textsf{Argon2} is heavily optimized for the x86 architecture, so that implementing it on  dedicated cracking hardware should  be neither cheaper nor faster. Even specialized ASICs would require significant area and would not allow reduction in the time-area product.
+
+\textbf{Additional input support}. \textsf{Argon2} supports additional input, which is syntactically separated from the message and nonce, such as secret key, environment parameters, user data, etc.
+
+
+\subsection{Possible future extensions}\label{sec:future2}
+
+Argon2\ can be rather easily tuned to support other compression functions, hash functions and block sizes.
+ROM can be easily integrated into \textsf{Argon2} by simply including it into the area where the blocks are referenced from.
+
+
+\section{Security analysis}
+
+
+
+
+
+\subsection{Ranking tradeoff attack}\label{sec:tradeoff} To figure out the costs of the ASIC-equipped adversary, we first need to calculate the time-space tradeoffs for \textsf{Argon2}. To the best of our knowledge, the first generic
+tradeoff attacks were reported in~\cite{trade-att}, and they apply to both data-dependent and data-independent schemes. The idea of the ranking method~\cite{trade-att} is as follows. When we generate a memory block $B[l]$, we decide whether to store it or not. If we do not store it, we calculate the access complexity of this block --- the number of calls to $F$ needed to compute the block, which is based on the access complexity of $B[l-1]$ and $B[\phi(l)]$. The detailed strategy is as follows:
+ \begin{enumerate}
+ \item Select an integer $q$ (for the sake of simplicity let $q$ divide $T$).
+  \item Store $B[kq]$ for all $k$;
+  \item Store all $r_i$ and all access complexities;
+  \item Store the  $T/q$  highest access complexities. If $B[i]$ refers to a vertex from this top, we store $B[i]$.
+\end{enumerate}
+The memory reduction is a probabilistic function of $q$. We applied the algorithm  to the indexing function of \textsf{Argon2} and obtained the results in Table~\ref{tab:generic3}. Each recomputation is a tree of certain depth, also given in the table.
+
+We conclude that for data-dependent one-pass schemes the adversary is always able to reduce the memory by the factor of 4 and still keep the time-area product the same.
+\begin{table}[hb]
+\renewcommand{\arraystretch}{1.3}
+$$
+\begin{array}{|c||c|c|c|c|c|c|c|c|}
+\hline
+\text{$\alpha$ } &\frac{1}{2} &\frac{1}{3} &\frac{1}{4} &\frac{1}{5} &\frac{1}{6} &\frac{1}{7}  \\
+\hline
+\text{$C(\alpha)$} &1.5& 4& 20.2& 344&  4660 &  2^{18}\\
+\text{$D(\alpha)$} & 1.5 & 2.8 & 5.5 & 10.3 & 17 &27 \\
+\hline
+\end{array}
+$$
+\caption{Time and computation penalties for the ranking tradeoff attack for the Argon2 indexing function.}\label{tab:generic3}
+\end{table}
+
+
+\subsection{Attack on iterative compression function}\label{sec:att-iter}
+
+Let us consider the following structure of the compression function $F(X,Y)$, where $X$ and $Y$ are input blocks:
+\begin{itemize}
+  \item The input blocks of size $t$ are divided into shorter subblocks of length $t'$ (for instance, 128 bits) $X_0,X_1,X_2,\ldots$ and $Y_0,Y_1,Y_2,\ldots$.
+  \item The output block $Z$ is computed subblockwise:
+  \begin{align*}
+    Z_0 = G(X_0,Y_0);\\
+    Z_i = G(X_i,Y_i,Z_{i-1}),\;i>0.
+  \end{align*}
+\end{itemize}
+This scheme resembles the duplex authenticated encryption mode, which is secure under certain assumptions on $G$. However, it is totally insecure against tradeoff adversaries, as shown below.
+
+Suppose that an adversary computes $Z = F(X,Y)$ but $Y$ is not stored. Suppose that  $Y$ is a tree function of stored elements of depth $D$.  The adversary starts with computing $Z_0$, which requires only $Y_0$. In turn, $Y_0 = G(X_0', Y_0')$ for some $X',Y'$.
+Therefore, the adversary computes the tree of the same depth $D$, but with the function $G$ instead of $F$. $Z_1$ is then a tree function of depth $D+1$, $Z_2$ of depth $D+2$, etc. In total, the recomputation takes $(D+s)L_G$ time, where $s$ is the number of subblocks and $L_G$ is the latency of $G$. This should be compared to the full-space implementation, which takes time
+$sL_G$. Therefore, if the memory is reduced by the factor $q$, then the time-area product is changed as
+$$
+AT_{new} = \frac{D(q)+s}{sq}AT.
+$$
+Therefore, if
+\begin{equation}\label{att:iter}
+D(q) \leq s(q-1),
+\end{equation}
+the adversary wins.
+
+One may think of using the $Z_{m-1}[l-1]$ as input to computing $Z_0[l]$. Clearly, this changes little in adversary's strategy, who could simply store all $Z_{m-1}$, which is feasible for large $m$. In concrete proposals, $s$ can be 64, 128, 256 and even larger.
+
+We conclude that $F$ with an iterative structure is insecure. We note that this attack applies also to other PHC candidates with iterative compression function.
+
+\begin{comment}
+  \subsection{Multi-pass schemes}
+
+  If the defender has more time than needed to fill the available memory, then he can run several passes on the memory. Also some designers decided to process memory several times
+   to get better time-space tradeoffs. Let us figure out how the adversary's costs are affected in this case.
+
+ Suppose we make $K$ passes with $T$ iterations each following the scheme~\eqref{eq:class}, so that after the first pass any address in the memory may be used. Then this is equivalent
+ to running a single pass with $KT$ iterations such that $\phi(j)\geq j-T$. The time-space tradeoff would be the same as in a single pass with $T$ iterations and additional condition
+ $$
+ \phi(j) \geq j-\frac{T}{K}.
+ $$
+  We have applied the ranking algorithm (Section~\ref{sec:tradeoff}) and obtained the  results in Tables~\ref{tab:generic4},\ref{tab:generic5}. We conclude that for the data-dependent schemes
+  using several passes does  increase the time-area product for the adversary who uses tradeoffs. Indeed, suppose we run a scheme with memory $A$ with one pass for time $T$,
+   or on $A/2$ with 2 passes.
+  If the adversary reduces the memory to $A/6$ GB (i.e. by the factor of 6) for the first case, the time grows by the factor of 8.2, so that the time-area product is $1.35 AT$. However, if in the second setting the memory is reduced to $A/6$ GB (i.e. by the factor of 3), the time grows by the factor of 14.3, so that the time-area product is $2.2AT$. For other reduction factors the ratio between the two products remains around 2.
+
+  Nevertheless, we do not immediately argue for the prevalence of multi-pass schemes, since it can be possible that new tradeoff algorithms change their relative strength.
+
+  \begin{table}[ht]
+\renewcommand{\arraystretch}{1.3}
+$$
+\begin{array}{|c||c|c|c|c|c|}
+\hline
+\text{Memory fraction ($\alpha$) } &\frac{1}{2} &\frac{1}{3} &\frac{1}{4} &\frac{1}{5} &\frac{1}{6}\\
+\hline
+\text{1 pass} & 1.7 & 3 & 6.3 & 16.6 & 55\\
+\text{2 passes} & 15 & 410 & 19300& 2^{20} & 2^{25}\\
+\text{3 passes} &3423& 2^{22}& 2^{32}\\
+\hline
+\end{array}
+$$
+\caption{Computation penalties for the ranking tradeoff attack.}\label{tab:generic4}
+\end{table}
+
+  \begin{table}[ht]
+\renewcommand{\arraystretch}{1.3}
+$$
+\begin{array}{|c||c|c|c|c|c|}
+\hline
+\text{Memory fraction ($\alpha$) } &\frac{1}{2} &\frac{1}{3} &\frac{1}{4}&\frac{1}{5} &\frac{1}{6}\\
+\hline
+\text{1 pass} & 1.7 & 2.5 & 3.8 & 5.7 & 8.2\\
+\text{2 passes} & 5.7 & 14.3 & 28.8 & 49 & 75\\
+\text{3 passes} &20.7& 56& 103 &- & -\\
+\hline
+\end{array}
+$$
+\caption{Time penalties for the ranking tradeoff attack.}\label{tab:generic5}
+\end{table}
+\end{comment}
+
+\subsection{Security of Argon2 to generic attacks}\label{sec:generic}
+
+Now we consider preimage and collision resistance of both versions of \textsf{Argon2}. Variable-length inputs are prepended with their lengths, which shall ensure
+ the absence of equal input strings. Inputs are processed by a cryptographic hash function, so no collisions should occur at this stage.
+ 
+ \paragraph{Internal collision resistance.} The compression function $G$ is not claimed to be collision resistant, so it may happen that distinct inputs produce identical outputs. Recall
+that $G$ works as follows:
+$$
+G(X,Y) = P(Z)\oplus (Z), \quad Z = X\oplus Y.
+$$
+where $P$ is a permutation based on the 2-round Blake2b permutation. Let us prove that all $Z$ are different under certain assumptions.
+\begin{theorem}
+  Let $\Pi$ be \textsf{Argon2d} or \textsf{Argon2i} with $d$ lanes, $s$ slices, and $t$ passes over memory. Assume that
+   \begin{itemize}
+     \item $P(Z)\oplus Z$ is collision-resistant, i.e. it is hard to find $a,b$ such that $P(a)\oplus a = P(b)\oplus b$.
+     \item $P(Z)\oplus Z$ is 4-generalized-birthday-resistant, i.e. it is hard to find distinct $a,b,c,d$ such that $P(a)\oplus P(b)\oplus P(c)\oplus P(d) = a\oplus b\oplus c \oplus d$.
+   \end{itemize}Then all the blocks $B[i]$ generated in those $t$ passes are different.
+\end{theorem}
+\begin{proof}
+    By specification, the value of $Z$ is different for the first two blocks of each segment in the first slice in the first pass. Consider the other blocks.
+
+  Let us enumerate the blocks according to the moment they are computed. Within a slice, where segments can be computed in parallel, we enumerate lane 0 fully first, then lane 1, etc. Slices are then computed and enumerated sequentially.
+  Suppose the proposition is wrong, and  let $(B[x],B[y])$ be a block collision such that $x<y$ and $y$ is the smallest among all such collisions. As $P(Z)\oplus Z$ is collision resistant,
+  the collision occurs in $Z$, i.e.
+  $$
+  Z_x = Z_y.
+  $$
+  Let $r_x, r_y$ be reference block indices for $B[x]$ and $B[y]$, respectively, and let $p_x, p_y$ be previous block indices for $B[x],B[y].$ Then we get
+  $$
+  B[r_x] \oplus B[p_x] = B[r_y] \oplus B[p_y].
+  $$
+  As we assume 4-generalized-birthday-resistance, some arguments are equal. Consider three cases:
+  \begin{itemize}
+    \item $r_x=p_x$. This is forbidden by the rule 3 in Section~\ref{sec:index}.
+    \item $r_x=r_y$.  We get $B[p_x] =  B[p_y]$. As $p_x,p_y <y$, and $y$ is the smallest yielding such a collision, we get $p_x = p_y$. However, by construction $p_x \neq p_y$ for $x\neq y$.
+    \item $r_x = p_y$. Then we get $B[r_y] = B[p_x]$. As $r_y <y$ and $p_x<x<y$, we obtain $r_y = p_x$. Since $p_y=r_x<x<y$, we have two options:
+    \begin{itemize}
+    \item $p_y$ is the last block of a segment. Then $y$ is the first block of a segment in the next slice. Since $r_x$ is the last block of a segment, and $x<y$, $x$ must be in the   same slice as $y$, and $x$ can not be the first block in a segment by the rule 4 in Section~\ref{sec:index}. Therefore, $r_y=p_x = x-1$. However, this is impossible, as $r_y$ can not belong to the same slice as $y$.
+        \item $p_y$ is not the last block of a segment. Then $r_x = p_y = y-1$, which implies that $r_x \geq x$. The latter is forbidden.
+    \end{itemize}
+  \end{itemize}
+  Thus we get a contradiction in all cases. This ends the proof.
+\end{proof}
+
+ The compression function $G$ is not claimed to be collision resistant nor preimage-resistant. However, as the attacker has no control over its input, the collisions are highly unlikely. We only take care that the starting blocks are not identical by producing the first two blocks with a counter and forbidding to reference from the memory the last block as (pseudo)random.
+
+\textsf{Argon2d} does not overwrite the memory, hence it is vulnerable to garbage-collector attacks and similar ones, and is not recommended to use in the setting where these threats are possible. \textsf{Argon2i} with 3 passes overwrites the memory twice, thus thwarting the memory-leak attacks. Even if the entire working memory of \textsf{Argon2i} is leaked after the hash is computed, the adversary would have to compute two passes over the memory to try the password.
+
+
+
+\subsection{Security of Argon2 to tradeoff attacks}
+
+Time and computational penalties for 1-pass \textsf{Argon2d} are given in Table~\ref{tab:generic3}. It suggests that the adversary can reduce memory by the factor of 3 at most
+while keeping the time-area product the same.
+
+\textsf{Argon2i} is more vulnerable to tradeoff attacks due to its data-independent addressing scheme. We applied the ranking algorithm to 3-pass \textsf{Argon2i} to calculate time and computational penalties. We found out that the memory reduction by the factor of 3 already gives the computational penalty of around $2^{14}$.  The $2^{14}$ Blake2b cores would take more area than 1 GB of RAM (Section~\ref{sec:costs}), thus preventing the adversary from further reducing the time-area product. We conclude that the time-area product cost for \textsf{Argon2i} can be reduced by 3 at best.
+
+
+
+
+  \section{Design rationale}
+
+  \textsf{Argon2}\ was designed with the following primary goal: to maximize the cost of exhaustive search on non-x86 architectures, so that the switch even to dedicated ASICs would not give significant advantage over doing the exhaustive search on defender's machine.
+
+
+
+
+ \subsection{Indexing function}  
+
+The basic scheme~\eqref{eq:class} was extended to implement:
+\begin{itemize}
+  \item Tunable parallelism;
+  \item Several passes over memory.
+\end{itemize}
+
+
+
+
+
+  For the data-dependent addressing we set $\phi(l) = g(B[l-1])$, where $g$ simply truncates the block and takes the result modulo $l-1$. We considered
+ taking the address
+  not from the block $B[l-1]$ but from the block $B[l-2]$, which should have allowed to prefetch the block earlier. However, not only the gain in our implementations is limited, but also this benefit can be exploited by the adversary. Indeed, the efficient depth $D(q)$ is
+ now reduced to $D(q)-1$, since the adversary has one extra timeslot. Table~\ref{tab:generic3} implies that then the adversary would be able to reduce the memory by the factor of 5 without increasing the time-area product (which is a 25\% increase in the reduction factor compared to the standard approach).
+
+
+
+ For the data-independent addressing we use a simple PRNG, in particular the compression function $G$ in the counter mode.
+  Due to its long output, one call (or two consecutive calls) would produce hundreds of addresses,
+ thus minimizing the overhead. This approach does not give provable tradeoff bounds, but instead allows
+ the analysis with the  tradeoff algorithms suited for data-dependent addressing.
+
+\paragraph{Motivation for our indexing functions}
+
+Initially, we considered uniform selection of referenced blocks, but then we considered a more generic case:
+$$
+\phi \leftarrow \lceil(2^{64}-(J_1)^\gamma)\cdot |\mathcal{R}_l|/2^{64} \rceil
+$$
+
+We tried to choose the $\gamma$ which would maximize the adversary's costs if he applies the tradeoff based on the ranking method. We also attempted to make the reference block distribution close to uniform, so that each memory block is referenced similar number of times.
+
+For each $1\leq \gamma\leq 5$ with step $0.1$ we applied the ranking method with sliding window and selected the best available tradeoffs. We obtained a set of time penalties $\{D_{\gamma}(\alpha)\}$ and computational penalties $\{C_{\gamma}(\alpha)\}$ for $0.01<\alpha<1$. We also calculated the reference block distribution for all possible $\gamma$. We considered three possible metrics:
+\begin{enumerate}
+\item Minimum time-area product $$AT_{\gamma} = \min_{\alpha}\{\alpha\cdot D_{\gamma}(\alpha)\}.
+$$
+\item Maximum memory reduction which reduces the time-area product compared to the original:
+$$
+\alpha_{\gamma} = \min_{\alpha} \{\alpha\,|\,\alpha\cdot D_{\gamma}(\alpha)<1\}.
+$$
+\item The goodness-of-fit value of the reference block distribution w.r.t. the uniform distribution with $n$ bins:
+$$
+\chi^2 = \sum_i \frac{(p_i-\frac{1}{n})^2}{\frac{1}{n}},
+$$
+where $p_i$ is the average probability of the block from $i$-th bin to be referenced. For example, if $p_3 = 0.2, \,n=10$ and there are 1000 blocks, then blocks from $201$ to $300$ are referenced $1000\cdot 0.2 =200$ times throughout the computation.  
+\end{enumerate}
+We got the following results for $n=10$:
+$$
+\begin{array}{|c|c|c|c|}
+\hline
+\gamma & AT_{\gamma}&\alpha_{\gamma} &\chi^2\\
+\hline
+ 1&  0.78 & 3.95&0.89\\
+  \hline 2 & 0.72 & 3.2& 0.35\\
+ \hline 3 & 0.67 & 3.48&0.2\\
+  \hline 4 & 0.63 & 3.9&0.13\\
+ \hline 5 & 0.59 & 4.38&0.09\\
+ \hline
+\end{array}
+$$
+We conclude that the time-area product achievable by the attacker slowly decreases as $\gamma$ grows. However, the difference between $\gamma=1$ and $\gamma=5$ is only the factor of $1.3$. We also see that the time-area product can be kept below the original up to $q=3.2$ for $\gamma=2$, whereas for $\gamma=4$ and $\gamma=1$ such $q$ is close to $4$.
+To avoid floating-point computations, we restrict to integer $\gamma$. Thus the optimal values are $\gamma=2$ and $\gamma=3$, where the former is slightly better in the first two metrics. 
+
+However, if we consider the reference block uniformity, the situation favors larger $\gamma$ considerably. We see that the $\chi^2$ value is decreased by the factor of $2.5$ when going from $\gamma=1$ to $\gamma=2$, and by the factor of $1.8$ further to $\gamma=3$. In concrete probabilities (see also Figure~\ref{fig:histo}),
+the first 20\% of blocks accumulate 40\% of all reference hits for $\gamma=2$ and 32\% for $\gamma=3$ (23.8\% vs  19.3\% hit for the first 10\% of blocks). 
+
+To summarize, $\gamma=2$ and $\gamma=3$ both are better against one specific attacker and slightly worse against the other. We take $\gamma=2$ as the value that minimizes the AT gain, as we consider this metric more important. 
+
+ \begin{table}[ht]
+\renewcommand{\arraystretch}{1.3}
+$$
+\begin{array}{|c||c|c|c|c|c|}
+\hline
+\text{Memory fraction ($1/q$) } &\frac{1}{2} &\frac{1}{3} &\frac{1}{4}&\frac{1}{5} &\frac{1}{6}\\
+\hline
+\gamma=1 & 1.6 & 2.9 & 7.3 & 16.4 & 59\\
+\gamma=2 & 1.5 & 4 & 20.2 & 344 & 4700\\
+\gamma=3 &1.4& 4.3& 28.1 &1040 & 2^{17}\\
+\hline
+\end{array}
+$$
+\caption{Computational penalties for the ranking tradeoff attack with a sliding window, 1 pass.}\label{tab:comp-alpha}
+\end{table}
+
+ \begin{table}[ht]
+\renewcommand{\arraystretch}{1.3}
+$$
+\begin{array}{|c||c|c|c|c|c|}
+\hline
+\text{Memory fraction ($1/q$) } &\frac{1}{2} &\frac{1}{3} &\frac{1}{4}&\frac{1}{5} &\frac{1}{6}\\
+\hline
+\gamma=1 & 1.6 & 2.5 & 4 & 5.8 & 8.7\\
+\gamma=2 & 1.5 & 2.6 & 5.4 & 10.7 & 17\\
+\gamma=3 &1.3& 2.5& 5.3 &10.1 & 18\\
+\hline
+\end{array}
+$$
+\caption{Depth penalties for the ranking tradeoff attack with a sliding window, 1 pass.}\label{tab:depth-alpha}
+\end{table}
+
+\begin{figure}[hb]
+\begin{center}
+\includegraphics[width=5cm]{pics/power-distribution.jpg}
+\end{center}
+\caption{Access frequency for different memory segments (10\%-buckets) and different exponents (from $\gamma=1$ to $\gamma=5$) in the indexing functions.}\label{fig:histo}
+\end{figure}
+
+
+ \subsection{Implementing parallelism}\label{sec:parall}
+
+As modern CPUs have several cores possibly available  for hashing, it is tempting to use these cores to increase the bandwidth, the amount of filled memory, and the CPU load.
+The cores of the recent Intel CPU share the L3 cache and the entire memory, which both have large latencies (100 cycles and more). Therefore, the inter-processor communication should be minimal to avoid delays.
+
+The simplest way to use  $p$ parallel cores is to compute and XOR $p$ independent calls to $H$:
+$$
+H'(P,S) = H(P,S, 0)\oplus H(P,S,1)\oplus \cdots \oplus H(P,S,p-1).
+$$
+If a single call uses $m$ memory units, then $p$ calls use $pm$ units. However, this method admits a trivial tradeoff: an adversary just makes $p$ sequential calls to $H$ using only $m$ memory in total, which keeps the time-area product constant.
+
+We suggest the following solution for $p$ cores: the entire memory is split  into $p$ lanes of  $l$ equal slices each, which can be viewed as elements of a $(p\times l)$-matrix $Q[i][j]$. Consider the class of
+schemes given by Equation~\eqref{eq:class}. We modify it as follows:
+\begin{itemize}
+  \item $p$ invocations to $H$ run in parallel on the first column $Q[*][0]$ of the memory matrix. Their indexing functions refer to their own slices only;
+  \item For each  column $j>0$, $p$ invocations to $H$ continue to run in parallel, but the indexing functions now may refer not only to their own slice, but also to all $jp$ slices of previous columns $Q[*][0],Q[*][1],\ldots,Q[*][j-1]$.
+  \item The last blocks produced in each slice of the last column are XORed.
+\end{itemize}
+This idea is easily implemented in software with $p$ threads and $l$ joining points. It is easy to see that the adversary can use less memory when computing the last column, for instance
+by computing the slices sequentially and storing only the slice which is currently computed. Then his time is multiplied by $(1+\frac{p-1}{l})$, whereas the memory use is multiplied
+by $(1-\frac{p-1}{pl})$, so the time-area product is modified as
+$$
+AT_{new} = AT \left(1-\frac{p-1}{pl}\right)\left(1+\frac{p-1}{l}\right).
+$$
+For $2 \leq p,l \leq 10$ this value is always between $1.05$ and $3$. We have selected $l=4$ as this value gives low synchronisation overhead while imposing time-area penalties on the adversary who reduces the memory even by the factor 3/4. We note that values $l=8$ or $l=16$ could be chosen.
+
+If the compression function is collision-resistant, then one may easily prove that block collisions are highly unlikely. However, we employ a weaker compression function, for which the following holds:
+$$
+G(X,Y) = F(X\oplus Y),
+$$
+which is invariant to swap of inputs and is not collision-free. We take special care to ensure that the mode of operation does not allow such collisions by introducing additional rule:
+\begin{itemize}
+  \item First block of a segment can not refer to the last block of any segment in the previous slice.
+\end{itemize}
+We prove that block collisions are unlikely under reasonable conditions on $F$ in Section~\ref{sec:generic}.
+
+  \subsection{Compression function design}\label{sec:compression}
+
+  \subsubsection{Overview}
+
+In contrast to attacks on regular hash functions, the adversary does not control inputs to the compression function $G$ in our scheme. Intuitively, this should relax the cryptographic properties required from the compression function and allow for a faster primitive. To avoid being the bottleneck, the compression function ideally should be on par with the performance of memcpy() or similar function, which may run at 0.1 cycle per byte or even faster. This is much faster than ordinary stream ciphers or hash functions, but we might not need strong properties of those primitives.
+
+However, we first have to determine the optimal block size. When we request a block from a random location in the memory, we most likely get a cache miss. The first bytes would arrive at the CPU from RAM within at best 10 ns, which accounts for 30 cycles. In practice, the latency of a single load instruction may reach 100 cycles and more. However, this number can be amortized if we request a large block of sequentially stored bytes. When the first bytes are requested, the CPU stores the next ones in the L1 cache, automatically or using the \texttt{prefetch} instruction. The data from the L1 cache can be loaded as fast as 64 bytes per cycle on the Haswell architecture, though we did not manage to reach this speed in our application.
+
+Therefore, the larger the block is, the higher the throughput is. We have made a series of experiments with a non-cryptographic compression function, which does little beyond simple XOR of its inputs, and achieved the performance of around 0.7 cycles per byte per core with block sizes of 1024 bits and larger.
+
+\subsubsection{Design criteria}
+
+It was demonstrated that a compression function with a large block size may be vulnerable to tradeoff attacks if it has a simple iterative structure, like modes of operation for a blockcipher~\cite{trade-att} (some details in Section~\ref{sec:att-iter}).
+
+Thus we formulate the following design criteria:
+\begin{itemize}
+  \item \emph{The compression function must require about $t$ bits of storage (excluding inputs) to compute any output bit.}
+  \item \emph{Each output byte of $F$ must be a nonlinear function of all input bytes, so that the function has differential probability below certain level, for example $\frac{1}{4}$}.
+\end{itemize}
+These criteria ensure that the attacker is unable to compute an output bit using only a few input bits or a few stored bits. Moreover, the output bits should not be (almost) linear functions of input bits, as otherwise the function tree would collapse.
+
+We have not found any generic design strategy for such large-block compression functions. It is difficult to maintain diffusion on large memory blocks due to the lack of CPU instructions that interleave many registers at once. A naive approach would be to apply a linear transformation with certain branch number. However, even if we operate on 16-byte registers, a 1024-byte block would consist of 64 elements. A $64\times 64$-matrix would require 32 XORs per register to implement, which gives a penalty about 2 cycles per byte.
+
+Instead, we propose to build the compression function on the top of a transformation $P$ that already mixes several registers. We apply $P$ in parallel (having a P-box), then shuffle the output registers and apply it again. If $P$ handles $p$ registers, then the compression function may transform a block of $p^2$ registers with 2 rounds of P-boxes. We do not have to manually shuffle the data, we just change the inputs to P-boxes. As an example, an implementation of the Blake2b~\cite{AumassonNWW13} permutation processes 8 128-bit registers, so with 2 rounds of Blake2b we can design
+a compression function that mixes the 8192-bit block. We stress that this approach is not possible with dedicated AES instructions. Even though they are very fast, they apply only to the 128-bit block, and we still have to diffuse its content across other blocks.
+
+
+
+\subsection{User-controlled parameters}
+
+We have made a number of design choices, which we consider optimal for a wide range of applications. Some parameters can be altered, some should be kept as is. We give a user full control over:
+\begin{itemize}
+  \item Amount $M$ of memory filled by algorithm. This value, evidently, depends on the application and the environment. There is no ``insecure'' value for this parameter, though clearly the more memory the better.
+  \item Number $T$ of passes over the memory. The running  time depends linearly on this parameter. We expect that the user chooses this number according to the time constraints on the application. Again, there is no ``insecure'' value for $T$.
+  \item Degree $d$ of parallelism. This number determines the number of threads used by an optimized implementation of \textsf{Argon2}. We expect that the user is restricted by a number of CPU cores (or half-cores) that can be devoted to the hash function, and chooses $d$ accordingly (double the number of cores).
+  \item Length of password/message, salt/nonce, and tag (except for some low, insecure values for salt and tag lengths).
+\end{itemize}
+
+We allow to choose another compression function $G$, hash function $H$, block size $b$, and number of slices $l$. However, we do not provide this flexibility in a reference implementation as we guess that
+the vast majority of the users would prefer as few parameters as possible.
+
+
+
+\section{Performance}
+
+\subsection{x86 architecture}
+To optimize the data load and store from/to memory, the memory that will be processed has to be aligned on 16-byte boundary when loaded/stored into/from 128-bit registers and on 32-byte boundary when loaded/stored into/from 256-bit registers. If the memory is not aligned on the specified boundaries, then each memory operation may take one extra CPU cycle, which may cause consistent penalties for many memory accesses.
+
+
+The results presented are obtained using the \texttt{gcc 4.8.2} compiler  with the following options: \texttt{-m64 -mavx -std=c++11 -pthread -O3}.
+The cycle count value was measured using the \texttt{\_\_rdtscp} Intel intrinsics C function which inlines the \texttt{RDTSCP} assembly instruction that returns the 64-bit Time Stamp Counter (TSC) value. The instruction waits for the previous instructions to finish and then is executed, but meanwhile the next instructions may begin before the value is read. Despite this shortcoming, we used this method because it is the most reliable handy method to measure the execution time and also it is widely used in other cryptographic operations benchmarking.
+
+\begin{table}
+\begin{center}
+\begin{tabular}{|cc||cc|cc|}
+\hline
+& & \multicolumn{2}{c|}{\textsf{Argon2d} (1 pass)} & \multicolumn{2}{|c|}{\textsf{Argon2i} (3 passes)}  \\
+\cline{3-6}
+Processor & Threads & Cycles/Byte & Bandwidth & Cycles/Byte & Bandwidth \\
+& & & (GB/s) & & (GB/s)\\
+\hline
+ i7-4500U & 1 &1.6 & 2.2 & 4.7 & 2.6 \\
+\hline
+ i7-4500U & 2 &1.0& 3.6&2.8 & 4.5\\
+\hline
+ i7-4500U & 4 &0.7 & 5.1 & 2 & 5.4 \\
+\hline
+ i7-4500U & 8 & 0.7 & 5.1 & 1.9 & 5.8\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Speed and memory bandwidth of Argon2(d/i) measured on 1 GB memory filled. Core i7-4500U --- Intel Haswell 1.8 GHz, 4 cores}
+\label{table:cycle_per_byte_results}
+\end{table}
+
+\section{Applications}
+\textsf{Argon2d} is optimized for settings where the adversary does not get regular access to system memory or CPU, i.e. he can not run side-channel attacks based on the timing information, nor can he
+recover the password much faster using garbage collection~\cite{cryptoeprint:2014:881}. These settings are more typical for backend servers and cryptocurrency mining. In practice we suggest the following settings:
+\begin{itemize}
+  \item Cryptocurrency mining, that takes 0.1 seconds on a 2 GHz CPU using 1 core --- \textsf{Argon2d} with 2 lanes and 250 MB of RAM;
+  \item Backend server authentication, that takes 0.5 seconds on a 2 GHz CPU using 4 cores --- \textsf{Argon2d} with 8 lanes and 4 GB of RAM.
+\end{itemize}
+
+\textsf{Argon2i} is optimized for more dangerous settings, where the adversary possibly can access the same machine, use its CPU or mount cold-boot attacks. We use three passes to get rid entirely of the password in the memory. We suggest the following settings:
+\begin{itemize}
+  \item Key derivation for hard-drive encryption, that takes 3 seconds on a 2 GHz CPU using 2 cores --- \textsf{Argon2i} with 4 lanes and 6 GB of RAM;
+  \item Frontend server authentication, that takes 0.5 seconds on a 2 GHz CPU using 2 cores --- \textsf{Argon2i} with 4 lanes and 1 GB of RAM.
+\end{itemize}
+
+
+
+
+\section{Conclusion}
+
+We presented  the memory-hard function \textsf{Argon2}, which maximizes the ASIC implementation costs for given CPU computing time. We aimed to make the design clear and compact, so that any feature and operation has certain rationale. The clarity and brevity of the Argon2 design has been confirmed by its eventual selection as the PHC winner.
+
+Further development of tradeoff attacks with dedication to \textsf{Argon2} is the subject of future work. It also remains to be seen how \textsf{Argon2} withstands GPU cracking with low memory requirements.
+
+
+
+\bibliographystyle{IEEEtranS}
+\bibliography{tradeoff}
+
+\appendix 
+
+
+
+\section{Permutation $\mathcal{P}$}\label{sec:blakeround}
+
+Permutation $\mathcal{P}$  is based on the round function of Blake2b and works as follows. Its 8 16-byte inputs $S_0, S_1,\ldots, S_7$ are viewed as a $4\times 4$-matrix of 64-bit words, where $S_i = (v_{2i+1}||v_{2i})$:
+$$
+\begin{pmatrix}
+  v_0 & v_1 & v_2 & v_3\\
+    v_4 & v_5 & v_6 & v_7\\
+      v_8 & v_9 & v_{10} & v_{11}\\
+        v_{12} & v_{13} & v_{14} & v_{15}\\
+\end{pmatrix}
+$$
+Then we do
+\begin{eqnarray*}
+G(v_0, v_4, v_8, v_{12})\quad G(v_1, v_5, v_9, v_{13}) \\ G(v_2, v_6, v_{10}, v_{14}) \quad G(v_3, v_7, v_{11}, v_{15})\\
+G(v_0, v_5, v_{10}, v_{15})\quad G(v_1, v_6, v_{11}, v_{12}) \\ G(v_2, v_7, v_{8}, v_{13}) \quad G(v_3, v_4, v_{9}, v_{14}),
+\end{eqnarray*}
+where $G$  applies to $(a,b,c,d)$ as follows:
+\begin{equation}\label{eq:blake-orig}
+\begin{aligned}
+  a &\leftarrow a + b+ 2*a_L*b_L;\\
+  d &\leftarrow (d\oplus a)\ggg 32;\\
+  c &\leftarrow c + d+ 2*c_L*d_L;\\
+  b &\leftarrow (b\oplus c)\ggg 24;\\
+    a &\leftarrow a + b+ 2*a_L*b_L;\\
+  d &\leftarrow (d\oplus a)\ggg 16;\\
+  c &\leftarrow c + d+ 2*c_L*d_L;\\
+  b &\leftarrow (b\oplus c)\ggg 63;\\
+\end{aligned}
+\end{equation}
+Here $+$ are additions modulo $2^{64}$ and $\ggg$ are 64-bit rotations to the right. $x_L$ is the 64-bit integer $x$ truncated to the 32 least significant bits. The modular additions in $G$ are combined with 64-bit multiplications (that is the only difference to the original Blake2 design). 
+
+
+Our motivation in adding multiplications is to increase the circuit depth (and thus the running time) of a potential ASIC implementation while having roughly the same running time on CPU thanks to parallelism and pipelining. Extra multiplications in the scheme serve well, as the best addition-based circuits for multiplication have latency about 4-5 times the addition latency for 32-bit multiplication (or roughly $\log n$ for $n$-bit multiplication).
+
+As a result, any output 64-bit word of  $\mathcal{P}$  is implemented by a chain of additions, multiplications, XORs, and rotations. The shortest possible chain for the 1 KB-block (e.g., from $v_0$  to $v_0$) consists of 12 MULs, 12 XORs, and 12 rotations.
+
+\section{Additional functionality}
+
+The following functionality is enabled in the reference implementation but is 
+not officially included in version 1.2.1:
+\begin{itemize}
+\item Hybrid construction \textsf{Argon2id}, which has type $y=2$ (used in the pre-hashing and address generation). In the first two slices of the first pass it generates reference addresses data-independently as in \textsf{Argon2i}, whereas in later slices and next passes it generates them data-dependently as in \textsf{Argon2d}.
+\item Sbox-hardened version \textsf{Argon2ds}, which has type $y=4$. In this version the compression function $G$ includes the 64-bit transformation $\mathcal{T}$, which is a chain of S-boxes, multiplications, and additions. In terms of Section~\ref{sec:compr}, we additionally compute 
+\begin{align*}
+W&= LSB_{64}(R_0\oplus R_{63});\\
+Z_0 &+= \mathcal{T}(W);\\
+Z_{63}&+=\mathcal{T}(W)\ll 32.
+\end{align*}
+The transformation $\mathcal{T}$, on the 64-bit word $W$  is defined as follows:
+\begin{itemize}
+\item Repeat 96 times:
+\begin{enumerate}
+\item $y\leftarrow S[W[8:0]]$;
+\item $z\leftarrow S[512+W[40:32]]$;
+\item $W \leftarrow ((W[31:0]\circ W[63:32])+y)\oplus z$.
+\end{enumerate}
+\item $\mathcal{T}(W)\leftarrow W$.
+\end{itemize}
+All the operations are performed modulo $2^{64}$. $\circ$ is the 64-bit multiplication, $S[]$ is the Sbox (lookup table) that maps 10-bit indices to 64-bit values. $W[i:j]$ is the subset of bits of $W$ from $i$ to $j$ inclusive. 
+
+The S-box is generated in the start of every  pass in the following procedure. In total we specify $2^{10}\cdot 8$ bytes, or 8 KBytes. We take block $B[0][0]$ and apply $F$ (the core of $G$) to it 16 times. After each two iterations we use the entire 1024-byte value and initialize 128 lookup values.
+
+The properties of $\mathcal{T}$ and its initialization procedure are subject to change.
+\end{itemize}
+
+\section{Change log}
+
+\subsection{v1.2.1 -- 8th September, 2015}
+\begin{itemize}
+\item The total number of blocks can reach $2^{32}-1$;
+\item The reference block index now requires 64 bits; the lane number is computed separately.
+\item New modes \textsf{Argon2id} and \textsf{Argon2ds} are added as optional.
+\end{itemize}
+The specification of v1.2.1 released on 26th August, 2015, had an incorrect description of the first block generation. The version released on 2nd September, 2015, had an incorrect description of the counter used in generating addresses for \textsf{Argon2i}.
+
+\subsection{v1.2  -- 21st June, 2015}
+
+Non-uniform  indexing rule, the compression function gets multiplications.
+
+\subsection{v1.1  -- 6th February, 2015}
+\begin{itemize}
+\item New indexing rule added to avoid collision with a proof. 
+\item New rule to generate first two blocks in each lane.
+\item Non-zero constant added to the input block used to generate addresses in \textsf{Argon2i}.
+\end{itemize}
+
+\end{document}
+
--- a/doc/LaTeX/pics/argon2-par.pdf
+++ b/doc/LaTeX/pics/argon2-par.pdf
--- a/doc/LaTeX/pics/compression.pdf
+++ b/doc/LaTeX/pics/compression.pdf
--- a/doc/LaTeX/pics/generic.pdf
+++ b/doc/LaTeX/pics/generic.pdf
--- a/doc/LaTeX/pics/instant.pdf
+++ b/doc/LaTeX/pics/instant.pdf
--- a/doc/LaTeX/pics/over-v2.pdf
+++ b/doc/LaTeX/pics/over-v2.pdf
--- a/doc/LaTeX/pics/parallel.pdf
+++ b/doc/LaTeX/pics/parallel.pdf
--- a/doc/LaTeX/pics/parallel2.pdf
+++ b/doc/LaTeX/pics/parallel2.pdf
--- a/doc/LaTeX/pics/power-distribution.jpg
+++ b/doc/LaTeX/pics/power-distribution.jpg
--- a/doc/LaTeX/pics/precomp.pdf
+++ b/doc/LaTeX/pics/precomp.pdf
--- a/doc/LaTeX/tradeoff.bib
+++ b/doc/LaTeX/tradeoff.bib
@ -0,0 +1,803 @@
+@article{hellman1980cryptanalytic,
+  title={A cryptanalytic time-memory trade-off},
+  author={Hellman, Martin E},
+  journal={Information Theory, IEEE Transactions on},
+  volume={26},
+  number={4},
+  pages={401--406},
+  year={1980},
+  publisher={IEEE}
+}
+
+
+@inproceedings{DworkN92,
+  author    = {Cynthia Dwork and
+               Moni Naor},
+  title     = {Pricing via Processing or Combatting Junk Mail},
+  booktitle = {CRYPTO'92},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {740},
+  pages     = {139--147},
+  publisher = {Springer},
+  year      = {1992},
+  timestamp = {Fri, 18 Sep 2009 10:18:29 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/crypto/DworkN92},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@article{Sudan97,
+  author    = {Madhu Sudan},
+  title     = {Decoding of {Reed Solomon} Codes beyond the Error-Correction Bound},
+  journal   = {J. Complexity},
+  volume    = {13},
+  number    = {1},
+  pages     = {180--193},
+  year      = {1997},
+  url       = {http://dx.doi.org/10.1006/jcom.1997.0439},
+  doi       = {10.1006/jcom.1997.0439},
+  timestamp = {Thu, 10 Nov 2005 11:26:57 +0100},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/jc/Sudan97},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@article{OorschotW99,
+  author    = {Paul C. van Oorschot and
+               Michael J. Wiener},
+  title     = {Parallel Collision Search with Cryptanalytic Applications},
+  journal   = {J. Cryptology},
+  volume    = {12},
+  number    = {1},
+  pages     = {1--28},
+  year      = {1999},
+  url       = {http://dx.doi.org/10.1007/PL00003816},
+  doi       = {10.1007/PL00003816},
+  timestamp = {Tue, 24 May 2011 14:18:06 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/joc/OorschotW99},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@inproceedings{JakobssonJ99,
+  author    = {Markus Jakobsson and
+               Ari Juels},
+  editor    = {Bart Preneel},
+  title     = {Proofs of Work and Bread Pudding Protocols},
+  booktitle = {Secure Information Networks: Communications and Multimedia Security,
+               {IFIP} {TC6/TC11} Joint Working Conference on Communications and Multimedia
+               Security {(CMS} '99), September 20-21, 1999, Leuven, Belgium},
+  series    = {{IFIP} Conference Proceedings},
+  volume    = {152},
+  pages     = {258--272},
+  publisher = {Kluwer},
+  year      = {1999},
+  timestamp = {Mon, 14 Oct 2002 12:00:15 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/cms/JakobssonJ99},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+
+
+@MANUAL{FIPS-197,
+TITLE = {{FIPS}-197: {Advanced Encryption Standard}},
+organization = {{N}ational {I}nstitute of {S}tandards and {T}echnology ({NIST}), available at \url{http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf}},
+month = {November},
+year = {2001},
+}
+
+
+% (duplicate FIPS-197 entry removed here; the identical entry directly above is kept)
+
+@BOOK{DR02,
+AUTHOR = {Joan Daemen and Vincent Rijmen},
+TITLE = {The Design of {Rijndael}. {AES}~--- the {Advanced Encryption Standard}},
+PUBLISHER = {Springer},
+YEAR = {2002}
+}
+
+
+@misc{back2002hashcash,
+  title={Hashcash -- a denial of service counter-measure},
+  author={Back, Adam},
+  year={2002},
+  note ={available at \url{http://www.hashcash.org/papers/hashcash.pdf}}
+}
+
+
+@inproceedings{DworkGN03,
+  author    = {Cynthia Dwork and
+               Andrew Goldberg and
+               Moni Naor},
+  title     = {On Memory-Bound Functions for Fighting Spam},
+  booktitle = {CRYPTO'03},
+  year      = {2003},
+  pages     = {426--444},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {2729},
+  publisher = {Springer}
+}
+
+
+@MANUAL{sha3,
+  title =        {NIST: SHA-3 competition},
+  year =         {2007},
+  note =         {\url{http://csrc.nist.gov/groups/ST/hash/sha-3/index.html}}
+}
+
+@book{robshaw2008new,
+  title={New stream cipher designs: the eSTREAM finalists},
+  author={Robshaw, Matthew and Billet, Olivier},
+  volume={4986},
+  year={2008},
+  publisher={Springer}
+}
+
+
+
+@article{percival2009stronger,
+  title={Stronger key derivation via sequential memory-hard functions},
+  author={Percival, Colin},
+  note={\url{http://www.tarsnap.com/scrypt/scrypt.pdf}},
+  year={2009}
+}
+
+
+@MANUAL{litecoin,
+  title =        {Litecoin - Open source P2P digital currency},
+  year =         {2011},
+  note =         {\url{https://litecoin.org/}}
+}
+
+
+
+@MANUAL{ietf-scrypt,
+  title =        {IETF Draft: The scrypt Password-Based Key Derivation Function},
+  year =         {2012},
+  note =         {\url{https://tools.ietf.org/html/draft-josefsson-scrypt-kdf-02}}
+}
+
+
+@MISC{story,
+ year=2012,
+  title =        {Password security: past, present, future},
+  note =         {\url{http://www.openwall.com/presentations/Passwords12-The-Future-Of-Hashing/}}
+}
+  @article{DziembowskiFKP13,
+  author    = {Stefan Dziembowski and
+               Sebastian Faust and
+               Vladimir Kolmogorov and
+               Krzysztof Pietrzak},
+  title     = {Proofs of Space},
+  journal   = {IACR Cryptology ePrint Archive 2013/796},
+  note = {to appear at Crypto'15}
+}
+
+@MISC{momentum,
+  year = {2013},
+  title =        {Momentum: a memory-hard proof-of-work},
+  note =         {\url{http://www.hashcash.org/papers/momentum.pdf}}
+}
+
+
+@MISC{ebay,
+  year = {2014},
+  title =        {{eBay} hacked, requests all users change passwords},
+  note =         {\url{http://www.cnet.com/news/ebay-hacked-requests-all-users-change-passwords/}}
+}
+
+@TECHREPORT{yescrypt,
+  author =       {Alexander Peslyak },
+  title =        {Yescrypt - a Password Hashing Competition submission},
+  year =         {2014},
+  note =         {available at \url{https://password-hashing.net/submissions/specs/yescrypt-v0.pdf}}
+}
+
+
+
+
+
+  
+@MISC{bitasic,
+  title =        {Avalon ASIC's 40nm Chip to Bring Hashing Boost for Less Power},
+  year = {2014},
+  note =         {\url{http://www.coindesk.com/avalon-asics-40nm-chip-bring-hashing-boost-less-power/}}
+}
+
+@MISC{comp,
+  title =        {{Password Hashing Competition}},
+  year = 2015,
+  note =         {\url{https://password-hashing.net/}}
+}
+
+
+
+
+@MANUAL{vertcoin,
+  title =        {Vertcoin: Lyra2RE reference guide},
+  year =         {2014},
+  note =         {\url{https://vertcoin.org/downloads/Vertcoin_Lyra2RE_Paper_11292014.pdf}}
+}
+
+
+
+@MANUAL{FIPS-180-4,
+TITLE = {{FIPS}-180-4: {Secure Hash Standard}},
+organization = {{N}ational {I}nstitute of {S}tandards and {T}echnology ({NIST})},
+note={available at \url{http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf}},
+month = {March},
+year = {2012},
+}
+
+
+
+
+
+@article{gueronaes,
+  title={AES-GCM software performance on the current high end CPUs as a performance baseline for CAESAR competition},
+  author={Gueron, Shay},
+  year={2013},
+  note={\url{http://2013.diac.cr.yp.to/slides/gueron.pdf}}
+}
+
+
+@inproceedings{norwayTrade,
+  author    = {Donghoon Chang and Arpan Jati and Sweta Mishra and Somitra Kumar Sanadhya},
+  title     = {Time Memory Tradeoff Analysis of Graphs in Password
+Hashing Constructions},
+  booktitle = {Preproceedings of PASSWORDS'14},
+  year      = {2014},
+  pages     = {256-266},
+  note={available at \url{http://passwords14.item.ntnu.no/Preproceedings_Passwords14.pdf}}
+}
+
+
+@inproceedings{BogdanovKLTVV11,
+  author    = {Andrey Bogdanov and
+               Miroslav Knezevic and
+               Gregor Leander and
+               Deniz Toz and
+               Kerem Varici and
+               Ingrid Verbauwhede},
+  title     = {Spongent: A Lightweight Hash Function},
+  booktitle = {CHES'11},
+  year      = {2011},
+  pages     = {312-325},
+  publisher = {Springer},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {6917}
+}
+
+@misc{cryptoeprint:2014:881,
+    author = {Christian Forler and Eik List and Stefan Lucks and Jakob Wenzel},
+    title = {Overview of the Candidates for the Password Hashing Competition - And Their Resistance Against Garbage-Collector Attacks},
+    howpublished = {Cryptology ePrint Archive, Report 2014/881},
+    year = {2014},
+    note = {\url{http://eprint.iacr.org/}},
+}
+
+@TECHREPORT{Daemen13,
+  author =       {Joan Daemen},
+  title =        {Permutation-based symmetric cryptography
+and
+{Keccak}},
+  institution =  {Ecrypt II, Crypto for 2020 Invited Talk},
+  year =         {2013},
+  note={\url{https://www.cosic.esat.kuleuven.be/ecrypt/cryptofor2020/slides/KeccakEcryptTenerife.pdf}}
+}
+
+@inproceedings{AumassonHMN10,
+  author    = {Jean-Philippe Aumasson and
+               Luca Henzen and
+               Willi Meier and
+               Mar\'{\i}a Naya-Plasencia},
+  title     = {Quark: A Lightweight Hash},
+  booktitle = {CHES'10},
+  year      = {2010},
+  pages     = {1-15},
+  publisher = {Springer},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {6225},
+  note= {\url{https://131002.net/quark/quark_full.pdf}}
+}
+
+@inproceedings{knudsen1998analysis,
+  title={Analysis methods for (alleged) {RC4}},
+  author={Knudsen, Lars R and Meier, Willi and Preneel, Bart and Rijmen, Vincent and Verdoolaege, Sven},
+  booktitle={Advances in Cryptology—ASIACRYPT’98},
+  pages={327--341},
+  year={1998},
+  organization={Springer}
+}
+
+
+@report{Keccak-ref,
+  author    = {Guido Bertoni and
+               Joan Daemen and
+               Michael Peeters and
+               Gilles Van Assche},
+  title     = {The {Keccak} reference, version 3.0},
+  year      = {2011},
+  note = {\url{http://keccak.noekeon.org/Keccak-reference-3.0.pdf}}
+}
+
+
+
+@inproceedings{DworkNW05,
+  author    = {Cynthia Dwork and
+               Moni Naor and
+               Hoeteck Wee},
+  title     = {Pebbling and Proofs of Work},
+  booktitle = {{CRYPTO}'05},
+  year      = {2005},
+  pages     = {37--54},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {3621},
+  publisher = {Springer}
+}
+
+@inproceedings{FiatS86,
+  author    = {Amos Fiat and
+               Adi Shamir},
+  editor    = {Andrew M. Odlyzko},
+  title     = {How to Prove Yourself: Practical Solutions to Identification and Signature
+               Problems},
+  booktitle = {Advances in Cryptology - {CRYPTO} '86, Santa Barbara, California,
+               USA, 1986, Proceedings},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {263},
+  pages     = {186--194},
+  publisher = {Springer},
+  year      = {1986},
+  url       = {http://dx.doi.org/10.1007/3-540-47721-7_12},
+  doi       = {10.1007/3-540-47721-7_12},
+  timestamp = {Fri, 18 Sep 2009 08:01:49 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/crypto/FiatS86},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+
+@article{DziembowskiFKP13,
+  author    = {Stefan Dziembowski and
+               Sebastian Faust and
+               Vladimir Kolmogorov and
+               Krzysztof Pietrzak},
+  title     = {Proofs of Space},
+  journal   = {IACR Cryptology ePrint Archive 2013/796}
+}
+
+@article{HopcroftPV77,
+  author    = {John E. Hopcroft and
+               Wolfgang J. Paul and
+               Leslie G. Valiant},
+  title     = {On Time Versus Space},
+  journal   = {J. ACM},
+  volume    = {24},
+  number    = {2},
+  year      = {1977},
+  pages     = {332-337},
+  ee        = {http://doi.acm.org/10.1145/322003.322015},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@article{PaulTC77,
+  author    = {Wolfgang J. Paul and
+               Robert Endre Tarjan and
+               James R. Celoni},
+  title     = {Space Bounds for a Game on Graphs},
+  journal   = {Mathematical Systems Theory},
+  volume    = {10},
+  year      = {1977},
+  pages     = {239-251},
+  ee        = {http://dx.doi.org/10.1007/BF01683275},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@article{LengauerT82,
+  author    = {Thomas Lengauer and
+               Robert Endre Tarjan},
+  title     = {Asymptotically tight bounds on time-space trade-offs in
+               a pebble game},
+  journal   = {J. ACM},
+  volume    = {29},
+  number    = {4},
+  year      = {1982},
+  pages     = {1087-1130},
+  ee        = {http://doi.acm.org/10.1145/322344.322354},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@article{AlwenS14,
+  author    = {Jo{\"e}l Alwen and
+               Vladimir Serbinenko},
+  title     = {High Parallel Complexity Graphs and Memory-Hard Functions},
+  journal   = {IACR Cryptology ePrint Archive 2014/238}
+}
+
+@TECHREPORT{Bernstein05,
+  author =       {Daniel J. Bernstein},
+  title =        {Cache-timing
+attacks
+on
+AES},
+  year =         {2005},
+  note =         {\url{http://cr.yp.to/antiforgery/cachetiming-20050414.pdf}}
+}
+
+@TECHREPORT{trade-att,
+  author =       {Alex Biryukov and Dmitry Khovratovich},
+  title =        {Tradeoff Cryptanalysis of Memory-Hard Functions},
+  year =         {2015},
+  note =         {\url{https://orbilu.uni.lu/handle/10993/20043}}
+}
+
+@TECHREPORT{Argon2,
+  author =       {Alex Biryukov and Daniel Dinu and Dmitry Khovratovich},
+  title =        {Argon2},
+  year =         {2015},
+  note =         {\url{https://www.cryptolux.org/images/0/0d/Argon2.pdf}}
+}
+
+@MISC{BSTY,
+  title =        {GlobalBoost announces a yescrypt-based cryptocurrency},
+  note =         {\url{https://bitcointalk.org/index.php?topic=775289.0}}
+}
+
+@article{ForlerLW13,
+  author    = {Christian Forler and
+               Stefan Lucks and
+               Jakob Wenzel},
+  title     = {Catena: A Memory-Consuming Password Scrambler},
+  journal   = {IACR Cryptology ePrint Archive, Report 2013/525},
+  note = {non-tweaked version \url{http://eprint.iacr.org/2013/525/20140105:194859}}
+}
+
+@misc{broz15,
+year = 2015,
+author = {Milan Broz},
+title = {PHC benchmarks},
+note = {\url{https://github.com/mbroz/PHCtest/blob/master/output/phc_round2.pdf}}
+}
+
+ @inproceedings{ForlerLW14,
+  author    = {Christian Forler and
+               Stefan Lucks and
+               Jakob Wenzel},
+  title     = {Memory-Demanding Password Scrambling},
+  booktitle = {{ASIACRYPT}'14},
+   series    = {Lecture Notes in Computer Science},
+  volume    = {8874},
+  pages     = {289--305},
+  publisher = {Springer},
+  year      = {2014},
+  note = {tweaked version of \cite{ForlerLW13}}
+}
+
+@article{ParkPAFG15,
+  author    = {Sunoo Park and
+               Krzysztof Pietrzak and
+               Jo{\"{e}}l Alwen and
+               Georg Fuchsbauer and
+               Peter Gazi},
+  title     = {Spacecoin: {A} Cryptocurrency Based on Proofs of Space},
+  journal   = {{IACR} Cryptology ePrint Archive},
+  volume    = {2015},
+  pages     = {528},
+  year      = {2015},
+  url       = {http://eprint.iacr.org/2015/528},
+  timestamp = {Fri, 26 Jun 2015 09:49:58 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/iacr/ParkPAFG15},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+
+
+
+
+@inproceedings{BiryukovS01,
+author = {Alex Biryukov and
+Adi Shamir},
+title = {Structural Cryptanalysis of {SASAS}},
+booktitle = {EUROCRYPT'01},
+year = {2001}
+}
+
+@inproceedings{RistenpartTSS09,
+  author    = {Thomas Ristenpart and
+               Eran Tromer and
+               Hovav Shacham and
+               Stefan Savage},
+  title     = {Hey, you, get off of my cloud: exploring information leakage in third-party
+               compute clouds},
+  booktitle = {Proceedings of the 2009 {ACM} Conference on Computer and Communications
+               Security, {CCS} 2009, Chicago, Illinois, USA, November 9-13, 2009},
+  year      = {2009},
+  pages     = {199--212},
+  crossref  = {DBLP:conf/ccs/2009},
+  url       = {http://doi.acm.org/10.1145/1653662.1653687},
+  doi       = {10.1145/1653662.1653687},
+  timestamp = {Tue, 09 Sep 2014 14:55:39 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/ccs/RistenpartTSS09},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@MISC{bitcoin,
+  title =        {Bitcoin: Mining hardware comparison},
+  year={2014},
+  note =         {available at \url{https://en.bitcoin.it/wiki/Mining_hardware_comparison}. We compare $2^{32}$ hashes per joule on the best ASICs with $2^{17}$ hashes per joule on the most efficient x86-laptops.}
+}
+
+
+@MISC{litecoin-comp,
+  title =        {Litecoin: Mining hardware comparison},
+  note =         {\url{https://litecoin.info/Mining_hardware_comparison}}
+}
+
+
+@article{AbadiBMW05,
+  author    = {Mart{\'{\i}}n Abadi and
+               Michael Burrows and
+               Mark S. Manasse and
+               Ted Wobber},
+  title     = {Moderately hard, memory-bound functions},
+  journal   = {{ACM} Trans. Internet Techn.},
+  year      = {2005},
+  volume    = {5},
+  number    = {2},
+  pages     = {299--327},
+  url       = {http://doi.acm.org/10.1145/1064340.1064341},
+  doi       = {10.1145/1064340.1064341},
+  timestamp = {Tue, 09 Sep 2014 16:27:47 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/toit/AbadiBMW05},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@article{Pippenger77,
+  author    = {Nicholas Pippenger},
+  title     = {Superconcentrators},
+  journal   = {{SIAM} J. Comput.},
+  year      = {1977},
+  volume    = {6},
+  number    = {2},
+  pages     = {298--304},
+  url       = {http://dx.doi.org/10.1137/0206022},
+  doi       = {10.1137/0206022},
+  timestamp = {Tue, 09 Sep 2014 16:52:40 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/siamcomp/Pippenger77},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@TECHREPORT{lyra,
+  author =       {Marcos A. Simplicio Jr and Leonardo C. Almeida and Ewerton R. Andrade and Paulo C. F. dos Santos and Paulo S. L. M. Barreto},
+  title =        {The {Lyra2} reference guide, version 2.3.2},
+  year =         {2014},
+  month = {april},
+  note =         {available at  \url{http://lyra-kdf.net/Lyra2ReferenceGuide_v1.pdf}},
+}
+
+
+@inproceedings{Thompson79,
+  author    = {Clark D. Thompson},
+  title     = {Area-Time Complexity for {VLSI}},
+  booktitle = {STOC'79},
+  year      = {1979},
+  pages     = {81--88},
+  publisher = {{ACM}}
+}
+
+@TECHREPORT{pomelo,
+  author =       {Hongjun Wu},
+  title =        {{POMELO}:
+A Password Hashing Algorithm},
+  year =         {2014},
+  note =         {available at \url{https://password-hashing.net/submissions/specs/POMELO-v1.pdf}},
+}
+
+
+% (duplicate knudsen1998analysis entry removed here; the identical entry earlier in this file is kept)
+
+@MISC{fpga,
+  title =        {Energy-efficient bcrypt cracking},
+author={Katja Malvoni},
+  note =         {Passwords'14 conference, available at \url{http://www.openwall.com/presentations/Passwords14-Energy-Efficient-Cracking/}}
+}
+
+
+@MISC{ripper,
+  title =        {Software tool: {John the Ripper} password cracker},
+  note =         {\url{http://www.openwall.com/john/}}
+}
+
+@MISC{sharcs,
+title = {{SHARCS} -- Special-purpose Hardware for Attacking Cryptographic Systems},
+note = {\url{http://www.sharcs.org/}}
+}
+
+@article{Wiener04,
+  author    = {Michael J. Wiener},
+  title     = {The Full Cost of Cryptanalytic Attacks},
+  journal   = {J. Cryptology},
+  year      = {2004},
+  volume    = {17},
+  number    = {2},
+  pages     = {105--124},
+  url       = {http://dx.doi.org/10.1007/s00145-003-0213-5},
+  doi       = {10.1007/s00145-003-0213-5},
+  timestamp = {Sat, 27 Sep 2014 18:00:09 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/joc/Wiener04},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+
+
+
+
+@inproceedings{MukhopadhyayS06,
+  author    = {Sourav Mukhopadhyay and
+               Palash Sarkar},
+  title     = {On the Effectiveness of {TMTO} and Exhaustive Search Attacks},
+  booktitle = {{IWSEC} 2006},
+  year      = {2006},
+  pages     = {337--352},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {4266},
+  publisher = {Springer}
+}
+
+
+
+@inproceedings{SprengerB12,
+  author    = {Martijn Sprengers and Lejla Batina},
+  title     = {Speeding up {GPU-based} password cracking},
+  booktitle   = {SHARCS'12},
+  year      = {2012},
+  note = {available at \url{http://2012.sharcs.org/record.pdf}}
+}
+
+@article{nakamoto2012bitcoin,
+  title={Bitcoin: A peer-to-peer electronic cash system},
+  author={Nakamoto, Satoshi},
+  note={\url{http://www.bitcoin.org/bitcoin.pdf}},
+  year={2009}
+}
+
+
+
+@inproceedings{BernsteinL13,
+  author    = {Daniel J. Bernstein and
+               Tanja Lange},
+  title     = {Non-uniform Cracks in the Concrete: The Power of Free Precomputation},
+  booktitle = {ASIACRYPT'13},
+  year      = {2013},
+  pages     = {321--340},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {8270},
+  publisher = {Springer}
+}
+
+
+
+@inproceedings{AumassonNWW13,
+  author    = {Jean{-}Philippe Aumasson and
+               Samuel Neves and
+               Zooko Wilcox{-}O'Hearn and
+               Christian Winnerlein},
+  title     = {{BLAKE2:} Simpler, Smaller, Fast as {MD5}},
+  booktitle = {{ACNS}'13},
+  pages     = {119--135},
+  series    = {Lecture Notes in Computer Science},
+  year      = {2013},
+  volume    = {7954},
+  publisher = {Springer}
+}
+
+
+@article{liu2013parallel,
+  author    = {Bin Liu and Bevan M. Baas},
+  title     = {Parallel {AES} Encryption Engines for Many-Core Processor Arrays},
+  journal   = {{IEEE} Transactions on Computers},
+  year      = {2013},
+  volume    = {62},
+  number    = {3},
+  pages     = {536--547},
+  month     = mar,
+}
+
+@article{ForlerLLW14,
+  author    = {Christian Forler and
+               Eik List and
+               Stefan Lucks and
+               Jakob Wenzel},
+  title     = {Overview of the Candidates for the Password Hashing Competition -
+               And their Resistance against Garbage-Collector Attacks},
+  journal   = {{IACR} Cryptology ePrint Archive},
+  volume    = {2014},
+  pages     = {881},
+  year      = {2014},
+  url       = {http://eprint.iacr.org/2014/881},
+  timestamp = {Sat, 02 Mar 4439591 14:05:04 +},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/iacr/ForlerLLW14},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@inproceedings{gurkaynak2012sha3,
+author = {Frank G{\"{u}}rkaynak and Kris Gaj and Beat Muheim and Ekawat Homsirikamol and Christoph Keller and Marcin Rogawski and Hubert Kaeslin and Jens-Peter Kaps},
+title = {Lessons Learned from Designing a 65nm {ASIC} for Evaluating Third Round {SHA-3} Candidates},
+booktitle = {Third SHA-3 Candidate Conference},
+month = mar,
+year = {2012}
+}
+
+@inproceedings{giridhar2013dram,
+  author    = {Bharan Giridhar and Michael Cieslak and Deepankar Duggal and Ronald G. Dreslinski and Hsing Min Chen and Robert Patti and Betina Hold and Chaitali Chakrabarti and Trevor N. Mudge and David Blaauw},
+  title     = {Exploring {DRAM} organizations for energy-efficient and resilient
+               exascale memories},
+  booktitle = {International Conference for High Performance Computing, Networking,
+               Storage and Analysis (SC 2013)},
+  year      = {2013},
+  pages     = {23--35},
+  publisher = {ACM},
+}
+
+@inproceedings{BertoniDPA11,
+  author    = {Guido Bertoni and
+               Joan Daemen and
+               Michael Peeters and
+               Gilles Van Assche},
+  title     = {Duplexing the Sponge: Single-Pass Authenticated Encryption and Other
+               Applications},
+  booktitle = {{SAC}'11},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {7118},
+  pages     = {320--337},
+  publisher = {Springer},
+  year      = {2011}
+}
+
+@inproceedings{Rig,
+  author    = {Donghoon Chang and Arpan Jati and Sweta Mishra and Somitra Sanadhya},
+  title     = {Rig: A simple, secure and flexible design for Password Hashing},
+  booktitle = {Inscrypt'14},
+  series    = {Lecture Notes in Computer Science, to appear},
+  publisher = {Springer},
+  year      = {2014}
+}
+
+@article{BiryukovP14,
+  author    = {Alex Biryukov and
+               Ivan Pustogarov},
+  title     = {Proof-of-Work as Anonymous Micropayment: Rewarding a {Tor} Relay},
+  journal   = {{IACR} Cryptology ePrint Archive 2014/1011},
+  note= {to appear at Financial Cryptography 2015},
+  url       = {http://eprint.iacr.org/2014/1011},
+  timestamp = {Mon, 19 Jan 2015 11:11:51 +0100},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/iacr/BiryukovP14},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+
+@misc{Andersen14,
+    author = {David Andersen},
+    title = {A Public Review of Cuckoo Cycle},
+    howpublished = {\url{http://www.cs.cmu.edu/~dga/crypto/cuckoo/analysis.pdf}},
+    year = {2014}
+}
+
+@misc{Tromp14,
+    author = {John Tromp},
+    title = {Cuckoo Cycle: a memory bound graph-theoretic proof-of-work},
+    howpublished = {Cryptology ePrint Archive, Report 2014/059},
+    year = {2014},
+    note = {\url{http://eprint.iacr.org/2014/059}, project webpage \url{https://github.com/tromp/cuckoo}},
+}
--- a/src/argon2-core.c
+++ b/src/argon2-core.c
@ -0,0 +1,556 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+/*For memory wiping*/
+#ifdef _MSC_VER
+#include "windows.h"
+#include "winbase.h" //For SecureZeroMemory
+#endif
+#if defined __STDC_LIB_EXT1__
+#define __STDC_WANT_LIB_EXT1__ 1
+#endif
+#define VC_GE_2005( version )		( version >= 1400 )
+
+
+#include <inttypes.h>
+#include "string.h"
+#include "stdlib.h"
+#include "stdio.h"
+#include "pthread.h"
+
+#include "argon2.h"
+#include "argon2-core.h"
+#include "kat.h"
+
+
+#include "../Blake2/blake2.h"
+#include "../Blake2/blake2-impl.h"
+
+/*
+ * NOT_OPTIMIZED is a function attribute that asks the compiler not to
+ * optimize the annotated function (used on the memory-wiping routine so the
+ * wipe is less likely to be elided).  It expands to nothing on compilers, or
+ * compiler versions, that offer no such attribute, so it is ALWAYS defined.
+ */
+#if defined(__clang__)
+#if __has_attribute(optnone)
+#define NOT_OPTIMIZED __attribute__((optnone))
+#else
+#define NOT_OPTIMIZED
+#endif
+#elif defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 \
+                    + __GNUC_MINOR__ * 100 \
+                    + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION >= 40400
+#define NOT_OPTIMIZED __attribute__((optimize("O0")))
+#else
+/* GCC older than 4.4 has no optimize attribute: fall back to an empty macro */
+#define NOT_OPTIMIZED
+#endif
+#else 
+#define NOT_OPTIMIZED
+#endif
+
+
+/***************Instance and Position constructors**********/
+/* Fills the whole word array of block *b with the byte value `in`. */
+void InitBlockValue(block* b, uint8_t in)
+{
+    const size_t block_bytes = sizeof(b->v);
+
+    memset(b->v, in, block_bytes);
+}
+
+/* Copies ARGON2_WORDS_IN_BLOCK 64-bit words from *src into *dst. */
+void CopyBlock(block* dst, const block* src)
+{
+    const size_t nbytes = sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK;
+
+    memcpy(dst->v, src->v, nbytes);
+}
+
+ /* XORs each of the ARGON2_WORDS_IN_BLOCK words of *src into the matching word of *dst. */
+ void XORBlock(block* dst, const block* src)
+ {
+     int word = 0;
+
+     while (word < ARGON2_WORDS_IN_BLOCK) {
+         dst->v[word] ^= src->v[word];
+         ++word;
+     }
+ }
+
+
+
+
+/***************Memory allocators*****************/
+/*
+ * Allocates room for m_cost blocks and stores the pointer in *memory.
+ * @param memory  Receives the allocated buffer; must not be NULL
+ * @param m_cost  Number of blocks to allocate
+ * @return ARGON2_OK on success; ARGON2_MEMORY_ALLOCATION_ERROR if memory is
+ *         NULL, if the byte count does not fit in size_t, or if malloc fails
+ *         (in the last two cases *memory is set to NULL)
+ */
+int AllocateMemory(block **memory, uint32_t m_cost) {
+    if (memory == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* sizeof(block) * m_cost can wrap where size_t is 32 bits wide; a wrapped
+     * product would make malloc succeed with a buffer that is far too small. */
+    if (m_cost > SIZE_MAX / sizeof (block)) {
+        *memory = NULL;
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    *memory = (block*) malloc(sizeof (block) * (size_t) m_cost);
+    if (*memory == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+    return ARGON2_OK;
+}
+
+/* Function that securely cleans the memory
+ * @param v Pointer to the memory
+ * @param n Memory size in bytes
+ *
+ * Picks the first available primitive that the compiler may not remove:
+ * SecureZeroMemory (MSVC 2005+), memset_s (C11 Annex K, when the library
+ * advertises it), explicit_bzero (OpenBSD), and otherwise a memset call made
+ * through a volatile function pointer.
+ */
+
+static inline void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
+#if defined  (_MSC_VER ) &&  VC_GE_2005( _MSC_VER )
+    SecureZeroMemory(v, n);
+#elif defined(__STDC_LIB_EXT1__)
+    /* memset_s is a function, not a macro, so test the library feature macro;
+     * its signature is (dest, destsz, ch, count). */
+    memset_s(v, n, 0, n);
+#elif defined( __OpenBSD__ )
+    explicit_bzero(v, n);
+#else
+    /* Calling through a volatile pointer keeps the compiler from proving the
+     * call is a plain memset it could drop as a dead store. */
+    static void* (*const volatile memset_sec)(void*, int, size_t) = &memset;
+    memset_sec(v, 0, n);
+#endif
+}
+
+/*************************Argon2 internal constants**************************************************/
+
+/* Version of the algorithm */
+const uint32_t ARGON2_VERSION_NUMBER = 0x10;
+
+/* Memory block size in bytes */
+#define  ARGON2_BLOCK_SIZE  1024
+/* Number of 8-byte (64-bit) words in one memory block */
+#define ARGON2_WORDS_IN_BLOCK (ARGON2_BLOCK_SIZE/8)
+/* Hand-maintained: equals ARGON2_BLOCK_SIZE/16 (presumably the number of
+ * 16-byte quadwords per block -- TODO confirm); update when the block size changes */
+const uint32_t ARGON2_QWORDS_IN_BLOCK = 64; /*Dependent values!*/
+
+/* Number of pseudo-random values generated by one call to Blake in Argon2i  to generate reference block positions*/
+const uint32_t ARGON2_ADDRESSES_IN_BLOCK = 128;
+
+/* Pre-hashing digest length and its extension*/
+const uint32_t ARGON2_PREHASH_DIGEST_LENGTH = 64;
+/* Hand-maintained: equals ARGON2_PREHASH_DIGEST_LENGTH + 8; update the two together */
+const uint32_t ARGON2_PREHASH_SEED_LENGTH = 72;/*Dependent values!*/
+
+
+
+/*****S-box related constants (the Sbox buffer is wiped as ARGON2_SBOX_SIZE 64-bit entries)******/
+const uint32_t ARGON2_SBOX_SIZE = 1 << 10;
+/* Low 9 bits set (0x1FF); NOTE(review): presumably masks an index into half of the S-box -- confirm against the users of this constant */
+const uint32_t ARGON2_SBOX_MASK = (1<<9) - 1;
+
+
+/*********Memory functions*/
+
+/*
+ * Securely wipes the instance's working memory when `clear` is set.
+ * Nothing is done if the instance has no memory buffer.  For Argon2_ds
+ * instances that own an S-box, the S-box is wiped first.
+ */
+void ClearMemory(Argon2_instance_t* instance, bool clear) {
+    if (instance->memory == NULL || !clear) {
+        return;
+    }
+
+    if (instance->type == Argon2_ds) {
+        if (instance->Sbox != NULL) {
+            secure_wipe_memory(instance->Sbox, ARGON2_SBOX_SIZE * sizeof (uint64_t));
+        }
+    }
+
+    secure_wipe_memory(instance->memory, sizeof (block) * instance->memory_blocks);
+}
+
+/* Releases a block buffer obtained from malloc; a NULL pointer is accepted. */
+void FreeMemory(block* memory) {
+    /* free(NULL) is defined to do nothing, so no explicit NULL test is needed */
+    free(memory);
+}
+
+/*
+ * Derives the output tag and releases the instance's buffers.
+ * The last block of every lane is XORed together, the result is hashed with
+ * blake2b_long into context->out (context->outlen bytes), the temporary block
+ * is wiped, and then the working memory is optionally cleared and freed.
+ * Does nothing when either argument is NULL.
+ */
+void Finalize(const Argon2_Context *context, Argon2_instance_t* instance) {
+    if (context == NULL || instance == NULL) {
+        return;
+    }
+
+    /* Seed the accumulator with the last block of lane 0 */
+    block accumulator;
+    CopyBlock(&accumulator, instance->memory+ instance->lane_length - 1);
+
+    /* Fold in the last block of each remaining lane */
+    uint32_t lane = 1;
+    while (lane < instance->lanes) {
+        uint32_t tail_index = lane * instance->lane_length + (instance->lane_length - 1);
+        XORBlock(&accumulator, instance->memory + tail_index);
+        ++lane;
+    }
+
+    /* Compress the accumulator into the caller's output buffer, then erase it */
+    blake2b_long(context->out, (uint8_t*) accumulator.v, context->outlen, ARGON2_BLOCK_SIZE);
+    secure_wipe_memory(accumulator.v, ARGON2_BLOCK_SIZE);
+#ifdef ARGON2_KAT
+    PrintTag(context->out, context->outlen);
+#endif
+
+    /* Wipe the working memory if the context asks for it */
+    ClearMemory(instance, context->clear_memory);
+
+    /* Release the S-box (only looked at when a memory buffer exists) */
+    if (instance->memory != NULL && instance->Sbox != NULL) {
+        free(instance->Sbox);
+    }
+
+    /* Release the working memory, through the user callback when one is set */
+    if (context->free_cbk == NULL) {
+        FreeMemory(instance->memory);
+    } else {
+        context->free_cbk((uint8_t *) instance->memory, instance->memory_blocks * sizeof (block));
+    }
+}
+
+/*
+ * Maps a 32-bit pseudo-random value to the index, within a lane, of the block
+ * to be referenced.
+ * @param instance     Supplies segment_length and lane_length
+ * @param position     Current pass, slice and index
+ * @param pseudo_rand  32-bit pseudo-random value to be mapped
+ * @param same_lane    True when the referenced block lies in the current lane
+ * @return Block index in the range 0 .. lane_length-1 (result of the final modulo)
+ *
+ * NOTE(review): several sizes below are computed as "x - 1" or "x + (-1)" in
+ * unsigned arithmetic and wrap around when x is 0; presumably callers never
+ * reach those states -- confirm against the segment-filling code.
+ */
+uint32_t IndexAlpha(const Argon2_instance_t* instance, const Argon2_position_t* position, uint32_t pseudo_rand, bool same_lane) {
+    /*
+     * Size of the area that may be referenced:
+     * Pass 0:
+     *      This lane : all already finished segments plus already constructed blocks in this segment
+     *      Other lanes : all already finished segments
+     * Pass 1+:
+     *      This lane : (SYNC_POINTS - 1) last segments plus already constructed blocks in this segment
+     *      Other lanes : (SYNC_POINTS - 1) last segments
+     */
+    uint32_t reference_area_size;
+
+    if (0 == position->pass) {
+        // First pass
+        if (0 == position->slice) {
+            // First slice
+            reference_area_size = position->index - 1; // all but the previous
+        } else {
+            if (same_lane) {
+                // The same lane => add current segment
+                reference_area_size = position->slice * instance->segment_length + position->index - 1;
+            } else {
+                // Another lane => finished segments only, one block fewer when index is 0
+                reference_area_size = position->slice * instance->segment_length + ((position->index == 0) ? (-1) : 0);
+            }
+        }
+    } else {
+        // Pass 1 and later: the base area is the whole lane minus one segment
+        if (same_lane) {
+            reference_area_size = instance->lane_length - instance->segment_length + position->index - 1;
+        } else {
+            reference_area_size = instance->lane_length - instance->segment_length + ((position->index == 0) ? (-1) : 0);
+        }
+    }
+
+    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce relative position.
+     * The value is squared and scaled (x*x >> 32), which skews it toward small numbers;
+     * subtracting the scaled result from reference_area_size-1 therefore skews the
+     * relative position toward the high end of the reference area. */
+    uint64_t relative_position = pseudo_rand;
+    relative_position = relative_position * relative_position >> 32;
+    relative_position = reference_area_size - 1 - (reference_area_size * relative_position >> 32);
+
+    /* 1.2.5 Computing starting position: 0 in pass 0; in later passes the start of the
+     * segment after the current slice, or 0 when the current slice is the last one */
+    uint32_t start_position = 0;
+    if (0 != position->pass) {
+        start_position = (position->slice == ARGON2_SYNC_POINTS - 1) ? 0 : (position->slice + 1) * instance->segment_length;
+    }
+
+    /* 1.2.6. Computing absolute position (wraps around the end of the lane) */
+    uint32_t absolute_position = (start_position + relative_position) % instance->lane_length; // absolute position
+    return absolute_position;
+}
+
+/*
+ * Fills the whole memory instance->passes times.
+ * For every pass and every slice, one worker thread per lane runs FillSegmentThr(); at most
+ * "threads" workers are alive at once and all workers of a slice are joined before the next
+ * slice starts. For Argon2_ds the S-box is regenerated at the start of every pass.
+ * Any threading or allocation failure is fatal: a message is printed and the process exits,
+ * as already done for pthread_join() errors.
+ * @param instance Pointer to the current instance (NULL is a no-op)
+ */
+void FillMemoryBlocks(Argon2_instance_t* instance) {
+    if (instance == NULL) {
+        return;
+    }
+
+    const uint32_t lanes = instance->lanes;
+
+    // Never keep more workers alive than there are lanes. Using instance->threads directly made
+    // "lanes - threads" wrap around when threads > lanes, so no worker was joined before the
+    // bookkeeping arrays were freed. A limit of 0 would join a thread that was never created.
+    uint32_t threads = instance->threads;
+    if (threads > lanes) {
+        threads = lanes;
+    }
+    if (threads == 0) {
+        threads = 1;
+    }
+    // Index of the first worker that is still running when the spawning loop ends
+    const uint32_t first_unjoined = (lanes > threads) ? lanes - threads : 0;
+
+    //1. Allocating space for threads (once; reused by every pass and slice)
+    pthread_t* thread = malloc(sizeof (pthread_t) * lanes);
+    Argon2_thread_data* thr_data = malloc(sizeof (Argon2_thread_data) * lanes);
+    if (lanes != 0 && (thread == NULL || thr_data == NULL)) {
+        printf("ERROR; unable to allocate thread data for %u lanes\n", (unsigned) lanes);
+        exit(-1);
+    }
+    pthread_attr_t attr;
+    int rc;
+    void* status;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+    for (uint32_t r = 0; r < instance->passes; ++r) {
+        if (Argon2_ds == instance->type) {
+            GenerateSbox(instance);
+        }
+        for (uint8_t s = 0; s < ARGON2_SYNC_POINTS; ++s) {
+            //2. Calling threads
+            for (uint32_t l = 0; l < lanes; ++l) {
+                //2.1 Join a thread if limit is exceeded
+                if (l >= threads) {
+                    rc = pthread_join(thread[l - threads], &status);
+                    if (rc) {
+                        printf("ERROR; return code from pthread_join() is %d\n", rc);
+                        exit(-1);
+                    }
+                }
+
+                //2.2 Create thread
+                Argon2_position_t position = {r, l, s, 0};
+                thr_data[l].instance_ptr = instance; //preparing the thread input
+                // memcpy because Argon2_position_t has const members and cannot be assigned
+                memcpy(&(thr_data[l].pos), &position, sizeof (Argon2_position_t));
+                rc = pthread_create(&thread[l], &attr, FillSegmentThr, (void*) &thr_data[l]);
+                if (rc) {
+                    // Without this check the later pthread_join() would use an invalid handle
+                    printf("ERROR; return code from pthread_create() is %d\n", rc);
+                    exit(-1);
+                }
+
+                //FillSegment(instance, position);  //Non-thread equivalent of the lines above
+            }
+
+            //3. Joining remaining threads
+            for (uint32_t l = first_unjoined; l < lanes; ++l) {
+                rc = pthread_join(thread[l], &status);
+                if (rc) {
+                    printf("ERROR; return code from pthread_join() is %d\n", rc);
+                    exit(-1);
+                }
+            }
+        }
+#ifdef ARGON2_KAT_INTERNAL
+        InternalKat(instance, r);
+#endif
+    }
+
+    pthread_attr_destroy(&attr);
+    free(thread);
+    free(thr_data);
+}
+
+/*
+ * Checks every field of the context against the library limits.
+ * The checks run in a fixed order and the first violated one decides the returned error code.
+ * @param context Pointer to the Argon2 context to check
+ * @return ARGON2_OK when all checks pass, otherwise the code of the first failed check
+ */
+int ValidateInputs(const Argon2_Context* context) {
+    if (context == NULL) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+
+    /* Output buffer and its length */
+    if (context->out == NULL) {
+        return ARGON2_OUTPUT_PTR_NULL;
+    }
+    if (context->outlen < ARGON2_MIN_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_SHORT;
+    }
+    if (context->outlen > ARGON2_MAX_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+
+    /* Password: a NULL pointer is accepted only together with a zero length */
+    if (context->pwd == NULL) {
+        if (context->pwdlen != 0) {
+            return ARGON2_PWD_PTR_MISMATCH;
+        }
+    } else if (ARGON2_MIN_PWD_LENGTH != 0 && context->pwdlen < ARGON2_MIN_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_SHORT;
+    } else if (context->pwdlen > ARGON2_MAX_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+
+    /* Salt */
+    if (context->salt == NULL) {
+        if (context->saltlen != 0) {
+            return ARGON2_SALT_PTR_MISMATCH;
+        }
+    } else if (context->saltlen < ARGON2_MIN_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_SHORT;
+    } else if (context->saltlen > ARGON2_MAX_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_LONG;
+    }
+
+    /* Secret */
+    if (context->secret == NULL) {
+        if (context->secretlen != 0) {
+            return ARGON2_SECRET_PTR_MISMATCH;
+        }
+    } else if (context->secretlen < ARGON2_MIN_SECRET) {
+        return ARGON2_SECRET_TOO_SHORT;
+    } else if (context->secretlen > ARGON2_MAX_SECRET) {
+        return ARGON2_SECRET_TOO_LONG;
+    }
+
+    /* Associated data */
+    if (context->ad == NULL) {
+        if (context->adlen != 0) {
+            return ARGON2_AD_PTR_MISMATCH;
+        }
+    } else if (context->adlen < ARGON2_MIN_AD_LENGTH) {
+        return ARGON2_AD_TOO_SHORT;
+    } else if (context->adlen > ARGON2_MAX_AD_LENGTH) {
+        return ARGON2_AD_TOO_LONG;
+    }
+
+    /* Memory cost */
+    if (context->m_cost < ARGON2_MIN_MEMORY) {
+        return ARGON2_MEMORY_TOO_LITTLE;
+    }
+    if (context->m_cost > ARGON2_MAX_MEMORY) {
+        return ARGON2_MEMORY_TOO_MUCH;
+    }
+
+    /* Time cost */
+    if (context->t_cost < ARGON2_MIN_TIME) {
+        return ARGON2_TIME_TOO_SMALL;
+    }
+    if (context->t_cost > ARGON2_MAX_TIME) {
+        return ARGON2_TIME_TOO_LARGE;
+    }
+
+    /* Lanes */
+    if (context->lanes < ARGON2_MIN_LANES) {
+        return ARGON2_LANES_TOO_FEW;
+    }
+    if (context->lanes > ARGON2_MAX_LANES) {
+        return ARGON2_LANES_TOO_MANY;
+    }
+
+    /* Threads */
+    if (context->threads < ARGON2_MIN_THREADS) {
+        return ARGON2_THREADS_TOO_FEW;
+    }
+    if (context->threads > ARGON2_MAX_THREADS) {
+        return ARGON2_THREADS_TOO_MANY;
+    }
+
+    /* The allocate/free callbacks must be supplied together or not at all */
+    if (context->allocate_cbk != NULL && context->free_cbk == NULL) {
+        return ARGON2_FREE_MEMORY_CBK_NULL;
+    }
+    if (context->allocate_cbk == NULL && context->free_cbk != NULL) {
+        return ARGON2_ALLOCATE_MEMORY_CBK_NULL;
+    }
+
+    return ARGON2_OK;
+}
+
+/*
+ * Creates the first two blocks of every lane.
+ * The 8 bytes that follow the pre-hash digest in @blockhash are overwritten with a 32-bit block
+ * counter (0, then 1) followed by the 32-bit lane number; the whole seed is then expanded with
+ * blake2b_long() into blocks 0 and 1 of that lane.
+ * @param blockhash Seed buffer of ARGON2_PREHASH_SEED_LENGTH bytes (its last 8 bytes are modified)
+ * @param instance  Current instance; instance->memory must already be allocated
+ */
+void FillFirstBlocks(uint8_t* blockhash, const Argon2_instance_t* instance) {
+    uint8_t* const counter_field = blockhash + ARGON2_PREHASH_DIGEST_LENGTH;
+    uint8_t* const lane_field = blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4;
+
+    uint32_t lane = 0;
+    while (lane < instance->lanes) {
+        const uint32_t first = lane * instance->lane_length;
+
+        // Block 0 of the lane: counter = 0
+        store32(counter_field, 0);
+        store32(lane_field, lane);
+        blake2b_long((uint8_t*) (instance->memory[first].v), blockhash, ARGON2_BLOCK_SIZE, ARGON2_PREHASH_SEED_LENGTH);
+
+        // Block 1 of the lane: counter = 1, lane field unchanged
+        store32(counter_field, 1);
+        blake2b_long((uint8_t*) (instance->memory[first + 1].v), blockhash, ARGON2_BLOCK_SIZE, ARGON2_PREHASH_SEED_LENGTH);
+
+        ++lane;
+    }
+}
+
+/*
+ * Hashes all inputs into the pre-hash digest.
+ * Absorbed, in this order, each length as a 32-bit value via store32(): lanes, outlen, m_cost,
+ * t_cost, version number, type, then (length, bytes) for password, salt, secret and associated data.
+ * The password and the secret are wiped (and their lengths set to 0) right after being absorbed
+ * when context->clear_password / context->clear_secret are set.
+ * @param blockhash Receives ARGON2_PREHASH_DIGEST_LENGTH bytes of digest
+ * @param context   Argon2 context holding the inputs (NULL is a no-op, as is a NULL blockhash)
+ * @param type      Argon2 type
+ */
+void InitialHash(uint8_t* blockhash, Argon2_Context* context, Argon2_type type) {
+    blake2b_state hasher;
+    uint8_t word[sizeof (uint32_t)];
+
+    if (blockhash == NULL || context == NULL) {
+        return;
+    }
+
+    blake2b_init(&hasher, ARGON2_PREHASH_DIGEST_LENGTH);
+
+    /* Fixed-size parameters */
+    const uint32_t params[] = {
+        context->lanes, context->outlen, context->m_cost, context->t_cost,
+        ARGON2_VERSION_NUMBER, (uint32_t) type
+    };
+    for (unsigned k = 0; k < sizeof (params) / sizeof (params[0]); ++k) {
+        store32(word, params[k]);
+        blake2b_update(&hasher, (const uint8_t*) word, sizeof (word));
+    }
+
+    /* Password (optionally wiped once absorbed) */
+    store32(word, context->pwdlen);
+    blake2b_update(&hasher, (const uint8_t*) word, sizeof (word));
+    if (NULL != context->pwd) {
+        blake2b_update(&hasher, (const uint8_t*) context->pwd, context->pwdlen);
+        if (context->clear_password) {
+            secure_wipe_memory(context->pwd, context->pwdlen);
+            context->pwdlen = 0;
+        }
+    }
+
+    /* Salt */
+    store32(word, context->saltlen);
+    blake2b_update(&hasher, (const uint8_t*) word, sizeof (word));
+    if (NULL != context->salt) {
+        blake2b_update(&hasher, (const uint8_t*) context->salt, context->saltlen);
+    }
+
+    /* Secret (optionally wiped once absorbed) */
+    store32(word, context->secretlen);
+    blake2b_update(&hasher, (const uint8_t*) word, sizeof (word));
+    if (NULL != context->secret) {
+        blake2b_update(&hasher, (const uint8_t*) context->secret, context->secretlen);
+        if (context->clear_secret) {
+            secure_wipe_memory(context->secret, context->secretlen);
+            context->secretlen = 0;
+        }
+    }
+
+    /* Associated data */
+    store32(word, context->adlen);
+    blake2b_update(&hasher, (const uint8_t*) word, sizeof (word));
+    if (NULL != context->ad) {
+        blake2b_update(&hasher, (const uint8_t*) context->ad, context->adlen);
+    }
+
+    blake2b_final(&hasher, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+}
+
+/*
+ * Allocates the memory, computes the pre-hash digest of all inputs and creates the first two
+ * blocks of every lane.
+ * @param instance Current Argon2 instance; instance->memory receives the allocation
+ * @param context  Argon2 context with the inputs and the optional allocation callback
+ * @return ARGON2_INCORRECT_PARAMETER if a pointer is NULL, the allocator's error code if the
+ *         allocation fails, ARGON2_OK otherwise
+ */
+int Initialize(Argon2_instance_t* instance, Argon2_Context* context) {
+    if (NULL == instance || NULL == context) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+
+    /* 1. Memory allocation: user callback when provided, built-in allocator otherwise */
+    int status;
+    if (context->allocate_cbk == NULL) {
+        status = AllocateMemory(&(instance->memory), instance->memory_blocks);
+    } else {
+        status = context->allocate_cbk((uint8_t **)&(instance->memory), instance->memory_blocks * ARGON2_BLOCK_SIZE);
+    }
+    if (status != ARGON2_OK) {
+        return status;
+    }
+
+    /* 2. Initial hashing: H_0 followed by 8 extra bytes used to derive the first blocks */
+    uint8_t seed[ARGON2_PREHASH_SEED_LENGTH];
+    InitialHash(seed, context, instance->type);
+    // The extra bytes after the digest start out as zeros
+    secure_wipe_memory(seed + ARGON2_PREHASH_DIGEST_LENGTH, ARGON2_PREHASH_SEED_LENGTH - ARGON2_PREHASH_DIGEST_LENGTH);
+
+#ifdef ARGON2_KAT
+    InitialKat(seed, context, instance->type);
+#endif
+
+    /* 3. First blocks (there are always at least two blocks in a slice), then wipe the seed */
+    FillFirstBlocks(seed, instance);
+    secure_wipe_memory(seed, ARGON2_PREHASH_SEED_LENGTH);
+
+    return ARGON2_OK;
+}
+
+/*
+ * Runs the complete hashing procedure: validation, memory sizing, initialization, filling, finalization.
+ * @param context Pointer to the Argon2 context
+ * @param type    Argon2 variant; must be Argon2_d, Argon2_i, Argon2_id or Argon2_ds
+ * @return The ValidateInputs()/Initialize() error code on failure, ARGON2_INCORRECT_TYPE for an
+ *         unknown type, ARGON2_OK otherwise
+ */
+int Argon2Core(Argon2_Context* context, Argon2_type type) {
+    /* 1. Validate all inputs */
+    int status = ValidateInputs(context);
+    if (status != ARGON2_OK) {
+        return status;
+    }
+    switch (type) {
+        case Argon2_d:
+        case Argon2_i:
+        case Argon2_id:
+        case Argon2_ds:
+            break;
+        default:
+            return ARGON2_INCORRECT_TYPE;
+    }
+
+    /* 2. Align memory size */
+    // At least 2 * ARGON2_SYNC_POINTS blocks per lane
+    uint32_t total_blocks = context->m_cost;
+    if (total_blocks < 2 * ARGON2_SYNC_POINTS * context->lanes) {
+        total_blocks = 2 * ARGON2_SYNC_POINTS * context->lanes;
+    }
+    // Round down so that every segment has the same length
+    const uint32_t blocks_per_segment = total_blocks / (context->lanes * ARGON2_SYNC_POINTS);
+    total_blocks = blocks_per_segment * (context->lanes * ARGON2_SYNC_POINTS);
+
+    Argon2_instance_t instance = {
+        .memory = NULL,
+        .passes = context->t_cost,
+        .memory_blocks = total_blocks,
+        .segment_length = blocks_per_segment,
+        .lane_length = blocks_per_segment * ARGON2_SYNC_POINTS,
+        .lanes = context->lanes,
+        .threads = context->threads,
+        .type = type,
+        .Sbox = NULL
+    };
+
+    /* 3. Initialization: Hashing inputs, allocating memory, filling first blocks */
+    status = Initialize(&instance, context);
+    if (status != ARGON2_OK) {
+        return status;
+    }
+
+    /* 4. Filling memory */
+    FillMemoryBlocks(&instance);
+
+    /* 5. Finalization */
+    Finalize(context, &instance);
+
+    return ARGON2_OK;
+}
+
+
+/*
+ * pthread start routine: unpacks the Argon2_thread_data passed as @thread_data, fills the
+ * segment it describes and terminates the thread, handing @thread_data back as exit value.
+ */
+void* FillSegmentThr(void* thread_data)
+{
+    Argon2_thread_data* job = (Argon2_thread_data*) thread_data;
+    Argon2_instance_t* target = job->instance_ptr;
+
+    FillSegment(target, job->pos);
+    pthread_exit(thread_data);
+}
--- a/src/argon2-core.h
+++ b/src/argon2-core.h
@ -0,0 +1,223 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#pragma once
+
+#ifndef __ARGON2_CORE_H__
+#define __ARGON2_CORE_H__
+
+/*************************Argon2 internal constants**************************************************/
+
+/* Version of the algorithm */
+extern const uint32_t ARGON2_VERSION_NUMBER;
+
+/* Memory block size in bytes */
+#define  ARGON2_BLOCK_SIZE  1024
+#define ARGON2_WORDS_IN_BLOCK (ARGON2_BLOCK_SIZE/8)
+extern const uint32_t ARGON2_QWORDS_IN_BLOCK ; /*Dependent values!*/
+
+/* Number of pseudo-random values generated by one call to Blake in Argon2i  to generate reference block positions*/
+extern const uint32_t ARGON2_ADDRESSES_IN_BLOCK;
+
+/* Pre-hashing digest length and its extension*/
+extern const uint32_t ARGON2_PREHASH_DIGEST_LENGTH ;
+extern const uint32_t ARGON2_PREHASH_SEED_LENGTH ;/*Dependent values!*/
+
+
+
+/*****SM-related constants******/
+extern const uint32_t ARGON2_SBOX_SIZE;
+extern const uint32_t ARGON2_SBOX_MASK ;
+
+/* Argon2 primitive type */
+typedef enum _Argon2_type {
+    Argon2_d=0,  /* reference positions taken from the previous block (see FillSegment) */
+    Argon2_i=1,  /* reference positions taken from GenerateAddresses() output */
+    Argon2_id=2, /* like Argon2_i for the first half of the slices of pass 0, like Argon2_d afterwards */
+    Argon2_ds=4  /* additionally uses the S-box produced by GenerateSbox() on every pass */
+} Argon2_type;
+
+/*************************Argon2 internal data types**************************************************/
+
+/*
+ * Structure for the (1KB) memory block implemented as 128 64-bit words.
+ * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no bounds checking).
+ */
+typedef struct _block {
+    uint64_t v[ARGON2_WORDS_IN_BLOCK];    /* ARGON2_BLOCK_SIZE bytes viewed as 64-bit words */
+} block;
+
+/*****************Functions that work with the block******************/
+
+//Initialize each byte of the block with @in
+extern void InitBlockValue(block* b, uint8_t in);
+
+//Copy block @src to block @dst 
+extern void CopyBlock(block* dst, const block* src);
+
+//XOR @src onto @dst bytewise
+extern void XORBlock(block* dst, const block* src);
+
+
+/*
+ * Argon2 instance: memory pointer, number of passes, amount of memory, type, and derived values. 
+ * Used to evaluate the number and location of blocks to construct in each thread
+ */
+typedef struct _Argon2_instance_t {
+    block* memory; //Memory pointer
+    const uint32_t passes; //Number of passes
+    const uint32_t memory_blocks; //Number of blocks in memory
+    const uint32_t segment_length; //Blocks per segment: memory_blocks / (lanes * ARGON2_SYNC_POINTS)
+    const uint32_t lane_length; //Blocks per lane: segment_length * ARGON2_SYNC_POINTS
+    const uint32_t lanes; //Number of lanes the memory is split into
+    const uint32_t threads; //Limit on simultaneously running worker threads (see FillMemoryBlocks)
+    const Argon2_type type; //Argon2 variant being computed
+    uint64_t *Sbox; //S-boxes for Argon2_ds
+}Argon2_instance_t;
+
+
+/*
+ * Argon2 position: where we construct the block right now. Used to distribute work between threads.
+ */
+typedef struct _Argon2_position_t {
+    const uint32_t pass; //Current pass over the memory, starting at 0
+    const uint32_t lane; //Lane being filled
+    const uint8_t slice; //Slice (0 .. ARGON2_SYNC_POINTS-1) being filled
+    uint32_t index; //Index of the block inside the current segment; updated by FillSegment
+}Argon2_position_t;
+
+/*Struct that holds the inputs for thread handling FillSegment*/
+typedef struct _Argon2_thread_data {
+    Argon2_instance_t* instance_ptr; //Instance shared by all worker threads
+    Argon2_position_t pos;   //Segment (pass, lane, slice) this worker has to fill
+}Argon2_thread_data;
+
+/*Macro for endianness conversion*/
+
+#if defined(_MSC_VER) 
+#define BSWAP32(x) _byteswap_ulong(x)
+#else
+#define BSWAP32(x) __builtin_bswap32(x)
+#endif
+
+/*************************Argon2 core functions**************************************************/
+
+/* Allocates memory to the given pointer
+ * @param memory pointer to the pointer to the memory
+ * @param m_cost number of blocks to allocate in the memory
+ * @return ARGON2_OK if @memory is a valid pointer and memory is allocated
+ */
+int AllocateMemory(block **memory, uint32_t m_cost);
+
+/* Clears memory
+ * @param instance pointer to the current instance
+ * @param clear indicates whether the memory is to be overwritten with zeros.
+ */
+void ClearMemory(Argon2_instance_t* instance, bool clear);
+
+/* Deallocates memory
+ * @param memory pointer to the blocks
+ */
+void FreeMemory(block* memory);
+
+
+
+
+/*
+ * Computes absolute position of reference block in the lane following a skewed distribution and using a pseudo-random value as input
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rand 32-bit pseudo-random value used to determine the position
+ * @param same_lane Indicates if the block will be taken from the current lane. If so we can reference the current segment
+ * @pre All pointers must be valid
+ */
+uint32_t IndexAlpha(const Argon2_instance_t* instance, const Argon2_position_t* position, uint32_t pseudo_rand, bool same_lane);
+
+/*
+ * Function that validates all inputs against predefined restrictions and return an error code
+ * @param context Pointer to current Argon2 context
+ * @return ARGON2_OK if everything is all right, otherwise one of error codes (all defined in <argon2.h>
+ */
+int ValidateInputs(const Argon2_Context* context);
+
+/*
+ * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears password and secret if needed
+ * @param  context  Pointer to the Argon2 internal structure containing memory pointer, and parameters for time and space requirements.
+ * @param  blockhash Buffer for pre-hashing digest
+ * @param  type Argon2 type
+ * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes allocated
+ */
+void InitialHash(uint8_t* blockhash,  Argon2_Context* context, Argon2_type type);
+
+/*
+ * Function creates first 2 blocks per lane
+ * @param instance Pointer to the current instance
+ * @param blockhash Pointer to the pre-hashing digest
+ * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
+ */
+void FillFirstBlocks(uint8_t* blockhash, const Argon2_instance_t* instance);
+
+
+/*
+ * Function allocates memory, hashes the inputs with Blake, and creates the first two blocks of each lane.
+ * @param  context  Pointer to the Argon2 internal structure containing memory pointer, and parameters for time and space requirements.
+ * @param  instance Current Argon2 instance
+ * @return ARGON2_OK if successful, ARGON2_INCORRECT_PARAMETER if a pointer is NULL, otherwise the error code reported by the memory allocation.
+ *         On success @instance->memory points to the allocated memory with 2 blocks per lane initialized.
+ */
+int Initialize(Argon2_instance_t* instance, Argon2_Context* context);
+
+/*
+ * XORing the last block of each lane, hashing it, making the tag. Deallocates the memory.
+ * @param context Pointer to current Argon2 context (use only the out parameters from it)
+ * @param instance Pointer to current instance of Argon2
+ * @pre instance->memory must point to necessary amount of memory
+ * @pre context->out must point to outlen bytes of memory
+ * @pre if context->free_cbk is not NULL, it should point to a function that deallocates memory
+ */
+void Finalize(const Argon2_Context *context, Argon2_instance_t* instance);
+
+
+
+/*
+ * Function that fills the segment using previous segments also from other threads
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+extern void FillSegment(const Argon2_instance_t* instance, Argon2_position_t position);
+
+/*
+ * Wrapper for FillSegment for <pthread> library
+ * @param thread_data Pointer to the structure that holds inputs for FillSegment
+ * @pre all block pointers must be valid
+ */
+void* FillSegmentThr(void* Argon2_thread_data);
+
+/*
+ * Function that fills the entire memory t_cost times based on the first two blocks in each lane
+ * @param instance Pointer to the current instance
+ */
+void FillMemoryBlocks(Argon2_instance_t* instance);
+
+
+/*
+ * Function that performs memory-hard hashing with certain degree of parallelism
+ * @param  context  Pointer to the Argon2 internal structure
+ * @param  type     Argon2 variant to compute (Argon2_d, Argon2_i, Argon2_id or Argon2_ds)
+ * @return Error code if something is wrong, ARGON2_OK otherwise
+ */
+int Argon2Core(Argon2_Context* context, Argon2_type type);
+
+
+extern void GenerateSbox(Argon2_instance_t* instance);
+
+#endif
--- a/src/argon2-opt-core.c
+++ b/src/argon2-opt-core.c
@ -0,0 +1,254 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#if !defined(_MSC_VER)
+#include <x86intrin.h>
+#else 
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <intrin.h>
+#endif
+
+
+#include "argon2.h"
+#include "argon2-core.h"
+#include "argon2-opt-core.h"
+#include "kat.h"
+
+
+#include "../Blake2/blake2-round-mka.h"
+#include "../Blake2/blake2-impl.h"
+#include "../Blake2/blake2.h"
+
+
+#if defined(ARGON2_KAT) || defined(ARGON2_KAT_INTERNAL)
+/* The KAT file name */
+const char* ARGON2_KAT_FILENAME = "kat-argon2-opt.log";
+#endif
+
+
+#define r16  (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+
+__m128i t0, t1;
+
+/*
+ * Optimized compression function: mixes the running state with a reference block and writes
+ * the resulting block.
+ * Steps: state ^= ref_block (kept as block_XY); optional S-box chain (Sbox != NULL) producing x;
+ * BLAKE2_ROUND over the 8 groups of 8 consecutive state words, then over the 8 groups of words
+ * at stride 8; state ^= block_XY; x added to the lowest and the highest 64-bit word of the
+ * state; state copied to next_block.
+ * @param state      Running state of 64 __m128i words; updated in place
+ * @param ref_block  Reference block, ARGON2_BLOCK_SIZE bytes; no alignment requirement
+ * @param next_block Receives the new block, ARGON2_BLOCK_SIZE bytes; no alignment requirement
+ *                   (may alias ref_block: all reads happen before the first write)
+ * @param Sbox       S-box table for Argon2_ds, or NULL to skip the S-box chain
+ */
+void FillBlock(__m128i* state, const uint8_t *ref_block, uint8_t *next_block, const uint64_t* Sbox) {
+    __m128i block_XY[ARGON2_QWORDS_IN_BLOCK];
+    // NOTE(review): BLAKE2_ROUND presumably expands to code using temporaries named t0/t1, which
+    // otherwise resolve to file-scope variables shared by all worker threads. Local copies keep
+    // concurrent FillBlock() calls independent -- confirm against the round macro header.
+    __m128i t0, t1;
+
+    // Unaligned loads: callers also pass `block` buffers, whose type only guarantees the
+    // alignment of uint64_t, not the 16 bytes required by _mm_load_si128
+    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        block_XY[i] = _mm_loadu_si128((const __m128i *) (ref_block + 16 * i));
+    }
+    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        block_XY[i] = state[i] = _mm_xor_si128(state[i], block_XY[i]);
+    }
+
+    uint64_t x = 0;
+    if (Sbox != NULL) {
+        // Seed: lowest 64-bit word of the block XOR its highest 64-bit word
+        x = _mm_extract_epi64(block_XY[0], 0) ^ _mm_extract_epi64(block_XY[ARGON2_QWORDS_IN_BLOCK - 1], 1);
+        for (int i = 0; i < 6 * 16; ++i) {
+            uint32_t x1 = x >> 32;
+            uint32_t x2 = x & 0xFFFFFFFF;
+            uint64_t y = Sbox[x1 & ARGON2_SBOX_MASK];
+            uint64_t z = Sbox[(x2 & ARGON2_SBOX_MASK) + ARGON2_SBOX_SIZE / 2];
+            x = (uint64_t) x1 * (uint64_t) x2;
+            x += y;
+            x ^= z;
+        }
+    }
+
+    // Rounds over 8 consecutive words: (0..7), (8..15), ..., (56..63).
+    // The braces matter: the macro may expand to more than one statement.
+    for (uint32_t i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+                state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    // Rounds over words at stride 8: (0,8,...,56), (1,9,...,57), ..., (7,15,...,63)
+    for (uint32_t i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[i], state[8 + i], state[16 + i], state[24 + i],
+                state[32 + i], state[40 + i], state[48 + i], state[56 + i]);
+    }
+
+    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        // Feedback
+        state[i] = _mm_xor_si128(state[i], block_XY[i]);
+    }
+    // x is 0 unless the S-box chain ran
+    state[0] = _mm_add_epi64(state[0], _mm_set_epi64x(0, x));
+    state[ARGON2_QWORDS_IN_BLOCK - 1] = _mm_add_epi64(state[ARGON2_QWORDS_IN_BLOCK - 1], _mm_set_epi64x(x, 0));
+    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        _mm_storeu_si128((__m128i *) (next_block + 16 * i), state[i]);
+    }
+}
+
+/*
+ * Generates the pseudo-random values used for data-independent addressing of one segment.
+ * An input block holding (pass, lane, slice, memory_blocks, passes, type, counter) is passed
+ * twice through FillBlock() with an all-zero state; every resulting address block supplies
+ * ARGON2_ADDRESSES_IN_BLOCK values, and the counter is incremented before each new address block.
+ * @param instance     Pointer to the current instance (NULL is a no-op)
+ * @param position     Pointer to the current position (NULL is a no-op)
+ * @param pseudo_rands Receives instance->segment_length 64-bit values (NULL is a no-op)
+ */
+void GenerateAddresses(const Argon2_instance_t* instance, const Argon2_position_t* position, uint64_t* pseudo_rands) {
+    if (instance == NULL || position == NULL || pseudo_rands == NULL) {
+        return;
+    }
+
+    // A `block` only guarantees the alignment of uint64_t, while FillBlock() accesses its
+    // arguments as __m128i. The union gives every buffer 16-byte alignment and lets it be
+    // viewed both ways.
+    union {
+        block blk;
+        __m128i words[ARGON2_BLOCK_SIZE / sizeof (__m128i)];
+    } input, address, scratch;
+
+    InitBlockValue(&address.blk, 0);
+    InitBlockValue(&input.blk, 0);
+    input.blk.v[0] = position->pass;
+    input.blk.v[1] = position->lane;
+    input.blk.v[2] = position->slice;
+    input.blk.v[3] = instance->memory_blocks;
+    input.blk.v[4] = instance->passes;
+    input.blk.v[5] = instance->type;
+
+    for (uint32_t i = 0; i < instance->segment_length; ++i) {
+        if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+            // Time for a fresh address block: bump the counter, then apply FillBlock twice,
+            // each time starting from an all-zero state
+            input.blk.v[6]++;
+            InitBlockValue(&scratch.blk, 0);
+            FillBlock(scratch.words, (uint8_t *) input.blk.v, (uint8_t *) address.blk.v, NULL);
+            InitBlockValue(&scratch.blk, 0);
+            FillBlock(scratch.words, (uint8_t *) address.blk.v, (uint8_t *) address.blk.v, NULL);
+        }
+        pseudo_rands[i] = address.blk.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+    }
+}
+
+/*
+ * Fills one segment (the part of one lane that belongs to one slice) block by block.
+ * For every block a reference block is selected -- from pre-generated pseudo-random values when
+ * data-independent addressing applies, otherwise from the first word of the previous block --
+ * and FillBlock() combines it with the running state to produce the new block.
+ * @param instance Pointer to the current instance (NULL is a no-op)
+ * @param position Pass, lane and slice to fill; position.index is overwritten while iterating
+ * NOTE(review): if the pseudo_rands allocation fails the function returns without filling the
+ * segment and without reporting the failure to the caller.
+ */
+void FillSegment(const Argon2_instance_t* instance, Argon2_position_t position) {
+ 	if (instance == NULL){
+	   return;
+ 	}    
+	uint64_t pseudo_rand, ref_index, ref_lane;
+	uint32_t prev_offset, curr_offset;
+	__m128i state[64];
+	// Argon2_i always; Argon2_id only during pass 0 and the first half of the slices
+	bool data_independent_addressing = (instance->type == Argon2_i) || (instance->type == Argon2_id && (position.pass == 0) && (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    
+   // Pseudo-random values that determine the reference block position
+   uint64_t *pseudo_rands = (uint64_t*)malloc( sizeof(uint64_t)*instance->segment_length);
+   if (pseudo_rands == NULL) {
+		return;
+	}
+   if (data_independent_addressing) {
+       GenerateAddresses(instance, &position, pseudo_rands);
+   }
+
+   uint32_t starting_index = 0;
+   if ((0 == position.pass) && (0 == position.slice)) {
+       starting_index = 2; // we have already generated the first two blocks
+   }
+
+   // Offset of the current block
+   curr_offset = position.lane * instance->lane_length + position.slice * instance->segment_length + starting_index;
+   if (0 == curr_offset % instance->lane_length) {
+       // Last block in this lane
+       prev_offset = curr_offset + instance->lane_length - 1;
+   } else {
+       // Previous block
+       prev_offset = curr_offset - 1;
+   }
+   // The running state starts as a copy of the previous block
+   memcpy(state, (uint8_t *) ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+   for (uint32_t i = starting_index; i < instance->segment_length; ++i, ++curr_offset, ++prev_offset) {
+       /*1.1 Rotating prev_offset if needed */
+       // After the first block of a lane, prev_offset (which pointed at the lane's last block) goes back to curr_offset - 1
+       if (curr_offset % instance->lane_length == 1) {
+           prev_offset = curr_offset - 1;
+       }
+
+       /* 1.2 Computing the index of the reference block */
+       /* 1.2.1 Taking pseudo-random value from the previous block */
+       if (data_independent_addressing) {
+           pseudo_rand = pseudo_rands[i];
+       } else {
+           pseudo_rand = instance->memory[prev_offset].v[0];
+       }
+
+       /* 1.2.2 Computing the lane of the reference block */
+       // Upper 32 bits select the lane, lower 32 bits select the block inside it (1.2.3)
+       ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+       if ((position.pass == 0) && (position.slice == 0)) {
+           // Can not reference other lanes yet
+           ref_lane = position.lane;
+       }
+
+       /* 1.2.3 Computing the number of possible reference block within the lane. */
+       position.index = i;
+       ref_index = IndexAlpha(instance, &position, pseudo_rand & 0xFFFFFFFF, ref_lane == position.lane);
+
+       /* 2 Creating a new block */
+       block* ref_block = instance->memory + instance->lane_length * ref_lane + ref_index;
+       block* curr_block = instance->memory + curr_offset;
+       // FillBlock updates state in place and writes the new block to curr_block
+       FillBlock(state, (uint8_t *) ref_block->v, (uint8_t *) curr_block->v, instance->Sbox);
+   }
+
+   free(pseudo_rands);
+   
+}
+
+/*
+ * Generates the S-box used by Argon2_ds from the first memory block.
+ * Starting from a copy of instance->memory[0], FillBlock() is applied twice with an all-zero
+ * state for each of the ARGON2_SBOX_SIZE / ARGON2_WORDS_IN_BLOCK chunks; the result of the
+ * second application becomes the next chunk of the S-box and the input of the next iteration.
+ * The table is allocated on first use and reused on later calls.
+ * @param instance Pointer to the current instance (NULL is a no-op); instance->memory[0]
+ *                 must already be filled
+ */
+void GenerateSbox(Argon2_instance_t* instance) {
+    if (instance == NULL) {
+        return;
+    }
+
+    if (instance->Sbox == NULL) {
+        instance->Sbox = (uint64_t*) malloc(sizeof(uint64_t)*ARGON2_SBOX_SIZE);
+        if (instance->Sbox == NULL) {
+            // The function cannot report an error, and continuing would either write through
+            // NULL below or leave Argon2_ds running without its S-box; stop here instead.
+            abort();
+        }
+    }
+
+    // A `block` only guarantees the alignment of uint64_t, while FillBlock() accesses its
+    // arguments as __m128i. The union gives every buffer 16-byte alignment and lets it be
+    // viewed both ways.
+    union {
+        block blk;
+        __m128i words[ARGON2_BLOCK_SIZE / sizeof (__m128i)];
+    } start, out, scratch;
+
+    InitBlockValue(&out.blk, 0);
+    start.blk = instance->memory[0];
+
+    for (uint32_t i = 0; i < ARGON2_SBOX_SIZE / ARGON2_WORDS_IN_BLOCK; ++i) {
+        // Two FillBlock applications, each starting from an all-zero state
+        InitBlockValue(&scratch.blk, 0);
+        FillBlock(scratch.words, (uint8_t*) start.blk.v, (uint8_t*) out.blk.v, NULL);
+        InitBlockValue(&scratch.blk, 0);
+        FillBlock(scratch.words, (uint8_t*) out.blk.v, (uint8_t*) start.blk.v, NULL);
+        memcpy(instance->Sbox + i * ARGON2_WORDS_IN_BLOCK, start.blk.v, ARGON2_BLOCK_SIZE);
+    }
+}
--- a/src/argon2-opt-core.h
+++ b/src/argon2-opt-core.h
@ -0,0 +1,51 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_OPT_CORE_H
+#define	ARGON2_OPT_CORE_H
+
+/*
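+ * Function fills a new memory block. Differs from the reference FillBlock() in that the previous block is passed as an SSE state (@state) that is updated in place.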
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be constructed
+ * @param Sbox Pointer to the Sbox (used in Argon2_ds only)
+ * @pre all block pointers must be valid
+ */
+void FillBlock(__m128i* state, const uint8_t *ref_block, uint8_t *next_block, const uint64_t* Sbox);
+
+
+/*
+ * Generate pseudo-random values to reference blocks in the segment and puts them into the array
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rands Pointer to the array of 64-bit values
+ * @pre pseudo_rands must point to @a instance->segment_length allocated values
+ */
+void GenerateAddresses(const Argon2_instance_t* instance, const Argon2_position_t* position, uint64_t* pseudo_rands);
+
+/*
+ * Function that fills the segment using previous segments also from other threads. 
+ * Identical to the reference code except that it calls optimized FillBlock()
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+void FillSegment(const Argon2_instance_t* instance, Argon2_position_t position);
+
+/*
+ * Generates the Sbox from the first memory block (must be ready at that time)
+ * @param instance Pointer to the current instance 
+ */
+void GenerateSbox(Argon2_instance_t* instance);
+
+#endif	/* ARGON2_OPT_CORE_H */
+
--- a/src/argon2-ref-core.c
+++ b/src/argon2-ref-core.c
@ -0,0 +1,186 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "argon2-core.h"
+#include "argon2-ref-core.h"
+#include "kat.h"
+
+
+#include "../Blake2/blake-round-mka.h"
+#include "../Blake2/blake2-impl.h"
+#include "../Blake2/blake2.h"
+
+
+#if defined(ARGON2_KAT) || defined(ARGON2_KAT_INTERNAL)
+/* The KAT file name */
+const char* ARGON2_KAT_FILENAME = "kat-argon2-ref.log";
+#endif
+
+
+/*
+ * Reference compression function: builds next_block from prev_block and ref_block.
+ * R = ref_block XOR prev_block is permuted with BLAKE2_ROUND_NOMSG (first on
+ * groups of 16 consecutive words, then on interleaved word pairs), XORed with
+ * the unpermuted R and stored in next_block. When Sbox is not NULL, a 64-bit
+ * value derived from R through 96 S-box lookups is added to the first and the
+ * last word of the result.
+ * All reads of prev_block/ref_block happen before next_block is written, so
+ * next_block may alias either input.
+ */
+void FillBlock(const block* prev_block, const block* ref_block, block* next_block, const uint64_t* Sbox) {
+    block mixed; /* R, permuted in place below */
+    block saved; /* untouched copy of R for the final XOR */
+    uint64_t sbox_word = 0;
+    unsigned group;
+
+    CopyBlock(&mixed, ref_block);
+    XORBlock(&mixed, prev_block);
+    CopyBlock(&saved, &mixed);
+
+    if (NULL != Sbox) {
+        int rounds_left = 6 * 16;
+        sbox_word = mixed.v[0] ^ mixed.v[ARGON2_WORDS_IN_BLOCK - 1];
+        while (rounds_left-- > 0) {
+            const uint32_t hi = sbox_word >> 32;
+            const uint32_t lo = sbox_word & 0xFFFFFFFF;
+            const uint64_t add_term = Sbox[hi & ARGON2_SBOX_MASK];
+            const uint64_t xor_term = Sbox[(lo & ARGON2_SBOX_MASK) + ARGON2_SBOX_SIZE / 2];
+            sbox_word = (uint64_t) hi * (uint64_t) lo;
+            sbox_word += add_term;
+            sbox_word ^= xor_term;
+        }
+    }
+
+    /* Blake2 round on words (0..15), then (16..31), ... finally (112..127) */
+    for (group = 0; group < 8; ++group) {
+        const unsigned base = 16 * group;
+        BLAKE2_ROUND_NOMSG(mixed.v[base], mixed.v[base + 1], mixed.v[base + 2], mixed.v[base + 3],
+                mixed.v[base + 4], mixed.v[base + 5], mixed.v[base + 6], mixed.v[base + 7],
+                mixed.v[base + 8], mixed.v[base + 9], mixed.v[base + 10], mixed.v[base + 11],
+                mixed.v[base + 12], mixed.v[base + 13], mixed.v[base + 14], mixed.v[base + 15]);
+    }
+
+    /* Blake2 round on words (0,1,16,17,...,112,113), then (2,3,18,19,...), ... finally (14,15,...,126,127) */
+    for (group = 0; group < 8; ++group) {
+        const unsigned base = 2 * group;
+        BLAKE2_ROUND_NOMSG(mixed.v[base], mixed.v[base + 1], mixed.v[base + 16], mixed.v[base + 17],
+                mixed.v[base + 32], mixed.v[base + 33], mixed.v[base + 48], mixed.v[base + 49],
+                mixed.v[base + 64], mixed.v[base + 65], mixed.v[base + 80], mixed.v[base + 81],
+                mixed.v[base + 96], mixed.v[base + 97], mixed.v[base + 112], mixed.v[base + 113]);
+    }
+
+    /* Feed-forward: next = R XOR P(R), then fold in the S-box word */
+    CopyBlock(next_block, &saved);
+    XORBlock(next_block, &mixed);
+    next_block->v[0] += sbox_word;
+    next_block->v[ARGON2_WORDS_IN_BLOCK - 1] += sbox_word;
+}
+
+/*
+ * Fills pseudo_rands[0 .. instance->segment_length - 1] with 64-bit values
+ * that depend only on the position (pass, lane, slice), on the instance
+ * parameters (memory_blocks, passes, type) and on a counter.
+ * A fresh address block is produced by two FillBlock() calls every
+ * ARGON2_ADDRESSES_IN_BLOCK values. Does nothing if instance or position is NULL.
+ */
+void GenerateAddresses(const Argon2_instance_t* instance, const Argon2_position_t* position, uint64_t* pseudo_rands) {
+    block zero_block, input_block, address_block;
+    uint32_t idx;
+
+    InitBlockValue(&zero_block, 0);
+    InitBlockValue(&input_block, 0);
+    InitBlockValue(&address_block, 0);
+
+    if (NULL == instance || NULL == position) {
+        return;
+    }
+
+    input_block.v[0] = position->pass;
+    input_block.v[1] = position->lane;
+    input_block.v[2] = position->slice;
+    input_block.v[3] = instance->memory_blocks;
+    input_block.v[4] = instance->passes;
+    input_block.v[5] = instance->type;
+
+    for (idx = 0; idx < instance->segment_length; ++idx) {
+        const uint32_t slot = idx % ARGON2_ADDRESSES_IN_BLOCK;
+        if (0 == slot) {
+            /* Current address block is used up: bump the counter and derive the next one */
+            input_block.v[6]++;
+            FillBlock(&zero_block, &input_block, &address_block, NULL);
+            FillBlock(&zero_block, &address_block, &address_block, NULL);
+        }
+        pseudo_rands[idx] = address_block.v[slot];
+    }
+}
+
+/*
+ * Fills one segment of the memory: the blocks of lane position.lane that
+ * belong to slice position.slice.
+ * For every block the reference block is selected either from precomputed
+ * pseudo-random values (Argon2_i, and Argon2_id during the first half of the
+ * first pass) or from the first word of the previous block, and the new block
+ * is produced with FillBlock().
+ * The address buffer is allocated only when data-independent addressing is
+ * used, so the data-dependent variants no longer depend on that allocation.
+ * @param instance Pointer to the current instance; nothing is done if NULL
+ * @param position Current position (passed by value; index is updated locally)
+ * @pre all block pointers must be valid
+ */
+void FillSegment(const Argon2_instance_t* instance, Argon2_position_t position) {
+    if (instance == NULL) {
+        return;
+    }
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    const bool data_independent_addressing = (instance->type == Argon2_i) || (instance->type == Argon2_id && (position.pass == 0) && (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    // Pseudo-random values that determine the reference block position.
+    // Stays NULL for data-dependent addressing, where it is never read.
+    uint64_t *pseudo_rands = NULL;
+    if (data_independent_addressing) {
+        pseudo_rands = (uint64_t*)malloc(sizeof(uint64_t)*(instance->segment_length));
+        if (pseudo_rands == NULL) {
+            // The function is void, so the failure cannot be reported to the caller
+            return;
+        }
+        GenerateAddresses(instance, &position, pseudo_rands);
+    }
+
+    uint32_t starting_index = 0;
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; // we have already generated the first two blocks
+    }
+
+    // Offset of the current block
+    curr_offset = position.lane * instance->lane_length + position.slice * instance->segment_length + starting_index;
+    if (0 == curr_offset % instance->lane_length) {
+        // First block of the lane: its predecessor is the last block in this lane
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        // Previous block
+        prev_offset = curr_offset - 1;
+    }
+
+    for (uint32_t i = starting_index; i < instance->segment_length; ++i, ++curr_offset, ++prev_offset) {
+        /* 1.1 Rotating prev_offset if needed */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the precomputed array or from the previous block */
+        if (data_independent_addressing) {
+            pseudo_rand = pseudo_rands[i];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+        if ((position.pass == 0) && (position.slice == 0)) {
+            // Can not reference other lanes yet
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference block within the lane. */
+        position.index = i;
+        ref_index = IndexAlpha(instance, &position, pseudo_rand & 0xFFFFFFFF, ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        block* ref_block = instance->memory + instance->lane_length * ref_lane + ref_index;
+        block* curr_block = instance->memory + curr_offset;
+        FillBlock(instance->memory + prev_offset, ref_block, curr_block, instance->Sbox);
+    }
+
+    free(pseudo_rands); // free(NULL) is a no-op
+}
+    
+
+/*
+ * Generates the Sbox from the first memory block (must be ready at that time).
+ * Allocates instance->Sbox (ARGON2_SBOX_SIZE 64-bit words) if it is still NULL
+ * and fills it block by block with repeated FillBlock() applications that
+ * start from instance->memory[0].
+ * @param instance Pointer to the current instance
+ * @note The function returns void, so an allocation failure cannot be
+ *       reported: instance->Sbox is then left NULL instead of being written
+ *       through. NOTE(review): callers should check instance->Sbox afterwards.
+ */
+void GenerateSbox(Argon2_instance_t* instance) {
+    if (instance == NULL || instance->memory == NULL) {
+        return;
+    }
+    block zero_block;
+    InitBlockValue(&zero_block,0);
+    block start_block = instance->memory[0];
+    block out_block = zero_block;
+
+    if (instance->Sbox == NULL) {
+        instance->Sbox = (uint64_t*)malloc(sizeof(uint64_t)*ARGON2_SBOX_SIZE);
+        if (instance->Sbox == NULL) {
+            /* Out of memory: do not memcpy() into a NULL pointer */
+            return;
+        }
+    }
+    for (uint32_t i = 0; i < ARGON2_SBOX_SIZE / ARGON2_WORDS_IN_BLOCK; ++i) {
+        /* Two compression calls per produced S-box block */
+        FillBlock(&zero_block, &start_block, &out_block, NULL);
+        FillBlock(&zero_block, &out_block, &start_block, NULL);
+        memcpy(instance->Sbox + i*ARGON2_WORDS_IN_BLOCK, start_block.v, ARGON2_BLOCK_SIZE);
+    }
+}
--- a/src/argon2-ref-core.h
+++ b/src/argon2-ref-core.h
@ -0,0 +1,50 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_REF_CORE_H
+#define	ARGON2_REF_CORE_H
+
+
+/*
+ * Function fills a new memory block
+ * @param prev_block Pointer to the previous block
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be constructed
+ * @param Sbox Pointer to the Sbox (used in Argon2_ds only)
+ * @pre all block pointers must be valid
+ */
+void FillBlock(const block* prev_block, const block* ref_block, block* next_block, const uint64_t* Sbox);
+
+/*
+ * Generate pseudo-random values to reference blocks in the segment and puts them into the array
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rands Pointer to the array of 64-bit values
+ * @pre pseudo_rands must point to @a instance->segment_length allocated values
+ */
+void GenerateAddresses(const Argon2_instance_t* instance, const Argon2_position_t* position, uint64_t* pseudo_rands);
+
+/*
+ * Function that fills the segment using previous segments also from other threads
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+void FillSegment(const Argon2_instance_t* instance, Argon2_position_t position);
+
+
+/*
+ * Generates the Sbox from the first memory block (must be ready at that time)
+ * @param instance Pointer to the current instance 
+ */
+void GenerateSbox(Argon2_instance_t* instance);
+#endif	/* ARGON2_REF_CORE_H */
+
--- a/src/argon2-test.c
+++ b/src/argon2-test.c
@ -0,0 +1,368 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "time.h"
+#include "argon2.h"
+#ifdef _MSC_VER
+#include "intrin.h"
+#endif 
+/* Enable timing measurements */
+#define _MEASURE
+
+/*
+ * Reads the processor time-stamp counter with the RDTSCP instruction.
+ * @param aux Receives the 32-bit value the instruction returns in ECX
+ * @return 64-bit counter value assembled from EDX:EAX
+ * @pre aux must be a valid pointer
+ */
+static inline uint64_t rdtscp(uint32_t *aux) {
+#ifdef _MSC_VER
+	return __rdtscp(aux);
+#else
+    uint64_t rax, rdx;
+    /* The ECX output is stored through the pointer; binding the operand to
+     * `aux` itself would only overwrite the local pointer variable. */
+    __asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (*aux) : : );
+    return (rdx << 32) + rax;
+#endif
+}
+
+/*
+ * Custom allocate memory: requests @a length bytes from malloc() and stores
+ * the resulting pointer in *memory.
+ * Returns ARGON2_OK on success and ARGON2_MEMORY_ALLOCATION_ERROR when
+ * malloc() returned NULL.
+ */
+int CustomAllocateMemory(uint8_t **memory, size_t length) {
+    uint8_t *buffer = (uint8_t*)malloc(length);
+
+    *memory = buffer;
+    return (NULL == buffer) ? ARGON2_MEMORY_ALLOCATION_ERROR : ARGON2_OK;
+}
+
+/*
+ * Custom free memory: releases a buffer with free().
+ * A NULL pointer is ignored; @a length is not used.
+ */
+void CustomFreeMemory(uint8_t *memory, size_t length) {
+    if (NULL == memory) {
+        return;
+    }
+    free(memory);
+}
+
+
+
+/*
+ * Benchmarks Argon2d, Argon2i, Argon2id and Argon2ds with a 16-byte password,
+ * a 16-byte salt, 16-byte output and t_cost 1, for m_cost doubling from 2^10
+ * to 2^22 and for 1, 2, 4, 6, 8 and 16 lanes/threads.
+ * When _MEASURE is defined, prints cycles per byte and Mcycles for each
+ * variant plus the processor time of the four runs together.
+ * Return codes of the Argon2 calls are not inspected.
+ */
+void Benchmark() {
+    const uint32_t inlen = 16;
+    const unsigned outlen=16;
+    unsigned char out[outlen];
+    unsigned char pwd_array[inlen];
+    unsigned char salt_array[inlen];
+
+    uint32_t t_cost = 1;
+
+    memset(pwd_array, 0, inlen);
+    memset(salt_array, 1, inlen);
+    uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16};
+    /* Derived from the table so the loop bound cannot drift from it */
+    const uint32_t thread_test_count = (uint32_t) (sizeof(thread_test) / sizeof(thread_test[0]));
+
+    uint32_t m_cost;
+    for (m_cost = (uint32_t) 1 << 10; m_cost <= (uint32_t) 1 << 22; m_cost *= 2) {
+        uint32_t i;
+        for (i = 0; i < thread_test_count; ++i) {
+            uint32_t thread_n = thread_test[i];
+#ifdef _MEASURE
+            uint64_t start_cycles, stop_cycles, stop_cycles_i, stop_cycles_di, stop_cycles_ds;
+            uint32_t ui1, ui2, ui3, ui4, ui5;
+
+            clock_t start_time = clock();
+            start_cycles = rdtscp(&ui1);
+#endif
+
+            /* thread_n is used both as the number of lanes and as the number of threads */
+            Argon2_Context context = {out, outlen, pwd_array, inlen, salt_array, inlen,
+                NULL, 0, NULL, 0, t_cost, m_cost, thread_n, thread_n, NULL, NULL, false, false, false };
+            Argon2d(&context);
+
+#ifdef _MEASURE
+            stop_cycles = rdtscp(&ui2);
+#endif
+            Argon2i(&context);
+#ifdef _MEASURE
+            stop_cycles_i = rdtscp(&ui3);
+#endif
+            Argon2id(&context);
+#ifdef _MEASURE
+            stop_cycles_di = rdtscp(&ui4);
+#endif
+            Argon2ds(&context);
+#ifdef _MEASURE
+            stop_cycles_ds = rdtscp(&ui5);
+            clock_t stop_time = clock();
+
+            /* Cycles per m_cost unit; dividing by 1024 below turns this into cycles per byte */
+            uint64_t delta_d = (stop_cycles - start_cycles) / (m_cost);
+            uint64_t delta_i = (stop_cycles_i - stop_cycles) / (m_cost);
+            uint64_t delta_id = (stop_cycles_di - stop_cycles_i) / m_cost;
+            uint64_t delta_ds = (stop_cycles_ds - stop_cycles_di) / m_cost;
+            float mcycles_d = (float) (stop_cycles - start_cycles) / (1 << 20);
+            float mcycles_i = (float) (stop_cycles_i - stop_cycles) / (1 << 20);
+            float mcycles_id = (float) (stop_cycles_di - stop_cycles_i) / (1 << 20);
+            float mcycles_ds = (float) (stop_cycles_ds - stop_cycles_di) / (1 << 20);
+            /* The integer arguments are unsigned, hence %u rather than %d */
+            printf("Argon2d %u pass(es)  %u Mbytes %u threads:  %2.2f cpb %2.2f Mcycles \n", (unsigned) t_cost, (unsigned) (m_cost >> 10), (unsigned) thread_n, (float) delta_d / 1024, mcycles_d);
+            printf("Argon2i %u pass(es)  %u Mbytes %u threads:  %2.2f cpb %2.2f Mcycles \n", (unsigned) t_cost, (unsigned) (m_cost >> 10), (unsigned) thread_n, (float) delta_i / 1024, mcycles_i);
+            printf("Argon2id %u pass(es)  %u Mbytes %u threads:  %2.2f cpb %2.2f Mcycles \n", (unsigned) t_cost, (unsigned) (m_cost >> 10), (unsigned) thread_n, (float) delta_id / 1024, mcycles_id);
+            printf("Argon2ds %u pass(es)  %u Mbytes %u threads:  %2.2f cpb %2.2f Mcycles \n", (unsigned) t_cost, (unsigned) (m_cost >> 10), (unsigned) thread_n, (float) delta_ds / 1024, mcycles_ds);
+
+            float run_time = ((float) stop_time - start_time) / (CLOCKS_PER_SEC);
+            printf("%2.4f seconds\n\n", run_time);
+#endif
+        }
+    }
+}
+
+/*
+ * Calls Argon2 with a fixed password, salt, secret and associated data and
+ * with user-defined cost parameters.
+ * @param out Output buffer; must hold at least 32 bytes
+ * @param t_cost Number of passes
+ * @param m_cost Memory cost, passed to the context unchanged
+ * @param lanes Number of lanes
+ * @param threads Number of threads (now forwarded to the context; previously
+ *        the lanes value was passed twice and this argument was ignored)
+ * @param type One of "Argon2d", "Argon2i", "Argon2ds", "Argon2id"
+ * When _MEASURE is defined, the timing report is printed after a successful
+ * dispatch. For an unknown type only "Wrong Argon2 type!" is printed.
+ * The return code of the Argon2 call is not inspected.
+ */
+void Run(uint8_t *out, uint32_t t_cost, uint32_t m_cost, uint32_t lanes, uint32_t threads,const char* type) {
+#ifdef _MEASURE
+    uint64_t start_cycles, stop_cycles, delta;
+    uint32_t ui1, ui2;
+
+    clock_t start_time = clock();
+    start_cycles = rdtscp(&ui1);
+#endif
+
+    /*Fixed parameters*/
+    const unsigned out_length = 32;
+    const unsigned pwd_length = 32;
+    const unsigned salt_length = 16;
+    const unsigned secret_length = 8;
+    const unsigned ad_length = 12;
+    bool clear_memory = false;
+    bool clear_secret = false;
+    bool clear_password = false;
+    uint8_t pwd[pwd_length];
+    uint8_t salt[salt_length];
+    uint8_t secret[secret_length];
+    uint8_t ad[ad_length];
+
+    memset(pwd, 1, pwd_length);
+    memset(salt, 2, salt_length);
+    memset(secret, 3, secret_length);
+    memset(ad, 4, ad_length);
+
+    Argon2_Context context={out, out_length, pwd, pwd_length, salt, salt_length,
+            secret, secret_length, ad, ad_length, t_cost, m_cost, lanes, threads,
+            NULL, NULL,
+            clear_password, clear_secret, clear_memory};
+
+    if (strcmp(type,"Argon2d")==0) {
+        printf("Test Argon2d\n");
+        Argon2d(&context);
+    } else if (strcmp(type,"Argon2i")==0) {
+        printf("Test Argon2i\n");
+        Argon2i(&context);
+    } else if (strcmp(type,"Argon2ds")==0) {
+        printf("Test Argon2ds\n");
+        Argon2ds(&context);
+    } else if (strcmp(type,"Argon2id")==0) {
+        printf("Test Argon2id\n");
+        Argon2id(&context);
+    } else {
+        /* Nothing was hashed, so there is nothing to time */
+        printf("Wrong Argon2 type!\n");
+        return;
+    }
+
+#ifdef _MEASURE
+    stop_cycles = rdtscp(&ui2);
+    clock_t finish_time = clock();
+
+    /* Guard against division by zero for a zero memory cost */
+    delta = (m_cost != 0) ? (stop_cycles - start_cycles) / (m_cost) : 0;
+    float mcycles = (float) (stop_cycles - start_cycles) / (1 << 20);
+    printf("Argon:  %2.2f cpb %2.2f Mcycles ", (float) delta / 1024, mcycles);
+
+    float run_time = ((float) finish_time - start_time) / (CLOCKS_PER_SEC);
+    printf("%2.4f seconds\n", run_time);
+#endif
+}
+
+/*
+ * Runs the Argon2 variant named by @a type once with fixed inputs
+ * (32-byte password of 0x01, 16-byte salt of 0x02, 8-byte secret of 0x03,
+ * 12-byte associated data of 0x04, t_cost 3, m_cost 16, 4 lanes and threads,
+ * default allocator, no clearing). Prints which KAT file is used when
+ * ARGON2_KAT or ARGON2_KAT_INTERNAL is defined, and a hint otherwise.
+ * The return code of the Argon2 call is not inspected.
+ */
+void GenerateTestVectors(const char* type) {
+    enum { OUT_LEN = 32, PWD_LEN = 32, SALT_LEN = 16, SECRET_LEN = 8, AD_LEN = 12 };
+    unsigned char out[OUT_LEN];
+    unsigned char pwd[PWD_LEN];
+    unsigned char salt[SALT_LEN];
+    unsigned char secret[SECRET_LEN];
+    unsigned char ad[AD_LEN];
+    const unsigned passes = 3;
+    const unsigned memory = 16;
+    const unsigned parallelism = 4;
+
+    memset(pwd, 1, sizeof(pwd));
+    memset(salt, 2, sizeof(salt));
+    memset(secret, 3, sizeof(secret));
+    memset(ad, 4, sizeof(ad));
+
+#if defined(ARGON2_KAT) || defined(ARGON2_KAT_INTERNAL)
+    printf("Generate test vectors in file: \"%s\".\n", ARGON2_KAT_FILENAME);
+#else
+    printf("Enable ARGON2_KAT to generate the test vectors.\n");
+#endif
+
+    /* NULL callbacks select the internal allocator; all clear flags are off */
+    Argon2_Context context={out, OUT_LEN, pwd, PWD_LEN, salt, SALT_LEN,
+            secret, SECRET_LEN, ad, AD_LEN, passes, memory, parallelism, parallelism,
+            NULL, NULL,
+            false, false, false};
+
+    if (0 == strcmp(type,"Argon2d")) {
+        printf("Test Argon2d\n");
+        Argon2d(&context);
+    } else if (0 == strcmp(type,"Argon2i")) {
+        printf("Test Argon2i\n");
+        Argon2i(&context);
+    } else if (0 == strcmp(type,"Argon2ds")) {
+        printf("Test Argon2ds\n");
+        Argon2ds(&context);
+    } else if (0 == strcmp(type,"Argon2id")) {
+        printf("Test Argon2id\n");
+        Argon2id(&context);
+    } else {
+        printf("Wrong Argon2 type!\n");
+    }
+}
+
+
+/* Prints the option summary shown for -help, including the current defaults. */
+static void PrintUsage(uint32_t t_cost, uint32_t m_cost, uint32_t threads) {
+    printf("====================================== \n");
+    printf("Argon2 - test implementation \n");
+    printf("====================================== \n");
+    printf("Options:\n");
+    printf("\t -logmcost < Base 2 logarithm of m_cost : 0..23 > \n");
+    printf("\t -tcost < t_cost : 0..2^24 > \n");
+    printf("\t -lanes < Number of lanes : %u.. %u>\n", ARGON2_MIN_LANES, ARGON2_MAX_LANES);
+    printf("\t -threads < Number of threads : %u.. %u>\n", ARGON2_MIN_THREADS, ARGON2_MAX_THREADS);
+    printf("\t -type <Argon2d; Argon2ds; Argon2i; Argon2id >\n");
+    printf("\t -gen-tv\n");
+    printf("\t -benchmark\n");
+    printf("\t -help\n");
+    printf("If no arguments given, Argon2 is called with default parameters t_cost=%d, "
+            "m_cost=%d and threads=%d.\n", t_cost, m_cost, threads);
+}
+
+/*
+ * Command-line driver. Parses the options left to right:
+ *  -help prints the usage text and exits; -benchmark runs Benchmark() and
+ *  exits; -logmcost, -tcost, -threads, -lanes and -type consume the following
+ *  argument (and are ignored when no argument follows); -gen-tv selects
+ *  test-vector generation. Unknown arguments are ignored.
+ * After parsing, either GenerateTestVectors() or Run() is called. Always returns 0.
+ */
+int main(int argc, char* argv[]) {
+    unsigned char out[32];
+    uint32_t m_cost = 1 << 18;
+    uint32_t t_cost = 3;
+    uint32_t lanes=4;
+    uint32_t threads = 4;
+    bool generate_test_vectors = false;
+    const char* type= "Argon2d";
+
+#ifdef ARGON2_KAT
+    remove(ARGON2_KAT_FILENAME);
+#endif
+
+    for (int i = 1; i < argc; i++) {
+        const char* option = argv[i];
+        /* True when another argument follows that can serve as the option's value */
+        const bool has_value = (i < argc - 1);
+
+        if (0 == strcmp(option, "-help")) {
+            PrintUsage(t_cost, m_cost, threads);
+            return 0;
+        } else if (0 == strcmp(option, "-benchmark")) {
+            Benchmark();
+            return 0;
+        } else if (0 == strcmp(option, "-gen-tv")) {
+            generate_test_vectors = true;
+        } else if (0 == strcmp(option, "-logmcost")) {
+            if (has_value) {
+                i++;
+                m_cost = (uint8_t) 1 << ((uint8_t)atoi(argv[i]) % 24);
+            }
+        } else if (0 == strcmp(option, "-tcost")) {
+            if (has_value) {
+                i++;
+                t_cost = atoi(argv[i]) & 0xffffff;
+            }
+        } else if (0 == strcmp(option, "-threads")) {
+            if (has_value) {
+                i++;
+                threads = atoi(argv[i]) % ARGON2_MAX_THREADS;
+            }
+        } else if (0 == strcmp(option, "-lanes")) {
+            if (has_value) {
+                i++;
+                lanes = atoi(argv[i]) % ARGON2_MAX_LANES;
+            }
+        } else if (0 == strcmp(option, "-type")) {
+            if (has_value) {
+                i++;
+                type = argv[i];
+            }
+        }
+    }
+
+    if (generate_test_vectors) {
+        GenerateTestVectors(type);
+    } else {
+        /*No benchmark, no test vectors, just run*/
+        Run(out,  t_cost, m_cost, lanes, threads, type);
+    }
+
+    return 0;
+}
--- a/src/argon2.c
+++ b/src/argon2.c
@ -0,0 +1,171 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#include "stdint.h" 
+#include "stdbool.h"
+#include <string.h>
+#include "stdio.h"
+
+#include "argon2.h"
+#include "argon2-core.h"
+
+
+/*************************Argon2 input parameter restrictions**************************************************/
+
+/* Minimum and maximum number of lanes (degree of parallelism) */
+const uint32_t ARGON2_MIN_LANES = 1;
+const uint32_t ARGON2_MAX_LANES = 0xFFFFFF;
+
+/* Minimum and maximum number of threads */
+const uint32_t ARGON2_MIN_THREADS = 1;
+const uint32_t ARGON2_MAX_THREADS = 0xFFFFFF;
+
+/* Number of synchronization points between lanes per pass.
+ * NOTE(review): the same macro is also defined (with the same value) in
+ * argon2.h, and identifiers starting with a double underscore are reserved. */
+#define __ARGON_SYNC_POINTS 4
+const uint32_t ARGON2_SYNC_POINTS = __ARGON_SYNC_POINTS;
+
+/* Minimum and maximum digest size in bytes */
+const uint32_t ARGON2_MIN_OUTLEN = 4;
+const uint32_t ARGON2_MAX_OUTLEN = 0xFFFFFFFF;
+
+/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
+const uint32_t ARGON2_MIN_MEMORY = 2 * __ARGON_SYNC_POINTS; // 2 blocks per slice
+const uint32_t ARGON2_MAX_MEMORY = 0xFFFFFFFF; // 2^32-1 blocks
+
+/* Minimum and maximum number of passes */
+const uint32_t ARGON2_MIN_TIME = 1;
+const uint32_t ARGON2_MAX_TIME = 0xFFFFFFFF;
+
+/* Minimum and maximum password length in bytes */
+const uint32_t ARGON2_MIN_PWD_LENGTH = 0;
+const uint32_t ARGON2_MAX_PWD_LENGTH = 0xFFFFFFFF;
+
+/* Minimum and maximum associated data length in bytes */
+const uint32_t ARGON2_MIN_AD_LENGTH = 0;
+const uint32_t ARGON2_MAX_AD_LENGTH = 0xFFFFFFFF;
+
+/* Minimum and maximum salt length in bytes */
+const uint32_t ARGON2_MIN_SALT_LENGTH = 8;
+const uint32_t ARGON2_MAX_SALT_LENGTH = 0xFFFFFFFF;
+
+/* Minimum and maximum key (secret) length in bytes */
+const uint32_t ARGON2_MIN_SECRET = 0;
+const uint32_t ARGON2_MAX_SECRET = 0xFFFFFFFF;
+
+
+/************************* Error messages *********************************************************************************/
+
+/*
+ * Human-readable messages, indexed by Argon2_ErrorCodes value.
+ * The table must contain exactly one entry for every code below
+ * ARGON2_ERROR_CODES_LENGTH, in enum order; the entries for the two thread
+ * codes (28 and 29) were missing, which made ErrorMessage() read past the end.
+ */
+const char* Argon2_ErrorMessage[] = {
+    "OK", /* ARGON2_OK */
+
+    "Output pointer is NULL", /* ARGON2_OUTPUT_PTR_NULL */
+
+    "Output is too short", /* ARGON2_OUTPUT_TOO_SHORT */
+    "Output is too long", /* ARGON2_OUTPUT_TOO_LONG */
+
+    "Password is too short", /* ARGON2_PWD_TOO_SHORT */
+    "Password is too long", /* ARGON2_PWD_TOO_LONG */
+
+    "Salt is too short", /* ARGON2_SALT_TOO_SHORT */
+    "Salt is too long", /* ARGON2_SALT_TOO_LONG */
+
+    "Associated data is too short", /* ARGON2_AD_TOO_SHORT */
+    "Associated data is too long", /* ARGON2_AD_TOO_LONG */
+
+    "Secret is too short", /* ARGON2_SECRET_TOO_SHORT */
+    "Secret is too long", /* ARGON2_SECRET_TOO_LONG */
+
+    "Time cost is too small", /* ARGON2_TIME_TOO_SMALL */
+    "Time cost is too large", /* ARGON2_TIME_TOO_LARGE */
+
+    "Memory cost is too small", /* ARGON2_MEMORY_TOO_LITTLE */
+    "Memory cost is too large", /* ARGON2_MEMORY_TOO_MUCH */
+
+    "Too few lanes", /* ARGON2_LANES_TOO_FEW */
+    "Too many lanes", /* ARGON2_LANES_TOO_MANY */
+
+    "Password pointer is NULL, but password length is not 0", /* ARGON2_PWD_PTR_MISMATCH */
+    "Salt pointer is NULL, but salt length is not 0", /* ARGON2_SALT_PTR_MISMATCH */
+    "Secret pointer is NULL, but secret length is not 0", /* ARGON2_SECRET_PTR_MISMATCH */
+    "Associated data pointer is NULL, but ad length is not 0", /* ARGON2_AD_PTR_MISMATCH */
+
+    "Memory allocation error", /* ARGON2_MEMORY_ALLOCATION_ERROR */
+
+    "The free memory callback is NULL", /* ARGON2_FREE_MEMORY_CBK_NULL */
+    "The allocate memory callback is NULL", /* ARGON2_ALLOCATE_MEMORY_CBK_NULL */
+
+    "Argon2_Context context is NULL", /* ARGON2_INCORRECT_PARAMETER */
+    "There is no such version of Argon2", /* ARGON2_INCORRECT_TYPE */
+
+    "Output pointer mismatch", /* ARGON2_OUT_PTR_MISMATCH */
+
+    "Too few threads", /* ARGON2_THREADS_TOO_FEW */
+    "Too many threads" /* ARGON2_THREADS_TOO_MANY */
+};
+
+/*
+ * Password-hashing entry point with the PHC-style signature.
+ * Runs Argon2Core() with type Argon2_d, one lane, one thread, no secret, no
+ * associated data and the default allocator. The context asks for the
+ * password and the secret to be cleared, but not the memory.
+ * @param out Output buffer of at least @a outlen bytes
+ * @param in Password, @a inlen bytes
+ * @param salt Salt, @a saltlen bytes
+ * @param t_cost Number of passes
+ * @param m_cost Memory cost, passed to the context unchanged
+ * @return Result of Argon2Core(), or ARGON2_OUTPUT_TOO_LONG /
+ *         ARGON2_PWD_TOO_LONG / ARGON2_SALT_TOO_LONG when a length does not
+ *         fit the 32-bit context fields (it used to be truncated silently)
+ * NOTE(review): clear_password is true while @a in is a pointer to const;
+ * if the core wipes the password it writes through that pointer - confirm
+ * this is intended.
+ */
+int PHS(void *out, size_t outlen, const void *in, size_t inlen, const void *salt,
+        size_t saltlen, unsigned int t_cost, unsigned int m_cost) {
+    /* The context stores lengths as uint32_t; reject anything larger instead
+     * of hashing a truncated length */
+    if (outlen > ARGON2_MAX_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+    if (inlen > ARGON2_MAX_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+    if (saltlen > ARGON2_MAX_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_LONG;
+    }
+
+    uint8_t* default_secret_ptr = NULL;
+    uint32_t default_secret_length = 0;
+    uint8_t* default_ad_ptr = NULL;
+    uint32_t default_ad_length = 0;
+    uint8_t default_parallelism = 1;
+    AllocateMemoryCallback default_a_cbk = NULL;
+    FreeMemoryCallback default_f_cbk= NULL;
+    bool c_p=true;
+    bool c_s=true;
+    bool c_m=false;
+
+    /* Positional initializer: the struct declares secret before associated data */
+    Argon2_Context context = {(uint8_t*) out, (uint32_t) outlen,
+            (uint8_t*) in, (uint32_t) inlen,
+            (uint8_t*) salt, (uint32_t) saltlen,
+            default_secret_ptr, default_secret_length,
+            default_ad_ptr, default_ad_length,
+            (uint32_t) t_cost, (uint32_t) m_cost, default_parallelism,default_parallelism,default_a_cbk,default_f_cbk,
+    c_p,c_s,c_m};
+
+    return Argon2Core(&context, Argon2_d);
+}
+
+/* Runs Argon2Core() on @a context with type Argon2_d and returns its result. */
+int Argon2d(Argon2_Context* context) {
+    return Argon2Core(context, Argon2_d);
+}
+
+/* Runs Argon2Core() on @a context with type Argon2_i and returns its result. */
+int Argon2i(Argon2_Context* context) {
+    return Argon2Core(context, Argon2_i);
+}
+
+/* Runs Argon2Core() on @a context with type Argon2_id and returns its result. */
+int Argon2id(Argon2_Context* context) {
+    return Argon2Core(context, Argon2_id);
+}
+
+/* Runs Argon2Core() on @a context with type Argon2_ds and returns its result. */
+int Argon2ds(Argon2_Context* context) {
+    return Argon2Core(context, Argon2_ds);
+}
+
+/*
+ * Recomputes the Argon2d hash for @a context and compares it with @a hash.
+ * @param context Argon2 inputs; context->out receives the recomputed hash
+ * @param hash Expected hash, at least context->outlen bytes
+ * @return 1 if the hashes match, 0 if they differ;
+ *         ARGON2_INCORRECT_PARAMETER if context is NULL,
+ *         ARGON2_OUT_PTR_MISMATCH if outlen is 0 or hash is NULL,
+ *         otherwise the non-OK code returned by Argon2Core().
+ * NOTE(review): the "match" value 1 equals ARGON2_OUTPUT_PTR_NULL, so callers
+ * cannot tell those two outcomes apart; kept for interface compatibility.
+ */
+int VerifyD(Argon2_Context* context, const char *hash) {
+    if (NULL == context) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+    if (0 == context->outlen || NULL == hash) {
+        return ARGON2_OUT_PTR_MISMATCH;
+    }
+
+    int result = Argon2Core(context, Argon2_d);
+    if (ARGON2_OK != result) {
+        return result;
+    }
+
+    /* Compare every byte regardless of earlier mismatches, so the running
+     * time does not reveal how long the matching prefix is (memcmp may stop
+     * at the first differing byte). */
+    const volatile uint8_t* expected = (const volatile uint8_t*) hash;
+    const volatile uint8_t* computed = context->out;
+    uint8_t difference = 0;
+    for (uint32_t i = 0; i < context->outlen; ++i) {
+        difference |= (uint8_t) (expected[i] ^ computed[i]);
+    }
+
+    return 0 == difference;
+}
+
+/*
+ * Maps an Argon2 error code to its message.
+ * @param error_code Value of Argon2_ErrorCodes
+ * @return Entry of Argon2_ErrorMessage for valid codes, "Unknown error code."
+ *         for negative codes, codes not below ARGON2_ERROR_CODES_LENGTH, or
+ *         codes for which the table has no entry
+ */
+const char* ErrorMessage(int error_code) {
+    /* Bound the index by the real table size as well, so a table that is
+     * shorter than the enum can never be read out of bounds */
+    const size_t message_count = sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0]);
+
+    if (error_code >= 0 && error_code < ARGON2_ERROR_CODES_LENGTH
+            && (size_t) error_code < message_count) {
+        return Argon2_ErrorMessage[error_code];
+    }
+
+    return "Unknown error code.";
+}
--- a/src/argon2.h
+++ b/src/argon2.h
@ -0,0 +1,253 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+#pragma once
+
+#ifndef __ARGON2_H__
+#define __ARGON2_H__
+
+#include "stddef.h"
+#include "stdbool.h"
+
+/************************* Constants to enable Known Answer Tests (KAT)  **************************************************/
+/* Enable ARGON2_KAT */
+//#define ARGON2_KAT
+//#define ARGON2_KAT_INTERNAL
+
+
+#if defined(ARGON2_KAT) || defined(ARGON2_KAT_INTERNAL)
+/* The KAT file name */
+extern const char* ARGON2_KAT_FILENAME;
+#endif 
+
+
+/*************************Argon2 input parameter restrictions**************************************************/
+
+/* Minimum and maximum number of lanes (degree of parallelism) */
+extern const uint32_t ARGON2_MIN_LANES ;
+extern const uint32_t ARGON2_MAX_LANES;
+
+extern const uint32_t ARGON2_MIN_THREADS ;
+extern const uint32_t ARGON2_MAX_THREADS;
+
+/* Number of synchronization points between lanes per pass */
+#define __ARGON_SYNC_POINTS 4
+extern const uint32_t ARGON2_SYNC_POINTS;
+
+/* Minimum and maximum digest size in bytes */
+extern const uint32_t ARGON2_MIN_OUTLEN ;
+extern const uint32_t ARGON2_MAX_OUTLEN ;
+
+/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
+extern const uint32_t ARGON2_MIN_MEMORY ; // 2 blocks per slice
+extern const uint32_t ARGON2_MAX_MEMORY; // 2^32-1 blocks
+
+/* Minimum and maximum number of passes */
+extern const uint32_t ARGON2_MIN_TIME ;
+extern const uint32_t ARGON2_MAX_TIME;
+
+/* Minimum and maximum password length in bytes */
+extern const uint32_t ARGON2_MIN_PWD_LENGTH ;
+extern const uint32_t ARGON2_MAX_PWD_LENGTH ;
+
+/* Minimum and maximum associated data length in bytes */
+extern const uint32_t ARGON2_MIN_AD_LENGTH ;
+extern const uint32_t ARGON2_MAX_AD_LENGTH ;
+
+/* Minimum and maximum salt length in bytes */
+extern const uint32_t ARGON2_MIN_SALT_LENGTH ;
+extern  const uint32_t ARGON2_MAX_SALT_LENGTH;
+
+/* Minimum and maximum key length in bytes */
+extern const uint32_t ARGON2_MIN_SECRET ;
+extern const uint32_t ARGON2_MAX_SECRET ;
+
+/************************* Error codes *********************************************************************************/
+/*
+ * Status codes returned by the Argon2 functions. ARGON2_OK (0) means success.
+ * The numeric values are used as indices into the Argon2_ErrorMessage table
+ * in argon2.c, so the two must be kept in the same order and length.
+ */
+typedef enum _Argon2_ErrorCodes {
+    ARGON2_OK = 0,
+
+    ARGON2_OUTPUT_PTR_NULL = 1,
+
+    ARGON2_OUTPUT_TOO_SHORT = 2,
+    ARGON2_OUTPUT_TOO_LONG = 3,
+
+    ARGON2_PWD_TOO_SHORT = 4,
+    ARGON2_PWD_TOO_LONG = 5,
+
+    ARGON2_SALT_TOO_SHORT = 6,
+    ARGON2_SALT_TOO_LONG = 7,
+
+    ARGON2_AD_TOO_SHORT = 8,
+    ARGON2_AD_TOO_LONG = 9,
+
+    ARGON2_SECRET_TOO_SHORT = 10,
+    ARGON2_SECRET_TOO_LONG = 11,
+
+    ARGON2_TIME_TOO_SMALL = 12,
+    ARGON2_TIME_TOO_LARGE = 13,
+
+    ARGON2_MEMORY_TOO_LITTLE = 14,
+    ARGON2_MEMORY_TOO_MUCH = 15,
+
+    ARGON2_LANES_TOO_FEW = 16,
+    ARGON2_LANES_TOO_MANY = 17,
+
+    ARGON2_PWD_PTR_MISMATCH = 18, //NULL ptr with non-zero length
+    ARGON2_SALT_PTR_MISMATCH = 19, //NULL ptr with non-zero length
+    ARGON2_SECRET_PTR_MISMATCH = 20, //NULL ptr with non-zero length
+    ARGON2_AD_PTR_MISMATCH = 21, //NULL ptr with non-zero length
+
+    ARGON2_MEMORY_ALLOCATION_ERROR = 22,
+
+    ARGON2_FREE_MEMORY_CBK_NULL = 23,
+    ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24,
+
+    ARGON2_INCORRECT_PARAMETER = 25,
+    ARGON2_INCORRECT_TYPE = 26,
+
+    ARGON2_OUT_PTR_MISMATCH = 27,
+            
+    ARGON2_THREADS_TOO_FEW = 28,
+    ARGON2_THREADS_TOO_MANY = 29,
+
+    ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after this error code */
+} Argon2_ErrorCodes;
+
+
+
+/********************************************* Memory allocator types --- for external allocation *************************************************************/
+typedef int (*AllocateMemoryCallback)(uint8_t **memory, size_t bytes_to_allocate);
+typedef void(*FreeMemoryCallback)(uint8_t *memory, size_t bytes_to_allocate);
+
+/********************************************* Argon2 external data structures*************************************************************/
+
+/*
+ *****Context: structure to hold Argon2 inputs: 
+ * output array and its length, 
+ * password and its length,
+ * salt and its length,
+ * secret and its length,
+ * associated data and its length,
+ * number of passes, amount of used memory (in KBytes, can be rounded up a bit),
+ * number of lanes and maximum number of threads that will be run.
+ * All the parameters above affect the output hash value.
+ * Additionally, two function pointers can be provided to allocate and deallocate the memory (if NULL, memory will be allocated internally).
+ * Also, three flags indicate whether to erase password, secret as soon as they are pre-hashed (and thus not needed anymore), and the entire memory
+ * The fields are initialized positionally throughout the package, so their order below matters
+ * (note that secret comes before associated data).
+ ****************************
+ Simplest situation: you have output array out[8], password is stored in pwd[32], salt is stored in salt[16], you do not have keys nor associated data.
+ You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel lanes and 4 threads.
+ You want to erase the password, but you're OK with last pass not being erased.
+ You want to use the default memory allocator.
+ Then you initialize
+ Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false).
+ */
+typedef struct _Argon2_Context {
+    uint8_t *out; //output array
+    const uint32_t outlen; //digest length
+
+    uint8_t *pwd; //password array
+    uint32_t pwdlen; //password length
+
+    const uint8_t *salt; //salt array
+    const uint32_t saltlen; //salt length
+
+    uint8_t *secret; //key array
+    uint32_t secretlen; //key length
+
+    const uint8_t *ad; //associated data array
+    const uint32_t adlen; //associated data length
+
+    const uint32_t t_cost; //number of passes
+    const uint32_t m_cost; //amount of memory requested (KB)
+    const uint32_t lanes; //number of lanes
+    const uint32_t threads; //maximum number of threads
+
+    AllocateMemoryCallback allocate_cbk; //pointer to memory allocator
+    FreeMemoryCallback free_cbk; //pointer to memory deallocator
+
+    const bool clear_password; //whether to clear the password array
+    const bool clear_secret; //whether to clear the secret array
+    const bool clear_memory; //whether to clear the memory after the run
+
+    
+} Argon2_Context;
+
+
+
+/**
+ * Function to hash the inputs in the memory-hard fashion
+ * @param  out  Pointer to the memory where the hash digest will be written
+ * @param  outlen Digest length in bytes
+ * @param  in Pointer to the input (password)
+ * @param  inlen Input length in bytes
+ * @param  salt Pointer to the salt
+ * @param  saltlen Salt length in bytes
+ * @param  t_cost Time cost (presumably the number of passes, like Argon2_Context::t_cost -- TODO confirm)
+ * @param  m_cost Memory cost (presumably in KB, like Argon2_Context::m_cost -- TODO confirm)
+ * @pre    @a out must have at least @a outlen bytes allocated
+ * @pre    @a in must be at least @a inlen bytes long
+ * @pre    @a salt must be at least @a saltlen bytes long
+ * @return Zero if successful, 1 otherwise.
+ */
+extern  int PHS(void *out, size_t outlen, const void *in, size_t inlen, const void *salt, size_t saltlen,
+        unsigned int t_cost, unsigned int m_cost);
+
+/*
+ * Argon2d: version of Argon2 that picks memory blocks depending on the password and salt.
+ * Only for side-channel-free environments!
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int Argon2d(Argon2_Context* context);
+
+/*
+ * Argon2i: version of Argon2 that picks memory blocks independently of the password and salt.
+ * Good against side channels, but worse w.r.t. tradeoff attacks if only one pass is used.
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int Argon2i(Argon2_Context* context);
+
+/*
+ * Argon2di: reserved name.
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int Argon2di(Argon2_Context* context);
+
+/*
+ * Argon2ds: Argon2d hardened against GPU attacks, 20% slower.
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int Argon2ds(Argon2_Context* context);
+
+
+/*
+ * Argon2id: the first half-pass over memory is password-independent, the rest are password-dependent.
+ * OK against side channels: they reduce to 1/2-pass Argon2i.
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int Argon2id(Argon2_Context* context);
+
+/*
+ * Verify whether a given password is correct for Argon2d hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify against. Its length is not passed separately:
+ *               it is taken from the context's outlen member
+ * @return  Zero if successful, a non-zero error code otherwise
+ */
+extern int VerifyD(Argon2_Context* context, const char *hash);
+
+/*
+ * Get the associated error message for given error code
+ * @param  error_code  Error code to describe (presumably one of the Argon2_ErrorCodes values -- TODO confirm)
+ * @return  The error message associated with the given error code
+ */
+const char* ErrorMessage(int error_code);
+
+#endif
--- a/src/blake2/blake-round-mka.h
+++ b/src/blake2/blake-round-mka.h
@ -0,0 +1,44 @@
+#pragma once
+
+#ifndef __BLAKE_ROUND_MKA_H__
+#define __BLAKE_ROUND_MKA_H__
+
+
+#define G(a,b,c,d) \
+	a = fBlaMka(a, b) ; \
+	d = rotr64(d ^ a, 32); \
+	c = fBlaMka(c, d); \
+	b = rotr64(b ^ c, 24); \
+	a = fBlaMka(a, b) ; \
+	d = rotr64(d ^ a, 16); \
+	c = fBlaMka(c, d); \
+	b = rotr64(b ^ c, 63); 
+
+#define BLAKE2_ROUND_NOMSG(v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15)  \
+	G(v0, v4, v8, v12); \
+	G(v1, v5, v9, v13); \
+	G(v2, v6, v10, v14); \
+	G(v3, v7, v11, v15); \
+	G(v0, v5, v10, v15); \
+	G(v1, v6, v11, v12); \
+	G(v2, v7, v8, v13); \
+	G(v3, v4, v9, v14); 
+
+
+/*
+ * BlaMka primitive (designed by the Lyra PHC team).
+ * Returns x + y + 2 * lo32(x) * lo32(y), where lo32() is the low 32 bits of its
+ * argument and all arithmetic wraps modulo 2^64.
+ */
+static inline uint64_t fBlaMka(uint64_t x, uint64_t y)
+{
+	const uint64_t low_mask = UINT64_C(0xFFFFFFFF);
+	/* 32 x 32 -> 64-bit product of the low halves; cannot overflow */
+	const uint64_t low_product = (x & low_mask) * (y & low_mask);
+
+	return x + y + 2 * low_product;
+}
+
+
+#endif
--- a/src/blake2/blake2-config.h
+++ b/src/blake2/blake2-config.h
@ -0,0 +1,72 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_CONFIG_H__
+#define __BLAKE2_CONFIG_H__
+
+/*
+ * Instruction-set feature switches (HAVE_*).
+ * Step 1: derive each switch from the matching compiler-predefined macro.
+ * These don't work everywhere, so a build may also define HAVE_* itself (e.g. with -D);
+ * every #define below is guarded so that such a definition is never redefined.
+ */
+#if defined(__SSE2__) && !defined(HAVE_SSE2)
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSSE3__) && !defined(HAVE_SSSE3)
+#define HAVE_SSSE3
+#endif
+
+#if defined(__SSE4_1__) && !defined(HAVE_SSE41)
+#define HAVE_SSE41
+#endif
+
+#if defined(__AVX__) && !defined(HAVE_AVX)
+#define HAVE_AVX
+#endif
+
+#if defined(__XOP__) && !defined(HAVE_XOP)
+#define HAVE_XOP
+#endif
+
+
+/*
+ * Step 2: every enabled feature also enables the one it builds upon:
+ * AVX2 -> AVX, XOP -> AVX, AVX -> SSE4.1, SSE4.1 -> SSSE3, SSSE3 -> SSE2.
+ * HAVE_AVX2 is only ever supplied by the build; nothing above derives it.
+ */
+#ifdef HAVE_AVX2
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_XOP
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_AVX
+#ifndef HAVE_SSE41
+#define HAVE_SSE41
+#endif
+#endif
+
+#ifdef HAVE_SSE41
+#ifndef HAVE_SSSE3
+#define HAVE_SSSE3
+#endif
+#endif
+
+#ifdef HAVE_SSSE3
+#ifndef HAVE_SSE2
+#define HAVE_SSE2
+#endif
+#endif
+
+/* Step 3: nothing above enabled SSE2, so refuse to build. */
+#if !defined(HAVE_SSE2)
+#error "This code requires at least SSE2."
+#endif
+
+#endif
+
--- a/src/blake2/blake2-impl.h
+++ b/src/blake2/blake2-impl.h
@ -0,0 +1,145 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_IMPL_H__
+#define __BLAKE2_IMPL_H__
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Argon2 Team - Begin Code */
+#include "brg-endian.h"
+
+#if defined(PLATFORM_BYTE_ORDER) && (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) && !defined(NATIVE_LITTLE_ENDIAN)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+/* Argon2 Team - End Code */
+
+
+/* Reads the 32-bit unsigned value stored least-significant byte first at src. */
+static inline uint32_t load32( const void *src )
+{
+#ifdef NATIVE_LITTLE_ENDIAN
+  /* host byte order already matches: copy the four bytes as they are */
+  uint32_t value;
+  memcpy( &value, src, sizeof value );
+  return value;
+#else
+  /* portable path: assemble the value byte by byte, lowest byte first */
+  const uint8_t *bytes = ( const uint8_t * )src;
+  uint32_t value = 0;
+  unsigned i;
+  for( i = 0; i < 4; ++i )
+    value |= ( uint32_t )bytes[i] << ( 8 * i );
+  return value;
+#endif
+}
+
+/* Reads the 64-bit unsigned value stored least-significant byte first at src. */
+static inline uint64_t load64( const void *src )
+{
+#ifdef NATIVE_LITTLE_ENDIAN
+  /* host byte order already matches: copy the eight bytes as they are */
+  uint64_t value;
+  memcpy( &value, src, sizeof value );
+  return value;
+#else
+  /* portable path: assemble the value byte by byte, lowest byte first */
+  const uint8_t *bytes = ( const uint8_t * )src;
+  uint64_t value = 0;
+  unsigned i;
+  for( i = 0; i < 8; ++i )
+    value |= ( uint64_t )bytes[i] << ( 8 * i );
+  return value;
+#endif
+}
+
+/* Writes the 32-bit value w to dst as four bytes, least-significant byte first. */
+static inline void store32( void *dst, uint32_t w )
+{
+#ifdef NATIVE_LITTLE_ENDIAN
+  /* host byte order already matches: copy the four bytes as they are */
+  memcpy( dst, &w, sizeof w );
+#else
+  /* portable path: emit one byte per iteration, lowest byte first */
+  uint8_t *bytes = ( uint8_t * )dst;
+  unsigned i;
+  for( i = 0; i < 4; ++i )
+    bytes[i] = ( uint8_t )( w >> ( 8 * i ) );
+#endif
+}
+
+/* Writes the 64-bit value w to dst as eight bytes, least-significant byte first. */
+static inline void store64( void *dst, uint64_t w )
+{
+#ifdef NATIVE_LITTLE_ENDIAN
+  /* host byte order already matches: copy the eight bytes as they are */
+  memcpy( dst, &w, sizeof w );
+#else
+  /* portable path: emit one byte per iteration, lowest byte first */
+  uint8_t *bytes = ( uint8_t * )dst;
+  unsigned i;
+  for( i = 0; i < 8; ++i )
+    bytes[i] = ( uint8_t )( w >> ( 8 * i ) );
+#endif
+}
+
+/*
+ * Reads a 48-bit unsigned value stored least-significant byte first in the six
+ * bytes at src; the upper 16 bits of the result are zero.
+ */
+static inline uint64_t load48( const void *src )
+{
+  const uint8_t *bytes = ( const uint8_t * )src;
+  uint64_t value = 0;
+  unsigned i;
+  for( i = 0; i < 6; ++i )
+    value |= ( uint64_t )bytes[i] << ( 8 * i );
+  return value;
+}
+
+/*
+ * Writes the low 48 bits of w to dst as six bytes, least-significant byte first.
+ * The upper 16 bits of w are ignored and nothing beyond dst[5] is written.
+ */
+static inline void store48( void *dst, uint64_t w )
+{
+  uint8_t *bytes = ( uint8_t * )dst;
+  unsigned i;
+  for( i = 0; i < 6; ++i )
+    bytes[i] = ( uint8_t )( w >> ( 8 * i ) );
+}
+
+/*
+ * Rotates the 32-bit word w left by c bit positions.
+ * Both shift counts are reduced modulo 32, so c == 0 (or any multiple of 32) returns w
+ * unchanged instead of shifting by the full word width, which C leaves undefined.
+ * Results for c in 1..31 are unchanged.
+ */
+static inline uint32_t rotl32( const uint32_t w, const unsigned c )
+{
+  return ( w << ( c & 31 ) ) | ( w >> ( ( 32 - c ) & 31 ) );
+}
+
+/*
+ * Rotates the 64-bit word w left by c bit positions.
+ * Both shift counts are reduced modulo 64, so c == 0 (or any multiple of 64) returns w
+ * unchanged instead of shifting by the full word width, which C leaves undefined.
+ * Results for c in 1..63 are unchanged.
+ */
+static inline uint64_t rotl64( const uint64_t w, const unsigned c )
+{
+  return ( w << ( c & 63 ) ) | ( w >> ( ( 64 - c ) & 63 ) );
+}
+
+/*
+ * Rotates the 32-bit word w right by c bit positions.
+ * Both shift counts are reduced modulo 32, so c == 0 (or any multiple of 32) returns w
+ * unchanged instead of shifting by the full word width, which C leaves undefined.
+ * Results for c in 1..31 are unchanged.
+ */
+static inline uint32_t rotr32( const uint32_t w, const unsigned c )
+{
+  return ( w >> ( c & 31 ) ) | ( w << ( ( 32 - c ) & 31 ) );
+}
+
+/*
+ * Rotates the 64-bit word w right by c bit positions.
+ * Both shift counts are reduced modulo 64, so c == 0 (or any multiple of 64) returns w
+ * unchanged instead of shifting by the full word width, which C leaves undefined.
+ * Results for c in 1..63 are unchanged.
+ */
+static inline uint64_t rotr64( const uint64_t w, const unsigned c )
+{
+  return ( w >> ( c & 63 ) ) | ( w << ( ( 64 - c ) & 63 ) );
+}
+
+/*
+ * Overwrites the n bytes starting at v with zeros.
+ * Every store goes through a volatile-qualified pointer; this is meant to keep the
+ * compiler from optimizing the clearing away, as it may do with a plain memset().
+ */
+static inline void secure_zero_memory( void *v, size_t n )
+{
+  volatile uint8_t *bytes = ( volatile uint8_t * )v;
+  size_t i;
+  for( i = 0; i < n; ++i )
+    bytes[i] = 0;
+}
+
+#endif
+
--- a/src/blake2/blake2-round-mka.h
+++ b/src/blake2/blake2-round-mka.h
@ -0,0 +1,97 @@
+#define _mm_roti_epi64(x, c) /* rotate both 64-bit lanes of x right by -(c) bits; callers pass c negative. r24 and r16 must be in scope at the expansion site. */ \
+	(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  /* 32: swap the 32-bit halves of each lane */ \
+	: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) /* 24: byte shuffle with r24 (presumably a rotate-by-24 mask -- confirm at the caller) */ \
+	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) /* 16: byte shuffle with r16 (presumably a rotate-by-16 mask -- confirm at the caller) */ \
+	: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  /* 63: (x >> 63) ^ (x + x), x + x standing in for x << 1 */ \
+	: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) /* any other count n: (x >> n) ^ (x << (64 - n)) */
+
+/*
+ * Two-lane (SSE2) form of fBlaMka: for each 64-bit lane returns
+ * x + y + 2 * lo32(x) * lo32(y), with wrap-around addition.
+ */
+static inline __m128i fBlaMka(__m128i x, __m128i y){
+    /* _mm_mul_epu32 multiplies the low 32 bits of each lane into a 64-bit product; shifting left by one doubles it */
+    const __m128i doubled_product = _mm_slli_epi64(_mm_mul_epu32(x, y), 1);
+
+    return _mm_add_epi64(_mm_add_epi64(doubled_product, x), y);
+}
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* first half of the G step, applied to the low (l) and high (h) register of each row; starts with row1 = fBlaMka(row1, row2) */ \
+	row1l = fBlaMka(row1l, row2l); \
+	row1h = fBlaMka(row1h, row2h); \
+	/* row4 ^= row1 */ \
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	/* rotate row4 right by 32 bits */ \
+	row4l = _mm_roti_epi64(row4l, -32); \
+	row4h = _mm_roti_epi64(row4h, -32); \
+	/* row3 = fBlaMka(row3, row4) */ \
+	row3l = fBlaMka(row3l, row4l); \
+	row3h = fBlaMka(row3h, row4h); \
+	/* row2 ^= row3 */ \
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	/* rotate row2 right by 24 bits */ \
+	row2l = _mm_roti_epi64(row2l, -24); \
+	row2h = _mm_roti_epi64(row2h, -24); \
+ /* the backslash above splices this line into G1: keep it free of code */
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* second half of the G step: same shape as G1 with rotations by 16 and 63 bits */ \
+	row1l = fBlaMka(row1l, row2l); \
+	row1h = fBlaMka(row1h, row2h); \
+	/* row4 ^= row1 */ \
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	/* rotate row4 right by 16 bits */ \
+	row4l = _mm_roti_epi64(row4l, -16); \
+	row4h = _mm_roti_epi64(row4h, -16); \
+	/* row3 = fBlaMka(row3, row4) */ \
+	row3l = fBlaMka(row3l, row4l); \
+	row3h = fBlaMka(row3h, row4h); \
+	/* row2 ^= row3 */ \
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	/* rotate row2 right by 63 bits */ \
+	row2l = _mm_roti_epi64(row2l, -63); \
+	row2h = _mm_roti_epi64(row2h, -63); \
+/* the backslash above splices this line into G2: keep it free of code */
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* rotates the four 64-bit words of rows 2, 3, 4 by one, two, three positions so the next G1/G2 pair works on diagonals; row1 is untouched; needs __m128i temporaries t0, t1 in scope; row2 is rotated first, by one word */ \
+	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: rotate by two words, i.e. swap the two registers */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0;    \
+	/* row4: rotate by three words (same alignr pair as row2, stored crosswise) */ \
+	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* inverse of DIAGONALIZE: the alignr operands are swapped, which rotates rows 2 and 4 back; needs __m128i temporaries t0, t1 in scope; row2 is restored first */ \
+	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: swapping the registers again undoes the two-word rotation */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0; \
+	/* row4: rotate back */ \
+	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) /* one full round; arguments are l/h-interleaved per row, unlike the helper macros, which take all l registers first; begins with the column step */ \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* rotate the rows so the same G1/G2 code mixes the diagonals */ \
+	DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* diagonal step */ \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* restore the original word order */ \
+	UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
--- a/src/blake2/blake2-round.h
+++ b/src/blake2/blake2-round.h
@ -0,0 +1,85 @@
+#define _mm_roti_epi64(x, c) /* rotate both 64-bit lanes of x right by -(c) bits; callers pass c negative. r24 and r16 must be in scope at the expansion site. */ \
+	(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  /* 32: swap the 32-bit halves of each lane */ \
+	: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) /* 24: byte shuffle with r24 (presumably a rotate-by-24 mask -- confirm at the caller) */ \
+	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) /* 16: byte shuffle with r16 (presumably a rotate-by-16 mask -- confirm at the caller) */ \
+	: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  /* 63: (x >> 63) ^ (x + x), x + x standing in for x << 1 */ \
+	: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) /* any other count n: (x >> n) ^ (x << (64 - n)) */
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* first half of the G step with plain 64-bit additions, applied to the low (l) and high (h) register of each row; starts with row1 += row2 */ \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	/* row4 ^= row1 */ \
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	/* rotate row4 right by 32 bits */ \
+	row4l = _mm_roti_epi64(row4l, -32); \
+	row4h = _mm_roti_epi64(row4h, -32); \
+	/* row3 += row4 */ \
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	/* row2 ^= row3 */ \
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	/* rotate row2 right by 24 bits */ \
+	row2l = _mm_roti_epi64(row2l, -24); \
+	row2h = _mm_roti_epi64(row2h, -24); \
+ /* the backslash above splices this line into G1: keep it free of code */
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* second half of the G step: same shape as G1 with rotations by 16 and 63 bits */ \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	/* row4 ^= row1 */ \
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	/* rotate row4 right by 16 bits */ \
+	row4l = _mm_roti_epi64(row4l, -16); \
+	row4h = _mm_roti_epi64(row4h, -16); \
+	/* row3 += row4 */ \
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	/* row2 ^= row3 */ \
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	/* rotate row2 right by 63 bits */ \
+	row2l = _mm_roti_epi64(row2l, -63); \
+	row2h = _mm_roti_epi64(row2h, -63); \
+/* the backslash above splices this line into G2: keep it free of code */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* rotates the four 64-bit words of rows 2, 3, 4 by one, two, three positions so the next G1/G2 pair works on diagonals; row1 is untouched; needs __m128i temporaries t0, t1 in scope; row2 is rotated first, by one word */ \
+	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: rotate by two words, i.e. swap the two registers */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0;    \
+	/* row4: rotate by three words (same alignr pair as row2, stored crosswise) */ \
+	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* inverse of DIAGONALIZE: the alignr operands are swapped, which rotates rows 2 and 4 back; needs __m128i temporaries t0, t1 in scope; row2 is restored first */ \
+	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: swapping the registers again undoes the two-word rotation */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0; \
+	/* row4: rotate back */ \
+	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) /* one full round; arguments are l/h-interleaved per row, unlike the helper macros, which take all l registers first; begins with the column step */ \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* rotate the rows so the same G1/G2 code mixes the diagonals */ \
+	DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* diagonal step */ \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	/* restore the original word order */ \
+	UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
--- a/src/blake2/blake2.h
+++ b/src/blake2/blake2.h
@ -0,0 +1,161 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_H__
+#define __BLAKE2_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Argon2 Team - Begin Code
+ * ALIGN(x) requests x-byte alignment for the declaration it prefixes:
+ * __declspec(align(x)) when _MSC_VER is defined, __attribute__((__aligned__(x))) otherwise. */
+#if defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__ ((__aligned__(x)))
+#endif
+/* Argon2 Team - End Code */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+  /* Size constants for BLAKE2s, in bytes (used below to size the blake2s_param / blake2s_state arrays). */
+  enum blake2s_constant
+  {
+    BLAKE2S_BLOCKBYTES = 64,
+    BLAKE2S_OUTBYTES   = 32,
+    BLAKE2S_KEYBYTES   = 32,
+    BLAKE2S_SALTBYTES  = 8,
+    BLAKE2S_PERSONALBYTES = 8
+  };
+  /* Size constants for BLAKE2b, in bytes (used below to size the blake2b_param / blake2b_state arrays). */
+  enum blake2b_constant
+  {
+    BLAKE2B_BLOCKBYTES = 128,
+    BLAKE2B_OUTBYTES   = 64,
+    BLAKE2B_KEYBYTES   = 64,
+    BLAKE2B_SALTBYTES  = 16,
+    BLAKE2B_PERSONALBYTES = 16
+  };
+
+#pragma pack(push, 1)
+  typedef struct __blake2s_param // BLAKE2s parameter block, taken by blake2s_init_param(); byte-packed by the enclosing #pragma pack(push, 1)
+  {
+    uint8_t  digest_length; // 1   (the trailing numbers are the cumulative size in bytes)
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint8_t  node_offset[6];// 14
+    uint8_t  node_depth;    // 15
+    uint8_t  inner_length;  // 16
+    // uint8_t  reserved[0];
+    uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+  } blake2s_param;
+  /* State passed to the blake2s_init / _update / _final functions; 64-byte alignment is requested through ALIGN. */
+  ALIGN( 64 ) typedef struct __blake2s_state
+  {
+    uint32_t h[8];
+    uint32_t t[2];
+    uint32_t f[2];
+    uint8_t  buf[2 * BLAKE2S_BLOCKBYTES]; // room for two input blocks
+    size_t   buflen;
+    uint8_t  last_node;
+  } blake2s_state;
+  /* BLAKE2b parameter block, taken by blake2b_init_param(); byte-packed by the enclosing #pragma pack(push, 1). */
+  typedef struct __blake2b_param
+  {
+    uint8_t  digest_length; // 1   (the trailing numbers are the cumulative size in bytes)
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint64_t node_offset;   // 16
+    uint8_t  node_depth;    // 17
+    uint8_t  inner_length;  // 18
+    uint8_t  reserved[14];  // 32
+    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
+  } blake2b_param;
+  /* State passed to the blake2b_init / _update / _final functions; 64-byte alignment is requested through ALIGN. */
+  ALIGN( 64 ) typedef struct __blake2b_state
+  {
+    uint64_t h[8];
+    uint64_t t[2];
+    uint64_t f[2]; // f[1] is set to all ones by blake2b_set_lastnode() in blake2b-ref.c
+    uint8_t  buf[2 * BLAKE2B_BLOCKBYTES]; // room for two input blocks
+    size_t   buflen;
+    uint8_t  last_node;
+  } blake2b_state;
+  /* State passed to the blake2sp_* functions: eight blake2s_state entries in S plus one in R (presumably leaves and root -- TODO confirm). */
+  ALIGN( 64 ) typedef struct __blake2sp_state
+  {
+    blake2s_state S[8][1];
+    blake2s_state R[1];
+    uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; // one input block per entry of S
+    size_t  buflen;
+  } blake2sp_state;
+  /* State passed to the blake2bp_* functions: four blake2b_state entries in S plus one in R (presumably leaves and root -- TODO confirm). */
+  ALIGN( 64 ) typedef struct __blake2bp_state
+  {
+    blake2b_state S[4][1];
+    blake2b_state R[1];
+    uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; // one input block per entry of S
+    size_t  buflen;
+  } blake2bp_state;
+#pragma pack(pop)
+
+  /* Streaming API: for each variant (blake2s, blake2b, blake2sp, blake2bp) an init and an
+     init_key function (plus init_param for blake2s and blake2b), an update function and a
+     final function, all working on a caller-provided state S.
+     Output and key lengths are uint8_t, so they cannot exceed 255.
+     NOTE(review): the meaning of the int results is not visible in this header
+     (presumably 0 on success) -- confirm against the implementation. */
+  int blake2s_init( blake2s_state *S, const uint8_t outlen );
+  int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+  int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2b_init( blake2b_state *S, const uint8_t outlen );
+  int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
+  int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
+  int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
+  int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
+
+  /* Simple API: one call per hash. Note the argument order: out, in, key, then the three
+     lengths outlen, inlen, keylen. outlen and keylen are uint8_t (at most 255). */
+  int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  /* Argon2 Team - Begin Code
+     blake2b_long takes no key and a 32-bit outlen, unlike the uint8_t outlen of blake2b
+     (presumably to produce outputs longer than one BLAKE2b digest -- TODO confirm). */
+  int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
+  /* Argon2 Team - End Code */
+
+  int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  /* Generic entry point: forwards all arguments unchanged to blake2b() and returns its result. */
+  static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+  {
+    return blake2b( out, in, key, outlen, inlen, keylen );
+  }
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
--- a/src/blake2/blake2b-load-sse2.h
+++ b/src/blake2/blake2b-load-sse2.h
@ -0,0 +1,68 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE2_H__
+#define __BLAKE2B_LOAD_SSE2_H__
+/*
+ * LOAD_MSG_<r>_<i>(b0, b1): message-word selection for round r (0..11), step i (1..4).
+ * Each macro builds two vectors from four of the 64-bit message words m0..m15 that must be
+ * in scope at the expansion site; _mm_set_epi64x(hi, lo) takes the upper lane first, so
+ * b0 = {lo: second argument, hi: first argument}.
+ * Rounds 10 and 11 repeat the selections of rounds 0 and 1.
+ * The expansion is two statements without a trailing semicolon: do not use it as the
+ * body of an unbraced if or loop.
+ */
+#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
+#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
+#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
+#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
+#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
+#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
+#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
+#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
+#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
+#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
+#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
+#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
+#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
+#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
+#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
+#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
+#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
+#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
+#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
+#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
+#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
+#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
+#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
+#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
+#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
+#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
+#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
+#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+
+
+#endif
+
--- a/src/blake2/blake2b-load-sse41.h
+++ b/src/blake2/blake2b-load-sse41.h
@ -0,0 +1,402 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE41_H__
+#define __BLAKE2B_LOAD_SSE41_H__
+/*
+ * LOAD_MSG_<r>_<i>(b0, b1): message-word selection for round r (0..11), step i (1..4),
+ * written with SSSE3/SSE4.1 shuffles. The macros read the vectors m0..m7, which must be in
+ * scope at the expansion site (presumably each holds two consecutive 64-bit message
+ * words -- TODO confirm at the caller). Every macro is wrapped in do { } while(0), so an
+ * invocation followed by ';' is a single statement.
+ * LOAD_MSG_0_1 pairs the low lanes of m0/m1 into b0 and of m2/m3 into b1.
+ */
+#define LOAD_MSG_0_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_0_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_1_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_1_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_1_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_1_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#define LOAD_MSG_2_1(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m5, 8); \
+b1 = _mm_unpackhi_epi64(m2, m7); \
+} while(0)
+
+
+#define LOAD_MSG_2_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m0); \
+b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_2_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+b1 = _mm_unpackhi_epi64(m3, m4); \
+} while(0)
+
+
+#define LOAD_MSG_2_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m3); \
+b1 = _mm_alignr_epi8(m2, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_3_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_unpackhi_epi64(m6, m5); \
+} while(0)
+
+
+#define LOAD_MSG_3_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m0); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_3_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_3_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m5); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_4_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m2); \
+b1 = _mm_unpacklo_epi64(m1, m5); \
+} while(0)
+
+
+#define LOAD_MSG_4_2(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m0, 8); \
+b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_5_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m3); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_5_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m5); \
+b1 = _mm_unpackhi_epi64(m5, m1); \
+} while(0)
+
+
+#define LOAD_MSG_5_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+b1 = _mm_unpackhi_epi64(m7, m0); \
+} while(0)
+
+
+#define LOAD_MSG_5_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m2); \
+b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_6_1(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+b1 = _mm_unpacklo_epi64(m7, m2); \
+} while(0)
+
+
+#define LOAD_MSG_6_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_alignr_epi8(m5, m6, 8); \
+} while(0)
+
+
+#define LOAD_MSG_6_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m3); \
+b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
+} while(0)
+
+
+#define LOAD_MSG_6_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m3); \
+b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_2(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpackhi_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_7_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_unpacklo_epi64(m4, m1); \
+} while(0)
+
+
+#define LOAD_MSG_7_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m2); \
+b1 = _mm_unpacklo_epi64(m3, m5); \
+} while(0)
+
+
+#define LOAD_MSG_8_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m7); \
+b1 = _mm_alignr_epi8(m0, m5, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_alignr_epi8(m4, m1, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_3(b0, b1) \
+do \
+{ \
+b0 = m6; \
+b1 = _mm_alignr_epi8(m5, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_4(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+b1 = m2; \
+} while(0)
+
+
+#define LOAD_MSG_9_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_unpackhi_epi64(m3, m0); \
+} while(0)
+
+
+#define LOAD_MSG_9_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m2); \
+b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_9_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_unpackhi_epi64(m1, m6); \
+} while(0)
+
+
+#define LOAD_MSG_9_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpacklo_epi64(m6, m0); \
+} while(0)
+
+
+#define LOAD_MSG_10_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_10_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_11_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_11_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_11_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_11_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#endif
+
--- a/src/blake2/blake2b-ref.c
+++ b/src/blake2/blake2b-ref.c
@ -0,0 +1,433 @@
+/*
+   BLAKE2 reference source code package - reference C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+/* BLAKE2b initialization vector (eight 64-bit words). In this file it seeds
+ * S->h in blake2b_init0() and fills v[8..15] in blake2b_compress(). */
+static const uint64_t blake2b_IV[8] =
+{
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+/* Message schedule: blake2b_sigma[r][k] is the index of the message word
+ * consumed at position k of round r (see the G macro in blake2b_compress()).
+ * Rows 10 and 11 repeat rows 0 and 1. */
+static const uint8_t blake2b_sigma[12][16] =
+{
+  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+};
+
+
+/* Set the second finalization flag word, f[1], to all ones. Always returns 0. */
+static inline int blake2b_set_lastnode( blake2b_state *S )
+{
+  S->f[1] = UINT64_MAX;
+  return 0;
+}
+
+/* Reset the second finalization flag word, f[1], to zero. Always returns 0. */
+static inline int blake2b_clear_lastnode( blake2b_state *S )
+{
+  S->f[1] = 0;
+  return 0;
+}
+
+/* Flag the next compression as the final one: f[0] becomes all ones, and
+ * f[1] as well when S->last_node is set. Always returns 0. */
+static inline int blake2b_set_lastblock( blake2b_state *S )
+{
+  if( S->last_node )
+  {
+    blake2b_set_lastnode( S );
+  }
+
+  S->f[0] = UINT64_MAX;
+  return 0;
+}
+
+/* Undo blake2b_set_lastblock(): f[0] is zeroed, and f[1] as well when
+ * S->last_node is set. Always returns 0. */
+static inline int blake2b_clear_lastblock( blake2b_state *S )
+{
+  if( S->last_node )
+  {
+    blake2b_clear_lastnode( S );
+  }
+
+  S->f[0] = 0;
+  return 0;
+}
+
+/* Add inc to the two-word byte counter t; a wrap of the low word t[0]
+ * carries one into the high word t[1]. Always returns 0. */
+static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+{
+  S->t[0] += inc;
+
+  if( S->t[0] < inc ) /* low word wrapped around */
+    ++S->t[1];
+
+  return 0;
+}
+
+
+
+/* Parameter-related functions: each stores one field of the parameter block P and returns 0. */
+static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
+{
+  P->digest_length = digest_length;
+  return 0;
+}
+/* Store the fanout field. */
+static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
+{
+  P->fanout = fanout;
+  return 0;
+}
+/* Store the depth field. */
+static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
+{
+  P->depth = depth;
+  return 0;
+}
+/* Store the leaf length; written through store32() rather than a plain assignment. */
+static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
+{
+  store32( &P->leaf_length, leaf_length );
+  return 0;
+}
+/* Store the node offset; written through store64() rather than a plain assignment. */
+static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
+{
+  store64( &P->node_offset, node_offset );
+  return 0;
+}
+/* Store the node depth field. */
+static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
+{
+  P->node_depth = node_depth;
+  return 0;
+}
+/* Store the inner length field. */
+static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
+{
+  P->inner_length = inner_length;
+  return 0;
+}
+/* Copy exactly BLAKE2B_SALTBYTES bytes of salt into P. */
+static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
+{
+  memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
+  return 0;
+}
+/* Copy exactly BLAKE2B_PERSONALBYTES bytes of personalization into P. */
+static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
+{
+  memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
+  return 0;
+}
+
+/* Reset the state: zero every field, then load the IV into the chaining
+ * value h. Always returns 0. */
+static inline int blake2b_init0( blake2b_state *S )
+{
+  size_t word;
+
+  memset( S, 0, sizeof( *S ) );
+
+  for( word = 0; word < sizeof( blake2b_IV ) / sizeof( blake2b_IV[0] ); ++word )
+  {
+    S->h[word] = blake2b_IV[word];
+  }
+
+  return 0;
+}
+
+/* Initialise S from parameter block P: the state is reset to the IV and each
+ * chaining word h[i] is then XORed with the i-th 8-byte group of P, decoded
+ * with load64(). Always returns 0. */
+int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
+{
+  const uint8_t *param_bytes = ( const uint8_t * )P;
+  size_t word;
+
+  blake2b_init0( S );
+
+  /* IV XOR ParamBlock */
+  for( word = 0; word < 8; ++word )
+  {
+    S->h[word] ^= load64( param_bytes );
+    param_bytes += sizeof( S->h[word] );
+  }
+
+  return 0;
+}
+
+
+
+/* Initialise S for an unkeyed hash with an outlen-byte digest.
+ * Builds a parameter block with fanout = depth = 1 and every other field
+ * zero, then hands it to blake2b_init_param().
+ * Returns -1 when outlen is 0 or exceeds BLAKE2B_OUTBYTES. */
+int blake2b_init( blake2b_state *S, const uint8_t outlen )
+{
+  blake2b_param params;
+
+  if( 0 == outlen || outlen > BLAKE2B_OUTBYTES )
+  {
+    return -1;
+  }
+
+  params.digest_length = outlen;
+  params.key_length    = 0;
+  params.fanout        = 1;
+  params.depth         = 1;
+  params.node_depth    = 0;
+  params.inner_length  = 0;
+  store32( &params.leaf_length, 0 );
+  store64( &params.node_offset, 0 );
+  memset( params.reserved, 0, sizeof( params.reserved ) );
+  memset( params.salt,     0, sizeof( params.salt ) );
+  memset( params.personal, 0, sizeof( params.personal ) );
+
+  return blake2b_init_param( S, &params );
+}
+
+
+/* Initialise S for a keyed hash with an outlen-byte digest.
+ * The key is zero-padded to one full block and absorbed as the first block
+ * of input; the padded copy is wiped afterwards.
+ * Returns -1 when outlen is 0 or too large, when key is NULL, when keylen is
+ * 0 or exceeds BLAKE2B_KEYBYTES, or when blake2b_init_param() fails. */
+int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+{
+  blake2b_param params;
+  uint8_t padded_key[BLAKE2B_BLOCKBYTES];
+
+  if( 0 == outlen || outlen > BLAKE2B_OUTBYTES )
+    return -1;
+
+  if( NULL == key || 0 == keylen || keylen > BLAKE2B_KEYBYTES )
+    return -1;
+
+  params.digest_length = outlen;
+  params.key_length    = keylen;
+  params.fanout        = 1;
+  params.depth         = 1;
+  params.node_depth    = 0;
+  params.inner_length  = 0;
+  store32( &params.leaf_length, 0 );
+  store64( &params.node_offset, 0 );
+  memset( params.reserved, 0, sizeof( params.reserved ) );
+  memset( params.salt,     0, sizeof( params.salt ) );
+  memset( params.personal, 0, sizeof( params.personal ) );
+
+  if( blake2b_init_param( S, &params ) < 0 )
+    return -1;
+
+  memset( padded_key, 0, BLAKE2B_BLOCKBYTES );
+  memcpy( padded_key, key, keylen );
+  blake2b_update( S, padded_key, BLAKE2B_BLOCKBYTES );
+  secure_zero_memory( padded_key, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+  return 0;
+}
+
+/* Compression function: mixes one BLAKE2B_BLOCKBYTES-byte block into the
+ * chaining value S->h using the current counter S->t and finalization flags
+ * S->f. Runs twelve rounds driven by blake2b_sigma. Always returns 0. */
+static int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+  uint64_t m[16];
+  uint64_t v[16];
+  int word;
+  int rnd;
+
+  /* Decode the block into sixteen 64-bit message words */
+  for( word = 0; word < 16; ++word )
+    m[word] = load64( block + word * sizeof( m[word] ) );
+
+  /* Working vector: chaining value in v[0..7], IV in v[8..15] */
+  for( word = 0; word < 8; ++word )
+  {
+    v[word]     = S->h[word];
+    v[word + 8] = blake2b_IV[word];
+  }
+
+  /* Fold the counter and the finalization flags into the last four words */
+  v[12] ^= S->t[0];
+  v[13] ^= S->t[1];
+  v[14] ^= S->f[0];
+  v[15] ^= S->f[1];
+
+/* G mixes four working words with two message words selected by sigma */
+#define G(r,i,a,b,c,d) \
+  do { \
+    a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+    d = rotr64(d ^ a, 32); \
+    c = c + d; \
+    b = rotr64(b ^ c, 24); \
+    a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+    d = rotr64(d ^ a, 16); \
+    c = c + d; \
+    b = rotr64(b ^ c, 63); \
+  } while(0)
+/* One round: four column mixes followed by four diagonal mixes */
+#define ROUND(r)  \
+  do { \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+  } while(0)
+
+  for( rnd = 0; rnd < 12; ++rnd )
+    ROUND( rnd );
+
+  /* Feed-forward: fold both halves of the working vector into h */
+  for( word = 0; word < 8; ++word )
+    S->h[word] ^= v[word] ^ v[word + 8];
+
+#undef G
+#undef ROUND
+  return 0;
+}
+
+/* Absorb inlen bytes from in.
+ * The state buffers up to two blocks. A block is compressed only once more
+ * input than the buffer can hold has arrived, so the last block always stays
+ * buffered for blake2b_final(). Always returns 0. */
+int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
+{
+  while( inlen > 0 )
+  {
+    const size_t used = S->buflen;
+    const size_t room = 2 * BLAKE2B_BLOCKBYTES - used;
+
+    if( inlen <= room )
+    {
+      /* Everything fits: be lazy, do not compress */
+      memcpy( S->buf + used, in, inlen );
+      S->buflen += inlen;
+      break;
+    }
+
+    /* Top up the buffer, compress its first block, shift the second down */
+    memcpy( S->buf + used, in, room );
+    S->buflen += room;
+    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+    blake2b_compress( S, S->buf );
+    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
+    S->buflen -= BLAKE2B_BLOCKBYTES;
+    in += room;
+    inlen -= room;
+  }
+
+  return 0;
+}
+
+/* Finish the hash and write the first outlen bytes of the digest to out.
+ *
+ * If more than one block is still buffered, the first is compressed as an
+ * ordinary block. The remainder is zero-padded, flagged as the last block
+ * and compressed. The chaining value is serialised with store64() into a
+ * temporary buffer, from which outlen bytes are copied out.
+ *
+ * Returns 0 on success, -1 when S or out is NULL or outlen exceeds
+ * BLAKE2B_OUTBYTES. */
+int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
+{
+  uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+
+  if( NULL == S || NULL == out )
+    return -1;
+
+  if( outlen > BLAKE2B_OUTBYTES )
+    return -1;
+
+  if( S->buflen > BLAKE2B_BLOCKBYTES )
+  {
+    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+    blake2b_compress( S, S->buf );
+    S->buflen -= BLAKE2B_BLOCKBYTES;
+    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
+  }
+
+  blake2b_increment_counter( S, S->buflen );
+  blake2b_set_lastblock( S );
+  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
+  blake2b_compress( S, S->buf );
+
+  for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+    store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+  memcpy( out, buffer, outlen );
+  /* The digest may be key material (keyed hashing, KDF use): do not leave
+   * the untruncated copy on the stack. */
+  secure_zero_memory( buffer, sizeof( buffer ) );
+  return 0;
+}
+
+/* One-shot BLAKE2b: hash inlen bytes from in, optionally keyed, and write an
+ * outlen-byte digest to out.
+ *
+ * Parameter checks:
+ *   - in may be NULL only when inlen is 0;
+ *   - out must not be NULL;
+ *   - key may be NULL only when keylen is 0. A NULL key with a non-zero
+ *     keylen is rejected instead of silently producing an unkeyed hash.
+ * outlen and keylen ranges are validated by blake2b_init()/blake2b_init_key().
+ *
+ * Returns 0 on success, -1 on any invalid argument or failing step. */
+int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+  blake2b_state S[1];
+
+  /* Verify parameters */
+  if ( NULL == in && inlen > 0 ) return -1;
+
+  if ( NULL == out ) return -1;
+
+  if( NULL == key && keylen > 0 ) return -1;
+
+  if( keylen > 0 )
+  {
+    if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+  }
+  else
+  {
+    if( blake2b_init( S, outlen ) < 0 ) return -1;
+  }
+
+  if( blake2b_update( S, ( const uint8_t * )in, inlen ) < 0 ) return -1;
+
+  if( blake2b_final( S, out, outlen ) < 0 ) return -1;
+
+  return 0;
+}
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+/* Self-test: hashes every prefix length 0..KAT_LENGTH-1 of the byte sequence
+ * 0,1,2,... under the key 0,1,2,... and compares each digest with the
+ * matching entry of blake2b_keyed_kat. Prints "ok" and returns 0 when all
+ * match; prints "error" and returns -1 at the first mismatch. */
+int main( int argc, char **argv )
+{
+  uint8_t key[BLAKE2B_KEYBYTES];
+  uint8_t buf[KAT_LENGTH];
+  size_t pos;
+  size_t len;
+
+  for( pos = 0; pos < BLAKE2B_KEYBYTES; ++pos )
+    key[pos] = ( uint8_t )pos;
+
+  for( pos = 0; pos < KAT_LENGTH; ++pos )
+    buf[pos] = ( uint8_t )pos;
+
+  for( len = 0; len < KAT_LENGTH; ++len )
+  {
+    uint8_t hash[BLAKE2B_OUTBYTES];
+    blake2b( hash, buf, key, BLAKE2B_OUTBYTES, len, BLAKE2B_KEYBYTES );
+
+    if( memcmp( hash, blake2b_keyed_kat[len], BLAKE2B_OUTBYTES ) != 0 )
+    {
+      puts( "error" );
+      return -1;
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+}
+#endif
+
+
+/* Argon2 Team - Begin Code */
+/*
+ * Variable-length hash built on BLAKE2b.
+ *
+ * The 32-bit output length is hashed in front of the input. It is serialised
+ * with store32() (the helper this file already uses for the parameter block)
+ * instead of hashing the host representation of outlen, so the result does
+ * not depend on host byte order. NOTE(review): store32() is presumed to write
+ * little-endian, which matches the previous output on little-endian hosts;
+ * confirm against blake2-impl.h.
+ *
+ * outlen <= BLAKE2B_OUTBYTES: a single BLAKE2b call with digest length outlen.
+ * Otherwise: a chain of full-size digests is computed, each hashing the
+ * previous one; the first half of every digest but the last is emitted, and
+ * the last digest, of the remaining length, is emitted whole.
+ *
+ * Returns 0 on success; -1 when out is NULL, outlen is 0, in is NULL with a
+ * non-zero inlen, or any BLAKE2b step fails. Intermediate digests and the
+ * hash state are wiped before returning.
+ */
+int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
+{
+	blake2b_state blake_state;
+	uint8_t outlen_bytes[sizeof(uint32_t)];
+	int ret;
+
+	if (NULL == out || 0 == outlen || (NULL == in && inlen > 0))
+		return -1;
+
+	store32(outlen_bytes, outlen);
+
+	if (outlen <= BLAKE2B_OUTBYTES)
+	{
+		ret = blake2b_init(&blake_state, (uint8_t)outlen);
+		if (0 == ret) ret = blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
+		if (0 == ret) ret = blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		if (0 == ret) ret = blake2b_final(&blake_state, out, (uint8_t)outlen);
+	}
+	else
+	{
+		uint8_t out_buffer[BLAKE2B_OUTBYTES];
+		uint8_t in_buffer[BLAKE2B_OUTBYTES];
+		uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+
+		ret = blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
+		if (0 == ret) ret = blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
+		if (0 == ret) ret = blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		if (0 == ret) ret = blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
+
+		if (0 == ret)
+		{
+			memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+			out += BLAKE2B_OUTBYTES / 2;
+		}
+
+		/* Middle links: hash the previous digest, emit its first half */
+		while (0 == ret && toproduce > BLAKE2B_OUTBYTES)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			ret = blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
+			if (0 == ret)
+			{
+				memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+				out += BLAKE2B_OUTBYTES / 2;
+				toproduce -= BLAKE2B_OUTBYTES / 2;
+			}
+		}
+
+		/* Last link: toproduce is now at most BLAKE2B_OUTBYTES */
+		if (0 == ret)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			ret = blake2b(out_buffer, in_buffer, NULL, (uint8_t)toproduce, BLAKE2B_OUTBYTES, 0);
+		}
+		if (0 == ret)
+			memcpy(out, out_buffer, toproduce);
+
+		secure_zero_memory(out_buffer, sizeof(out_buffer));
+		secure_zero_memory(in_buffer, sizeof(in_buffer));
+	}
+
+	secure_zero_memory(&blake_state, sizeof(blake_state));
+	return (0 == ret) ? 0 : -1;
+}
+/* Argon2 Team - End Code */
--- a/src/blake2/blake2b-round.h
+++ b/src/blake2/blake2b-round.h
@ -0,0 +1,172 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_ROUND_H__
+#define __BLAKE2B_ROUND_H__
+
+/* Argon2 Team - Begin Code: aligned 128-bit load/store; p must be 16-byte aligned */
+#define LOAD(p)  _mm_load_si128( (const __m128i *)(p) )
+#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
+/* Argon2 Team - End Code */
+
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) ) /* unaligned 128-bit load */
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) /* unaligned 128-bit store */
+
+#define TOF(reg) _mm_castsi128_ps((reg)) /* reinterpret an integer vector as a float vector */
+#define TOI(reg) _mm_castps_si128((reg)) /* reinterpret a float vector as an integer vector */
+
+#define LIKELY(x) __builtin_expect((x),1) /* branch hint; relies on the GCC/Clang builtin */
+
+
+/* Microarchitecture-specific macros */
+/*
+ * _mm_roti_epi64(x, c): rotate every 64-bit lane of x. Callers pass c as a
+ * negative count (-32, -24, -16, -63), meaning "rotate right by -c".
+ *
+ * Without XOP the rotation is emulated:
+ *   - SSSE3: 32 uses a 32-bit lane swap, 24 and 16 use byte shuffles with the
+ *     masks r24 / r16 (these must be in scope where the macro is expanded),
+ *     63 uses (x >> 63) ^ (x + x), anything else a shift pair;
+ *   - otherwise: always a shift pair.
+ *
+ * Both expansions are fully parenthesized, arguments included, so the macro
+ * behaves as a single expression in any context.
+ */
+#ifndef HAVE_XOP
+#ifdef HAVE_SSSE3
+#define _mm_roti_epi64(x, c) \
+    ( (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) )
+#else
+#define _mm_roti_epi64(r, c) \
+    ( _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) )
+#endif
+#else
+/* ... */
+#endif
+
+
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -32); \
+  row4h = _mm_roti_epi64(row4h, -32); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -24); \
+  row2h = _mm_roti_epi64(row2h, -24); \
+ 
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -16); \
+  row4h = _mm_roti_epi64(row4h, -16); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -63); \
+  row2h = _mm_roti_epi64(row2h, -63); \
+ 
+#if defined(HAVE_SSSE3) /* SSSE3 variant: lane rotation via _mm_alignr_epi8 */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* rotate row2 by one 64-bit lane, row3 by two, row4 by three; row1 untouched. Uses t0/t1 from the enclosing scope. Expansion ends with ';' */ \
+  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0;    \
+  \
+  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+  row4l = t1; \
+  row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* inverse of DIAGONALIZE: rotates rows 2..4 back. Uses t0/t1 from the enclosing scope. Expansion ends with ';' */ \
+  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0; \
+  \
+  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+  row4l = t1; \
+  row4h = t0;
+#else
+/* SSE2 fallback: the same lane rotations built from unpack instructions.
+ * Unlike the SSSE3 variants above, these expansions do NOT end with ';'. */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row4l;\
+  t1 = row2l;\
+  row4l = row3l;\
+  row3l = row3h;\
+  row3h = row4l;\
+  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
+  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
+  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
+  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) /* inverse of the SSE2 DIAGONALIZE */ \
+  t0 = row3l;\
+  row3l = row3h;\
+  row3h = t0;\
+  t0 = row2l;\
+  t1 = row4l;\
+  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
+  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
+  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
+  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
+
+#endif
+
+#if defined(HAVE_SSE41)
+#include "blake2b-load-sse41.h"
+#else
+#include "blake2b-load-sse2.h"
+#endif
+/* ROUND(r): one full BLAKE2b round. Column step (G1, G2), DIAGONALIZE,
+ * diagonal step (G1, G2), UNDIAGONALIZE; each G half gets its message
+ * operands from the matching LOAD_MSG_<r>_<n> macro. r is token-pasted into
+ * the macro name, so it must be a literal round number. The row*, b0/b1 and
+ * t0/t1 variables are taken from the enclosing scope. */
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif /* closes the __BLAKE2B_ROUND_H__ include guard */
+/* BLAKE2_ROUND: message-less round over eight row registers.
+ * NOTE(review): defined after the include guard has been closed.
+ * NOTE(review): G1/G2 are invoked here with 8 arguments, while the G1/G2
+ * defined earlier in this header take 10 (b0, b1), so expanding this macro
+ * against those definitions cannot compile. Confirm which G1/G2 it is meant
+ * to pair with, or whether it is dead code. */
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
--- a/src/blake2/blake2b.c
+++ b/src/blake2/blake2b.c
@ -0,0 +1,475 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2-config.h"
+
+
+#include <emmintrin.h>
+#if defined(HAVE_SSSE3)
+#include <tmmintrin.h>
+#endif
+#if defined(HAVE_SSE41)
+#include <smmintrin.h>
+#endif
+#if defined(HAVE_AVX)
+#include <immintrin.h>
+#endif
+#if defined(HAVE_XOP)
+#include <x86intrin.h>
+#endif
+
+#include "blake2b-round.h"
+/* BLAKE2b initialization vector (eight 64-bit words). In this file it is read
+ * byte-wise by blake2b_init_param() and with 128-bit loads by blake2b_compress(). */
+ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
+{
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+/* BLAKE2b message schedule, one row per round; rows 10 and 11 repeat rows 0 and 1.
+ * NOTE(review): no code visible in this file indexes this table (the SIMD
+ * rounds take their schedule from the LOAD_MSG_* macros); confirm whether it
+ * is still needed before removing it. */
+static const uint8_t blake2b_sigma[12][16] =
+{
+  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+};
+
+
+/* Finalization-flag helpers. */
+/* Set the second finalization flag word, f[1], to all ones. Always returns 0. */
+static inline int blake2b_set_lastnode( blake2b_state *S )
+{
+  S->f[1] = UINT64_MAX;
+  return 0;
+}
+
+/* Reset the second finalization flag word, f[1], to zero. Always returns 0. */
+static inline int blake2b_clear_lastnode( blake2b_state *S )
+{
+  S->f[1] = 0;
+  return 0;
+}
+
+/* Flag the next compression as the final one: f[0] becomes all ones, and
+ * f[1] as well when S->last_node is set. Always returns 0. */
+static inline int blake2b_set_lastblock( blake2b_state *S )
+{
+  if( S->last_node )
+  {
+    blake2b_set_lastnode( S );
+  }
+
+  S->f[0] = UINT64_MAX;
+  return 0;
+}
+
+/* Undo blake2b_set_lastblock(): f[0] is zeroed, and f[1] as well when
+ * S->last_node is set. Always returns 0. */
+static inline int blake2b_clear_lastblock( blake2b_state *S )
+{
+  if( S->last_node )
+  {
+    blake2b_clear_lastnode( S );
+  }
+
+  S->f[0] = 0;
+  return 0;
+}
+
+
+/* Add inc to the 128-bit byte counter held in t[0] (low) and t[1] (high).
+ * Always returns 0. */
+static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+{
+#if __x86_64__
+  /* One 128-bit addition; lets the compiler emit an ADD/ADC chain */
+  const __uint128_t sum = ( ( ( __uint128_t )S->t[1] << 64 ) | S->t[0] ) + inc;
+  S->t[0] = ( uint64_t )sum;
+  S->t[1] = ( uint64_t )( sum >> 64 );
+#else
+  S->t[0] += inc;
+
+  if( S->t[0] < inc ) /* low word wrapped around */
+    ++S->t[1];
+
+#endif
+  return 0;
+}
+
+
+/* Parameter-related functions: each stores one field of the parameter block P and returns 0. */
+static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
+{
+  P->digest_length = digest_length;
+  return 0;
+}
+/* Store the fanout field. */
+static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
+{
+  P->fanout = fanout;
+  return 0;
+}
+/* Store the depth field. */
+static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
+{
+  P->depth = depth;
+  return 0;
+}
+/* Store the leaf length with a plain assignment, i.e. in host byte order. */
+static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
+{
+  P->leaf_length = leaf_length;
+  return 0;
+}
+/* Store the node offset with a plain assignment, i.e. in host byte order. */
+static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
+{
+  P->node_offset = node_offset;
+  return 0;
+}
+/* Store the node depth field. */
+static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
+{
+  P->node_depth = node_depth;
+  return 0;
+}
+/* Store the inner length field. */
+static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
+{
+  P->inner_length = inner_length;
+  return 0;
+}
+/* Copy exactly BLAKE2B_SALTBYTES bytes of salt into P. */
+static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
+{
+  memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
+  return 0;
+}
+/* Copy exactly BLAKE2B_PERSONALBYTES bytes of personalization into P. */
+static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
+{
+  memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
+  return 0;
+}
+
+/* Reset the state: zero every field, then load the IV into the chaining
+ * value h. Always returns 0. */
+static inline int blake2b_init0( blake2b_state *S )
+{
+  size_t word;
+
+  memset( S, 0, sizeof( *S ) );
+
+  for( word = 0; word < sizeof( blake2b_IV ) / sizeof( blake2b_IV[0] ); ++word )
+  {
+    S->h[word] = blake2b_IV[word];
+  }
+
+  return 0;
+}
+
+/* Initialise S from parameter block P: the whole state is zeroed, then the
+ * first BLAKE2B_OUTBYTES bytes of h are set to the byte-wise XOR of the IV
+ * and the parameter block, both read as they lie in memory.
+ * Always returns 0. */
+int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
+{
+  const uint8_t *iv_bytes    = ( const uint8_t * )blake2b_IV;
+  const uint8_t *param_bytes = ( const uint8_t * )P;
+  uint8_t *chain_bytes       = ( uint8_t * )S->h;
+  int idx;
+
+  memset( S, 0, sizeof( *S ) );
+
+  /* IV XOR ParamBlock */
+  for( idx = 0; idx < BLAKE2B_OUTBYTES; ++idx )
+  {
+    chain_bytes[idx] = iv_bytes[idx] ^ param_bytes[idx];
+  }
+
+  return 0;
+}
+
+
+/* Initialise S for an unkeyed, sequential hash with an outlen-byte digest.
+ * The parameter block is filled positionally: outlen first, 1 in the third
+ * and fourth slots, zero everywhere else.
+ * Returns -1 when outlen is 0 or exceeds BLAKE2B_OUTBYTES. */
+int blake2b_init( blake2b_state *S, const uint8_t outlen )
+{
+  if( 0 == outlen || outlen > BLAKE2B_OUTBYTES )
+  {
+    return -1;
+  }
+
+  const blake2b_param params =
+  {
+    outlen, 0, 1, 1,
+    0, 0, 0, 0,
+    {0}, {0}, {0}
+  };
+
+  return blake2b_init_param( S, &params );
+}
+
+/* Initialise S for a keyed hash with an outlen-byte digest.
+ *
+ * The key is zero-padded to one full block and absorbed as the first block
+ * of input; the padded copy is wiped from the stack afterwards.
+ *
+ * Returns 0 on success. Returns -1 when outlen is 0 or exceeds
+ * BLAKE2B_OUTBYTES, when key is NULL, when keylen is 0 or exceeds
+ * BLAKE2B_KEYBYTES, or when blake2b_init_param() fails. That last case
+ * used to be reported as success. */
+int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+{
+  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+  if ( !key || ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+  const blake2b_param P =
+  {
+    outlen,
+    keylen,
+    1,
+    1,
+    0,
+    0,
+    0,
+    0,
+    {0},
+    {0},
+    {0}
+  };
+
+  if( blake2b_init_param( S, &P ) < 0 )
+    return -1;
+
+  {
+    uint8_t block[BLAKE2B_BLOCKBYTES];
+    memset( block, 0, BLAKE2B_BLOCKBYTES );
+    memcpy( block, key, keylen );
+    blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
+    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+  }
+  return 0;
+}
+/* SIMD compression function: mixes one block into the chaining value S->h
+ * using the counter S->t and the finalization flags S->f. Runs rounds 0..11
+ * through the ROUND macro, which reads the local names declared here
+ * (row*, b0/b1, t0/t1, m*, r16/r24). Always returns 0. */
+static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+  __m128i row1l, row1h;
+  __m128i row2l, row2h;
+  __m128i row3l, row3h;
+  __m128i row4l, row4h;
+  __m128i b0, b1;
+  __m128i t0, t1;
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
+  /* byte-shuffle masks referenced by name inside _mm_roti_epi64 */
+  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
+  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
+#endif
+#if defined(HAVE_SSE41)
+  /* SSE4.1: eight unaligned 16-byte loads, two message words per register */
+  const __m128i m0 = LOADU( block + 00 );
+  const __m128i m1 = LOADU( block + 16 );
+  const __m128i m2 = LOADU( block + 32 );
+  const __m128i m3 = LOADU( block + 48 );
+  const __m128i m4 = LOADU( block + 64 );
+  const __m128i m5 = LOADU( block + 80 );
+  const __m128i m6 = LOADU( block + 96 );
+  const __m128i m7 = LOADU( block + 112 );
+#else
+  /* otherwise: sixteen scalar words read through a uint64_t cast.
+   * NOTE(review): the cast presumes block may be accessed as uint64_t
+   * (alignment/aliasing); confirm against the callers. */
+  const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
+  const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
+  const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
+  const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
+  const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
+  const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
+  const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
+  const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
+  const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
+  const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
+  const uint64_t m10 = ( ( uint64_t * )block )[10];
+  const uint64_t m11 = ( ( uint64_t * )block )[11];
+  const uint64_t m12 = ( ( uint64_t * )block )[12];
+  const uint64_t m13 = ( ( uint64_t * )block )[13];
+  const uint64_t m14 = ( ( uint64_t * )block )[14];
+  const uint64_t m15 = ( ( uint64_t * )block )[15];
+#endif
+  /* rows 1-2: chaining value; row 3: IV[0..3]; row 4: IV[4..7] XOR (t, f) */
+  row1l = LOADU( &S->h[0] );
+  row1h = LOADU( &S->h[2] );
+  row2l = LOADU( &S->h[4] );
+  row2h = LOADU( &S->h[6] );
+  row3l = LOADU( &blake2b_IV[0] );
+  row3h = LOADU( &blake2b_IV[2] );
+  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
+  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
+  ROUND( 0 );
+  ROUND( 1 );
+  ROUND( 2 );
+  ROUND( 3 );
+  ROUND( 4 );
+  ROUND( 5 );
+  ROUND( 6 );
+  ROUND( 7 );
+  ROUND( 8 );
+  ROUND( 9 );
+  ROUND( 10 );
+  ROUND( 11 );
+  /* feed-forward: h[0..3] ^= row1 ^ row3, h[4..7] ^= row2 ^ row4 */
+  row1l = _mm_xor_si128( row3l, row1l );
+  row1h = _mm_xor_si128( row3h, row1h );
+  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
+  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
+  row2l = _mm_xor_si128( row4l, row2l );
+  row2h = _mm_xor_si128( row4h, row2h );
+  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
+  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
+  return 0;
+}
+
+
+/* Absorb inlen bytes from in.
+ * The state buffers up to two blocks. A block is compressed only once more
+ * input than the buffer can hold has arrived, so the last block always stays
+ * buffered for blake2b_final(). Always returns 0. */
+int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
+{
+  while( inlen > 0 )
+  {
+    const size_t used = S->buflen;
+    const size_t room = 2 * BLAKE2B_BLOCKBYTES - used;
+
+    if( inlen <= room )
+    {
+      /* Everything fits: be lazy, do not compress */
+      memcpy( S->buf + used, in, inlen );
+      S->buflen += inlen;
+      break;
+    }
+
+    /* Top up the buffer, compress its first block, shift the second down */
+    memcpy( S->buf + used, in, room );
+    S->buflen += room;
+    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+    blake2b_compress( S, S->buf );
+    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
+    S->buflen -= BLAKE2B_BLOCKBYTES;
+    in += room;
+    inlen -= room;
+  }
+
+  return 0;
+}
+
+
+/* Finish the hash and copy the first outlen bytes of the chaining value h,
+ * as it lies in memory, to out.
+ * If more than one block is still buffered, the first is compressed as an
+ * ordinary block; the remainder is zero-padded, flagged as the last block
+ * and compressed.
+ * Returns -1 when outlen exceeds BLAKE2B_OUTBYTES, 0 otherwise. */
+int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
+{
+  if( !( outlen <= BLAKE2B_OUTBYTES ) )
+  {
+    return -1;
+  }
+
+  if( S->buflen > BLAKE2B_BLOCKBYTES )
+  {
+    /* A full block plus a tail is pending: compress the full block first */
+    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+    blake2b_compress( S, S->buf );
+    S->buflen -= BLAKE2B_BLOCKBYTES;
+    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
+  }
+
+  blake2b_increment_counter( S, S->buflen );
+  blake2b_set_lastblock( S );
+  /* Zero-pad the rest of the two-block buffer */
+  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen );
+  blake2b_compress( S, S->buf );
+
+  memcpy( out, S->h, outlen );
+  return 0;
+}
+
+
+/* One-shot BLAKE2b: hash inlen bytes from in, optionally keyed, and write an
+ * outlen-byte digest to out.
+ * Returns -1 when in is NULL with a non-zero inlen, out is NULL, key is NULL
+ * with a non-zero keylen, outlen is 0 or exceeds BLAKE2B_OUTBYTES, keylen
+ * exceeds BLAKE2B_KEYBYTES, or initialisation fails; 0 otherwise. */
+int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+  blake2b_state S[1];
+  int status;
+
+  /* Pointer arguments */
+  if( ( NULL == in && inlen > 0 ) || NULL == out || ( NULL == key && keylen > 0 ) )
+    return -1;
+
+  /* Length arguments */
+  if( 0 == outlen || outlen > BLAKE2B_OUTBYTES || keylen > BLAKE2B_KEYBYTES )
+    return -1;
+
+  if( keylen != 0 )
+    status = blake2b_init_key( S, outlen, key, keylen );
+  else
+    status = blake2b_init( S, outlen );
+
+  if( status < 0 )
+    return -1;
+
+  blake2b_update( S, ( const uint8_t * )in, inlen );
+  blake2b_final( S, out, outlen );
+  return 0;
+}
+
+#if defined(SUPERCOP)
+/* SUPERCOP entry point: unkeyed BLAKE2b with a full BLAKE2B_OUTBYTES digest.
+ * Returns whatever blake2b() returns. */
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+  const int status = blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
+  return status;
+}
+#endif
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+/* Self-test: hashes every prefix length 0..KAT_LENGTH-1 of the byte sequence
+ * 0,1,2,... under the key 0,1,2,... and compares each digest with the
+ * matching entry of blake2b_keyed_kat. Prints "ok" and returns 0 when all
+ * match; prints "error" and returns -1 at the first mismatch. */
+int main( int argc, char **argv )
+{
+  uint8_t key[BLAKE2B_KEYBYTES];
+  uint8_t buf[KAT_LENGTH];
+  size_t pos;
+  size_t len;
+
+  for( pos = 0; pos < BLAKE2B_KEYBYTES; ++pos )
+    key[pos] = ( uint8_t )pos;
+
+  for( pos = 0; pos < KAT_LENGTH; ++pos )
+    buf[pos] = ( uint8_t )pos;
+
+  for( len = 0; len < KAT_LENGTH; ++len )
+  {
+    uint8_t hash[BLAKE2B_OUTBYTES];
+    blake2b( hash, buf, key, BLAKE2B_OUTBYTES, len, BLAKE2B_KEYBYTES );
+
+    if( memcmp( hash, blake2b_keyed_kat[len], BLAKE2B_OUTBYTES ) != 0 )
+    {
+      puts( "error" );
+      return -1;
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+}
+#endif
+
+/* Argon2 Team - Begin Code */
+/*
+ * Variable-length hash built on BLAKE2b.
+ *
+ * The 32-bit output length is hashed in front of the input. It is serialised
+ * with store32() instead of hashing the host representation of outlen.
+ * NOTE(review): store32() is presumed to come from blake2-impl.h and to write
+ * little-endian, which matches the previous output on little-endian hosts;
+ * confirm against that header.
+ *
+ * outlen <= BLAKE2B_OUTBYTES: a single BLAKE2b call with digest length outlen.
+ * Otherwise: a chain of full-size digests is computed, each hashing the
+ * previous one; the first half of every digest but the last is emitted, and
+ * the last digest, of the remaining length, is emitted whole.
+ *
+ * Returns 0 on success; -1 when out is NULL, outlen is 0, in is NULL with a
+ * non-zero inlen, or any BLAKE2b step fails. Intermediate digests and the
+ * hash state are wiped before returning.
+ */
+int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
+{
+	blake2b_state blake_state;
+	uint8_t outlen_bytes[sizeof(uint32_t)];
+	int ret;
+
+	if (NULL == out || 0 == outlen || (NULL == in && inlen > 0))
+		return -1;
+
+	store32(outlen_bytes, outlen);
+
+	if (outlen <= BLAKE2B_OUTBYTES)
+	{
+		ret = blake2b_init(&blake_state, (uint8_t)outlen);
+		if (0 == ret) ret = blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
+		if (0 == ret) ret = blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		if (0 == ret) ret = blake2b_final(&blake_state, out, (uint8_t)outlen);
+	}
+	else
+	{
+		uint8_t out_buffer[BLAKE2B_OUTBYTES];
+		uint8_t in_buffer[BLAKE2B_OUTBYTES];
+		uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+
+		ret = blake2b_init(&blake_state, BLAKE2B_OUTBYTES);
+		if (0 == ret) ret = blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
+		if (0 == ret) ret = blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		if (0 == ret) ret = blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES);
+
+		if (0 == ret)
+		{
+			memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+			out += BLAKE2B_OUTBYTES / 2;
+		}
+
+		/* Middle links: hash the previous digest, emit its first half */
+		while (0 == ret && toproduce > BLAKE2B_OUTBYTES)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			ret = blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0);
+			if (0 == ret)
+			{
+				memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+				out += BLAKE2B_OUTBYTES / 2;
+				toproduce -= BLAKE2B_OUTBYTES / 2;
+			}
+		}
+
+		/* Last link: toproduce is now at most BLAKE2B_OUTBYTES */
+		if (0 == ret)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			ret = blake2b(out_buffer, in_buffer, NULL, (uint8_t)toproduce, BLAKE2B_OUTBYTES, 0);
+		}
+		if (0 == ret)
+			memcpy(out, out_buffer, toproduce);
+
+		secure_zero_memory(out_buffer, sizeof(out_buffer));
+		secure_zero_memory(in_buffer, sizeof(in_buffer));
+	}
+
+	secure_zero_memory(&blake_state, sizeof(blake_state));
+	return (0 == ret) ? 0 : -1;
+}
+/* Argon2 Team - End Code */
--- a/src/blake2/brg-endian.h
+++ b/src/blake2/brg-endian.h
@ -0,0 +1,143 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+ Changes for ARM 9/9/2010
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+/* Sets PLATFORM_BYTE_ORDER to IS_BIG_ENDIAN or IS_LITTLE_ENDIAN at preprocessing time. */
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+#if 0 /* compiled out: no system endian header is included by this file */
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+#endif /* #if 0 */
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+/* Both symbols defined: compare the BYTE_ORDER variant; only one defined: use it. */
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+/* Same test for the _SYMBOL form. */
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+/* Same test for the __SYMBOL form. */
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+/* Same test for the __SYMBOL__ form. */
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+/* Machine/compiler macros treated as little-endian. */
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+/* Machine/compiler macros treated as big-endian. */
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+/* ARM: big-endian only when __BIG_ENDIAN is defined, little-endian otherwise. */
+#elif defined(__arm__)
+# ifdef __BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# else
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif 1     /* **** EDIT HERE IF NECESSARY **** (default when nothing above matched) */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** (never reached while the branch above is 1) */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
+#endif
+/* NOTE: the #error text cites file lines 132/134 (the two EDIT HERE branches); keep the line layout stable. */
+#endif /* !defined(PLATFORM_BYTE_ORDER) */
+
+#endif /* _BRG_ENDIAN_H */
+
--- a/src/kat.c
+++ b/src/kat.c
@ -0,0 +1,128 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#include "stdio.h"
+#include "inttypes.h"
+
+#include "argon2.h"
+#include "argon2-core.h"
+
+
+
+#ifdef ARGON2_KAT
+
+/*
+ * Writes len bytes starting at data to fp as two-digit lowercase hex values,
+ * each followed by a space, then ends the line.
+ */
+static void KatPrintHexLine(FILE* fp, const void* data, size_t len) {
+    const unsigned char* bytes = (const unsigned char*) data;
+
+    for (size_t i = 0; i < len; ++i) {
+        fprintf(fp, "%2.2x ", bytes[i]);
+    }
+    fprintf(fp, "\n");
+}
+
+/*
+ * Appends a description of the hashing inputs to the file ARGON2_KAT_FILENAME:
+ * the Argon2 type, the cost parameters, password, salt, secret, associated
+ * data and the pre-hashing digest.
+ * @param  blockhash  Pre-hashing digest; ARGON2_PREHASH_DIGEST_LENGTH bytes are read
+ * @param  context  Holds the inputs to print
+ * @param  type  Argon2 type; an unrecognised value prints no type name
+ * Nothing is written when a pointer is NULL or the file cannot be opened.
+ */
+void InitialKat(const uint8_t* blockhash, const Argon2_Context* context, Argon2_type type) {
+    FILE* fp;
+
+    /* Validate before opening so that no stream is left open on the early-out path. */
+    if (blockhash == NULL || context == NULL) {
+        return;
+    }
+
+    fp = fopen(ARGON2_KAT_FILENAME, "a+");
+    if (fp == NULL) {
+        return;
+    }
+
+    fprintf(fp, "=======================================");
+
+    switch (type) {
+        case Argon2_d:
+            fprintf(fp, "Argon2d\n");
+            break;
+        case Argon2_i:
+            fprintf(fp, "Argon2i\n");
+            break;
+        case Argon2_id:
+            fprintf(fp, "Argon2id\n");
+            break;
+        case Argon2_ds:
+            fprintf(fp, "Argon2ds\n");
+            break;
+        default:
+            break;
+    }
+
+    /* NOTE(review): these fields look unsigned (they bound unsigned loops elsewhere);
+     * confirm their types in argon2.h before switching the %d specifiers. */
+    fprintf(fp, "Iterations: %d, Memory: %d KBytes, Parallelism: %d lanes, Tag length: %d bytes\n",
+            context->t_cost, context->m_cost, context->lanes, context->outlen);
+
+    fprintf(fp, "Password[%d]: ", context->pwdlen);
+    if (context->clear_password) {
+        /* The password is not dumped when the context asks for it to be cleared. */
+        fprintf(fp, "CLEARED\n");
+    } else {
+        KatPrintHexLine(fp, context->pwd, context->pwdlen);
+    }
+
+    fprintf(fp, "Salt[%d]: ", context->saltlen);
+    KatPrintHexLine(fp, context->salt, context->saltlen);
+
+    fprintf(fp, "Secret[%d]: ", context->secretlen);
+    if (context->clear_secret) {
+        /* Same rule as for the password. */
+        fprintf(fp, "CLEARED\n");
+    } else {
+        KatPrintHexLine(fp, context->secret, context->secretlen);
+    }
+
+    fprintf(fp, "Associated data[%d]: ", context->adlen);
+    KatPrintHexLine(fp, context->ad, context->adlen);
+
+    fprintf(fp, "Pre-hashing digest: ");
+    KatPrintHexLine(fp, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+
+    fclose(fp);
+}
+
+/*
+ * Appends the output tag to the file ARGON2_KAT_FILENAME as a line of
+ * space-separated two-digit hex bytes prefixed with "Tag: ".
+ * @param  out  output array pointer; outlen bytes are read
+ * @param  outlen  tag length in bytes
+ * Nothing is written when out is NULL or the file cannot be opened.
+ */
+void PrintTag(const void* out, uint32_t outlen) {
+    const uint8_t* tag = (const uint8_t*) out;
+    FILE* fp;
+
+    /* Validate before opening so that no stream is left open on the early-out path. */
+    if (tag == NULL) {
+        return;
+    }
+
+    fp = fopen(ARGON2_KAT_FILENAME, "a+");
+    if (fp == NULL) {
+        return;
+    }
+
+    fprintf(fp, "Tag: ");
+    for (uint32_t i = 0; i < outlen; ++i) {
+        fprintf(fp, "%2.2x ", tag[i]);
+    }
+    fprintf(fp, "\n");
+
+    fclose(fp);
+}
+#endif
+
+
+#ifdef ARGON2_KAT_INTERNAL
+
+/*
+ * Appends a dump of the memory blocks after the given pass to the file
+ * ARGON2_KAT_FILENAME.  When the instance has more than ARGON2_WORDS_IN_BLOCK
+ * blocks only the first word of each block is printed; otherwise all
+ * ARGON2_WORDS_IN_BLOCK words of each block are printed.
+ * @param  instance  pointer to the current instance
+ * @param  pass  current pass number
+ * Nothing is written when instance is NULL or the file cannot be opened.
+ */
+void InternalKat(const Argon2_instance_t* instance, uint32_t pass) {
+    FILE* fp;
+    uint32_t how_many_words;
+
+    /* Validate before opening so that no stream is left open on the early-out path. */
+    if (instance == NULL) {
+        return;
+    }
+
+    fp = fopen(ARGON2_KAT_FILENAME, "a+");
+    if (fp == NULL) {
+        return;
+    }
+
+    /* Depends only on the instance, so it is computed once rather than per block. */
+    how_many_words = (instance->memory_blocks > ARGON2_WORDS_IN_BLOCK) ? 1 : ARGON2_WORDS_IN_BLOCK;
+
+    fprintf(fp, "\n After pass %" PRIu32 ":\n", pass);
+    for (uint32_t i = 0; i < instance->memory_blocks; ++i) {
+        for (uint32_t j = 0; j < how_many_words; ++j) {
+            fprintf(fp, "Block %.4" PRIu32 " [%3" PRIu32 "]: %016" PRIx64 "\n", i, j, instance->memory[i].v[j]);
+        }
+    }
+
+    fclose(fp);
+}
+#endif
--- a/src/kat.h
+++ b/src/kat.h
@ -0,0 +1,44 @@
+/*
+ * Argon2 source code package
+ * 
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ * 
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ * 
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+
+#ifndef __ARGON2_KAT_H__
+#define __ARGON2_KAT_H__
+
+/*
+ * Declarations of the KAT output helpers implemented in kat.c.
+ * InitialKat and PrintTag are only defined when ARGON2_KAT is defined;
+ * InternalKat is only defined when ARGON2_KAT_INTERNAL is defined.
+ * All three append to the file named by ARGON2_KAT_FILENAME.
+ *
+ * NOTE(review): this header includes nothing itself, so uint8_t, uint32_t,
+ * Argon2_Context, Argon2_type and Argon2_instance_t must already be declared
+ * by the including file - confirm the intended include order.
+ */
+
+/*
+ * Initial KAT function that appends the inputs to the file
+ * @param  blockhash  Array that contains pre-hashing digest
+ * @param  context Holds inputs
+ * @param  type Argon2 type
+ * @pre blockhash must point to ARGON2_PREHASH_DIGEST_LENGTH bytes (the number of bytes the implementation reads)
+ * @pre context member pointers must point to allocated memory of size according to the length values
+ * @note nothing is written if blockhash or context is NULL, or if the file cannot be opened
+ */
+void InitialKat(const uint8_t* blockhash, const Argon2_Context* context, Argon2_type type);
+
+/*
+ * Function that appends the output tag to the file as hex bytes
+ * @param  out  output array pointer
+ * @param  outlen digest length
+ * @pre out must point to @a outlen bytes
+ * @note nothing is written if out is NULL, or if the file cannot be opened
+ **/
+void PrintTag(const void* out, uint32_t outlen);
+
+/*
+ * Function that appends the internal state (memory block words) at given moment
+ * @param  instance pointer to the current instance
+ * @param  pass current pass number
+ * @pre instance must have necessary memory allocated
+ * @note only the first word of each block is printed when the instance has more
+ *       than ARGON2_WORDS_IN_BLOCK blocks; otherwise every word is printed
+ * @note nothing is written if instance is NULL, or if the file cannot be opened
+ **/
+void InternalKat(const Argon2_instance_t* instance, uint32_t pass);
+
+
+#endif
--- a/testvectors/Argon2d.txt
+++ b/testvectors/Argon2d.txt
--- a/testvectors/Argon2ds.txt
+++ b/testvectors/Argon2ds.txt
--- a/testvectors/Argon2i.txt
+++ b/testvectors/Argon2i.txt
--- a/testvectors/Argon2id.txt
+++ b/testvectors/Argon2id.txt