From 56cc0239d84c13dcdfd0d3c25bdda6534fab96d4 Mon Sep 17 00:00:00 2001
From: surtur
Date: Wed, 23 Aug 2023 20:05:58 +0200
Subject: [PATCH] tex: extend, reword the hash functions section

---
 tex/part-theoretical.tex | 89 ++++++++++++++++++++++------------------
 tex/references.bib       | 42 +++++++++++++++++++
 2 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/tex/part-theoretical.tex b/tex/part-theoretical.tex
index b87fcc7..c382fc4 100644
--- a/tex/part-theoretical.tex
+++ b/tex/part-theoretical.tex
@@ -43,21 +43,19 @@ to a website protected by the famed HTTPS.
 The popularity of hash functions stems from a common use case: the need to
 simplify reliably identifying a chunk of data. Of course, two chunks of data,
 two files, frames or packets could always be compared bit by bit, but that can
-get prohibitive from both cost and energy point of view relatively quickly.
-That is when the hash functions come in, since they are able to take a long
-input and produce a short output, named a digest or a hash value. The function
-also only works one way.
-
-A file, or any original input data for that matter, cannot be reconstructed
-from the hash digest alone by somehow \emph{reversing} the hashing operation,
-since at the heart of any hash function there is essentially a compression
-function.
+get prohibitive from both a cost and an energy point of view rather quickly,
+with transport channels often being insecure and unreliable. That is where
+hash functions come in, since they are able to take a long input and produce a
+short output, called a digest or hash value. The function also only works one
+way. A file, or any original input data for that matter, cannot be
+reconstructed from the hash digest alone by somehow \emph{reversing} the
+hashing operation, since at the heart of any hash function there is
+essentially a compression function.
 
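The behaviour the reworded paragraph describes, arbitrarily long input mapping to a short, fixed-size digest that identifies the data, can be sketched in a few lines of Python using the standard library's hashlib; SHA-256 serves here as a generic stand-in hash function:

```python
import hashlib

# Inputs of wildly different lengths...
short_msg = b"hi"
long_msg = b"A" * 1_000_000  # ~1 MB of data

# ...both map to digests of the same, fixed size.
d_short = hashlib.sha256(short_msg).hexdigest()
d_long = hashlib.sha256(long_msg).hexdigest()
assert len(d_short) == len(d_long) == 64  # 256 bits, hex-encoded

# Equal digests identify equal data without a bit-by-bit comparison;
# different data hashes to different digests (with overwhelming probability).
assert d_short != d_long
assert hashlib.sha256(b"A" * 1_000_000).hexdigest() == d_long
```

The digest alone reveals nothing usable about the input; recovering it would require guessing candidate inputs and re-hashing them, which is exactly the caveat the following hunk raises for passwords.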
 Most alluringly, hashes are frequently used with the intent of
 \emph{protecting} passwords by making those unreadable, while still being able
-to verify that the user knows the password, therefore should be authorised.
-
-As the hashing operation is irreversible, once the one-way function produces a
+to verify that the user knows the password and should therefore be authorised.
+As the hashing operation is irreversible, once the one-way function produces a
 short a digest, there is no way to reconstruct the original message from it.
 That is, unless the input of the hash function is also known, in which case all
 it takes is hashing the supposed input and comparing the digest with existing
@@ -66,41 +64,52 @@ digests that are known to be digests of passwords.
 
 \n{3}{Types and use cases}
 
-Hash functions can be loosely categorised based on their intended use case to
-\emph{password protection hashes}, \emph{integrity verification hashes},
-\emph{message authentication codes} and \emph{cryptographic hashes}. Each of
-these possess unique characteristics and using the wrong type of hash function
-for the wrong job can potentially result in a security breach.
+Hash functions can be loosely categorised by their intended cryptographic
+application into \emph{password protection}, \emph{integrity verification} and
+\emph{message authentication} hashes. Each of them possesses unique
+characteristics, and using the wrong type of hash function for the wrong job
+can potentially result in a security breach.
 
-As an example, suppose \texttt{MD5}, a popular hash function internally using
-the same data structure - \emph{Merkle-Damgård} construction - as
-\texttt{BLAKE3}. The former produces 128 bit digests, compared to the default
-256 bits of output and no upper ($<2^{64}$ bytes) limit (Merkle tree
-extensibility) for the latter. There is a list of differences that could
-further be mentioned, however, they both have one thing in common: they are
-\emph{designed} to be \emph{fast}. The latter, as a cryptographic hash
-function, is conjectured to be \emph{random oracle indifferentiable}, secure
-against length extension, but it is also in fact faster than all of
+As a contrived example, consider \texttt{MD5}, a popular hash function built
+on the \emph{Merkle-Damgård} (MD) construction, and \texttt{BLAKE3}, which
+instead arranges its compression function into a binary Merkle tree. The
+former produces 128-bit digests, while the latter defaults to 256 bits of
+output that can be extended to practically arbitrary length ($<2^{64}$ bytes,
+Merkle tree extensibility). Aside from \texttt{MD5} being considered
+\emph{broken} with regard to collision
+resistance~\cite{md5collision}~\cite{md5collision2} (and having theoretically
+weakened resistance to preimages~\cite{md5preimage}~\cite{md5preimage2}),
+further differences could be listed; however, the two have one thing in
+common: they are \emph{designed} to be \emph{fast}. The latter, a
+cryptographic hash function, is conjectured to be \emph{random oracle
+indifferentiable}, is secure against length extension, and was built with
+preimage and collision resistance in mind. That said, it is in fact also
+faster than all of
 \texttt{MD5}, \texttt{SHA3-256}, \texttt{SHA-1} and even \texttt{Blake2}
 family of functions~\cite{blake3}.
 
-The use case of both is to (quickly) verify integrity of a given chunk of data,
-in case of \texttt{BLAKE3} with pre-image and collision resistance in mind, not
-to secure a password by hashing it first, which poses a big issue when used
+\begin{lstlisting}[caption=Broken collision resistance of
+\texttt{MD5},label=md5,backgroundcolor=\color{lstbg}]
+ m := x
+ m' := y
+ m != m'
+ MD5(m) == MD5(m')
+\end{lstlisting}
+
+However, the default use case of both \texttt{MD5} and \texttt{BLAKE3}
+(unkeyed) is to (quickly) verify the integrity of a given chunk of data, not
+to secure a password by hashing it first, which poses a big issue when used
 to...secure passwords by hashing them first.
 
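The unkeyed integrity-verification use case the added paragraph contrasts with password hashing can be sketched as follows. BLAKE3 is not available in Python's hashlib, so BLAKE2b stands in for it here (the `digest_size=32` choice and the sample byte strings are this sketch's assumptions, not anything from the patch):

```python
import hashlib

data = b"some chunk of data worth verifying"

# MD5 produces 128-bit digests; BLAKE2b (standing in for BLAKE3, which is
# not in hashlib) is configured here for 256-bit output.
md5_digest = hashlib.md5(data).digest()
b2_digest = hashlib.blake2b(data, digest_size=32).digest()
assert len(md5_digest) * 8 == 128
assert len(b2_digest) * 8 == 256

# Integrity verification: the receiver recomputes the digest and compares.
received = b"some chunk of data worth verifying"
assert hashlib.blake2b(received, digest_size=32).digest() == b2_digest

# A single flipped byte is caught immediately.
tampered = b"some chunk of data worth verifyinG"
assert hashlib.blake2b(tampered, digest_size=32).digest() != b2_digest
```

Note that this detects accidental corruption; detecting malicious tampering additionally requires a keyed mode (a MAC), since an attacker who can replace the data can also replace an unkeyed digest.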
 Password hashing functions such as \texttt{argon2} or \texttt{bcrypt} are good
-choices for \emph{securely} storing hashed passwords, namely because they place
-CPU and memory burden on the machine that is computing the digest. In case of
-the mentioned functions, \emph{hardness} is even configurable to satisfy the
-greatest possible array of scenarios. These functions also forcefully limit
-potential parallelism, thereby restricting the scale at which exhaustive
-searches performed using tools like \texttt{Hashcat} or \texttt{John the
-Ripper} could be at all feasible, practically obviating old-school hash
-cracking~\cite{hashcracking},~\cite{hashcracking2}. Additionally, both
-functions can automatically add random \emph{salt} to passwords, automatically
-ensuring that no copies of the same password provided by different users will
-end up hashing to the same digest value.
+choices for \emph{securely} storing password representations, namely because
+they place a CPU and memory burden on the machine computing the digest. In
+case of the mentioned functions, \emph{hardness} is even configurable to
+satisfy the greatest possible array of scenarios. These functions also
+forcefully limit potential parallelism, thereby restricting the scale at which
+exhaustive searches performed using tools like \texttt{Hashcat} or
+\texttt{John the Ripper} could be at all feasible. Additionally, both
+functions can automatically add random \emph{salts} to passwords, ensuring
+that no copies of the same password provided by different users end up hashing
+to the same digest value, which for practical purposes obviates large-scale
+old-school hash cracking~\cite{hashcracking},~\cite{hashcracking2}.
 
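The salting and configurable hardness described above can be illustrated without third-party packages: argon2 and bcrypt live outside the standard library, so this sketch uses hashlib's scrypt, another tunable memory-hard KDF; the `hash_password` helper and its cost parameters are illustrative assumptions, not production settings:

```python
import hashlib
import hmac
import os

def hash_password(password: bytes, salt: bytes) -> bytes:
    # scrypt is a memory-hard KDF from the standard library; its cost
    # parameters (n, r, p) play the same role as the configurable
    # "hardness" of argon2 or bcrypt. Values here are illustrative only.
    return hashlib.scrypt(password, salt=salt, n=2**14, r=8, p=1,
                          maxmem=2**26, dklen=32)

# Two users pick the same password...
salt_a, salt_b = os.urandom(16), os.urandom(16)
digest_a = hash_password(b"hunter2", salt_a)
digest_b = hash_password(b"hunter2", salt_b)

# ...but random per-user salts make the stored digests differ,
# defeating precomputed (rainbow) table attacks.
assert digest_a != digest_b

# Verification re-derives the digest with the stored salt and compares
# it in constant time.
assert hmac.compare_digest(hash_password(b"hunter2", salt_a), digest_a)
```

Stored alongside each digest, the salt is not a secret; its job is only to make every hashed password unique, so each one must be attacked individually.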
 \n{3}{Why are hashes interesting}
 
diff --git a/tex/references.bib b/tex/references.bib
index 341ab35..9aecfae 100644
--- a/tex/references.bib
+++ b/tex/references.bib
@@ -513,4 +513,46 @@ and-wealth-of-other-data-for-6-6-million-people-go-public/} [viewed 2023-08-13]}
 note={{Available from:
 \url{https://securitynirvana.blogspot.com/2012/06/linkedin-password-infographic.html}
 [viewed 2023-08-13]}}
 }
+@inproceedings{md5collision,
+  author = {Wang, Xiaoyun and Yu, Hongbo},
+  title = {How to Break MD5 and Other Hash Functions},
+  booktitle = {Advances in Cryptology -- EUROCRYPT 2005},
+  series = {Lecture Notes in Computer Science},
+  volume = {3494},
+  year = {2005},
+  pages = {19-35},
+  isbn = {978-3-540-25910-7},
+  doi = {10.1007/11426639_2}
+}
+
+@article{md5collision2,
+  author = {Klíma, Vlastimil},
+  title = {Tunnels in Hash Functions: MD5 Collisions Within a Minute},
+  journal = {IACR Cryptology ePrint Archive},
+  volume = {2006},
+  pages = {105},
+  year = {2006},
+  month = jan
+}
+
+@inproceedings{md5preimage,
+  author = {Sasaki, Yu and Aoki, Kazumaro},
+  title = {Finding Preimages in Full MD5 Faster Than Exhaustive Search},
+  booktitle = {Advances in Cryptology -- EUROCRYPT 2009},
+  publisher = {Springer, Berlin, Heidelberg},
+  year = {2009},
+  pages = {134-152},
+  doi = {10.1007/978-3-642-01001-9_8}
+}
+
+@inproceedings{md5preimage2,
+  author = {Mao, Ming and Chen, Shaohui and Xu, Jin},
+  title = {Construction of the Initial Structure for Preimage Attack of MD5},
+  booktitle = {2009 International Conference on Computational Intelligence
+               and Security},
+  year = {2009},
+  volume = {1},
+  pages = {442-445},
+  doi = {10.1109/CIS.2009.214}
+}
+
 % =========================================================================== %