daemon: Do not deduplicate files smaller than 8 KiB.

Files smaller than 8 KiB typically represent ~70% of the entries in
/gnu/store/.links but only contribute to ~4% of the space savings
afforded by deduplication.

Not considering these files for deduplication speeds up file insertion
in the store and, more importantly, leaves 'removeUnusedLinks' with
fewer entries to traverse, thereby speeding it up proportionally.

Partly fixes <https://issues.guix.gnu.org/24937>.

* config-daemon.ac: Remove symlink hard link check and CAN_LINK_SYMLINK
definition.
* guix/store/deduplication.scm (%deduplication-minimum-size): New
variable.
(deduplicate)[loop]: Do not recurse when FILE's size is below
%DEDUPLICATION-MINIMUM-SIZE.
(dump-port): New procedure.
(dump-file/deduplicate)[hash]: Turn into...
[dump-and-compute-hash]: ... this thunk.
Call 'deduplicate' only when SIZE is greater than or equal to
%DEDUPLICATION-MINIMUM-SIZE; otherwise call 'dump-port'.
* nix/libstore/gc.cc (LocalStore::removeUnusedLinks): Drop files where
st.st_size < deduplicationMinSize.
* nix/libstore/local-store.hh (deduplicationMinSize): New declaration.
* nix/libstore/optimise-store.cc (deduplicationMinSize): New variable.
(LocalStore::optimisePath_): Return when PATH is a symlink or smaller
than 'deduplicationMinSize'.
* tests/derivations.scm ("identical files are deduplicated"): Produce
files bigger than %DEDUPLICATION-MINIMUM-SIZE.
* tests/nar.scm ("restore-file-set with directories (signed, valid)"):
Likewise.
* tests/store-deduplication.scm ("deduplicate, below %deduplication-minimum-size"):
New test.
("deduplicate", "deduplicate, ENOSPC"): Produce files bigger than
%DEDUPLICATION-MINIMUM-SIZE.
* tests/store.scm ("substitute, deduplication"): Likewise.
This commit is contained in:
Ludovic Courtès 2021-11-13 21:47:15 +01:00
parent f39397b210
commit 472a0e82a5
No known key found for this signature in database
GPG Key ID: 090B11993D9AEBB5
9 changed files with 126 additions and 42 deletions

@ -94,17 +94,6 @@ if test "x$guix_build_daemon" = "xyes"; then
AC_CHECK_FUNCS([lutimes lchown posix_fallocate sched_setaffinity \ AC_CHECK_FUNCS([lutimes lchown posix_fallocate sched_setaffinity \
statvfs nanosleep strsignal statx]) statvfs nanosleep strsignal statx])
dnl Check whether the store optimiser can optimise symlinks.
AC_MSG_CHECKING([whether it is possible to create a link to a symlink])
ln -s bla tmp_link
if ln tmp_link tmp_link2 2> /dev/null; then
AC_MSG_RESULT(yes)
AC_DEFINE(CAN_LINK_SYMLINK, 1, [Whether link() works on symlinks.])
else
AC_MSG_RESULT(no)
fi
rm -f tmp_link tmp_link2
dnl Check for <locale>. dnl Check for <locale>.
AC_LANG_PUSH(C++) AC_LANG_PUSH(C++)
AC_CHECK_HEADERS([locale]) AC_CHECK_HEADERS([locale])

@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org> ;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
;;; Copyright © 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -22,12 +22,13 @@
(define-module (guix store deduplication) (define-module (guix store deduplication)
#:use-module (gcrypt hash) #:use-module (gcrypt hash)
#:use-module (guix build utils) #:use-module ((guix build utils) #:hide (dump-port))
#:use-module (guix build syscalls) #:use-module (guix build syscalls)
#:use-module (guix base32) #:use-module (guix base32)
#:use-module (srfi srfi-11) #:use-module (srfi srfi-11)
#:use-module (srfi srfi-34) #:use-module (srfi srfi-34)
#:use-module (srfi srfi-35) #:use-module (srfi srfi-35)
#:use-module (rnrs bytevectors)
#:use-module (rnrs io ports) #:use-module (rnrs io ports)
#:use-module (ice-9 ftw) #:use-module (ice-9 ftw)
#:use-module (ice-9 match) #:use-module (ice-9 match)
@ -37,6 +38,31 @@
dump-file/deduplicate dump-file/deduplicate
copy-file/deduplicate)) copy-file/deduplicate))
;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
;; parameter.
(define* (dump-port in out
                    #:optional len
                    #:key (buffer-size 16384))
  "Copy data from IN to OUT in chunks of at most BUFFER-SIZE bytes.  When LEN
is a number, stop once LEN bytes have been copied or at end of file, whichever
comes first; when LEN is #f, copy until end of file."
  (define buffer
    (make-bytevector buffer-size))

  (define (read-chunk! copied)
    ;; Fill BUFFER with the next chunk.  When LEN is given, never request more
    ;; than what remains of it after COPIED bytes have been transferred.
    (get-bytevector-n! in buffer 0
                       (if len
                           (min (- len copied) buffer-size)
                           buffer-size)))

  (let loop ((copied 0)
             (count (read-chunk! 0)))
    (cond ((eof-object? count) #t)            ;input exhausted
          ((and len (= copied len)) #t)       ;LEN bytes transferred
          (else
           (put-bytevector out buffer 0 count)
           (let ((copied (+ copied count)))
             (loop copied (read-chunk! copied)))))))
(define (nar-sha256 file) (define (nar-sha256 file)
"Gives the sha256 hash of a file and the size of the file in nar form." "Gives the sha256 hash of a file and the size of the file in nar form."
(let-values (((port get-hash) (open-sha256-port))) (let-values (((port get-hash) (open-sha256-port)))
@ -127,11 +153,27 @@ Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
(unless (= EMLINK (system-error-errno args)) (unless (= EMLINK (system-error-errno args))
(apply throw args))))))) (apply throw args)))))))
(define %deduplication-minimum-size
;; Size in bytes (8 KiB) below which files are not deduplicated. This avoids
;; adding too many entries to '.links', which would slow down
;; 'removeUnusedLinks' while saving little space. Keep in sync with
;; 'deduplicationMinSize' in nix/libstore/optimise-store.cc.
8192)
(define* (deduplicate path hash #:key (store (%store-directory))) (define* (deduplicate path hash #:key (store (%store-directory)))
"Check if a store item with sha256 hash HASH already exists. If so, "Check if a store item with sha256 hash HASH already exists. If so,
replace PATH with a hardlink to the already-existing one. If not, register replace PATH with a hardlink to the already-existing one. If not, register
PATH so that future duplicates can hardlink to it. PATH is assumed to be PATH so that future duplicates can hardlink to it. PATH is assumed to be
under STORE." under STORE."
;; Lightweight promises: 'delay' wraps EXP in a thunk that evaluates it on
;; first use and caches the result; 'force' simply calls that thunk.
;; Note: #f doubles as the "not computed yet" marker, so an EXP that evaluates
;; to #f would be evaluated again on every 'force'.
(define-syntax-rule (delay exp)
(let ((value #f))
(lambda ()
(unless value
(set! value exp))
value)))
(define-syntax-rule (force promise)
(promise))
(define links-directory (define links-directory
(string-append store "/.links")) (string-append store "/.links"))
@ -144,13 +186,18 @@ under STORE."
((file . properties) ((file . properties)
(unless (member file '("." "..")) (unless (member file '("." ".."))
(let* ((file (string-append path "/" file)) (let* ((file (string-append path "/" file))
(st (delay (lstat file)))
(type (match (assoc-ref properties 'type) (type (match (assoc-ref properties 'type)
((or 'unknown #f) ((or 'unknown #f)
(stat:type (lstat file))) (stat:type (force st)))
(type type)))) (type type))))
(loop file type (when (or (eq? 'directory type)
(and (not (eq? 'directory type)) (and (eq? 'regular type)
(nar-sha256 file))))))) (>= (stat:size (force st))
%deduplication-minimum-size)))
(loop file type
(and (not (eq? 'directory type))
(nar-sha256 file))))))))
(scandir* path)) (scandir* path))
(let ((link-file (string-append links-directory "/" (let ((link-file (string-append links-directory "/"
(bytevector->nix-base32-string hash)))) (bytevector->nix-base32-string hash))))
@ -222,9 +269,9 @@ OUTPUT as it goes."
This procedure is suitable as a #:dump-file argument to 'restore-file'. When This procedure is suitable as a #:dump-file argument to 'restore-file'. When
used that way, it deduplicates files on the fly as they are restored, thereby used that way, it deduplicates files on the fly as they are restored, thereby
removing the need to a deduplication pass that would re-read all the files removing the need for a deduplication pass that would re-read all the files
down the road." down the road."
(define hash (define (dump-and-compute-hash)
(call-with-output-file file (call-with-output-file file
(lambda (output) (lambda (output)
(let-values (((hash-port get-hash) (let-values (((hash-port get-hash)
@ -236,7 +283,11 @@ down the road."
(close-port hash-port) (close-port hash-port)
(get-hash))))) (get-hash)))))
(deduplicate file hash #:store store)) (if (>= size %deduplication-minimum-size)
(deduplicate file (dump-and-compute-hash) #:store store)
(call-with-output-file file
(lambda (output)
(dump-port input output size)))))
(define* (copy-file/deduplicate source target (define* (copy-file/deduplicate source target
#:key (store (%store-directory))) #:key (store (%store-directory)))

@ -606,7 +606,9 @@ void LocalStore::removeUnusedLinks(const GCState & state)
throw SysError(format("statting `%1%'") % path); throw SysError(format("statting `%1%'") % path);
#endif #endif
if (st.st_nlink != 1) { /* Drop links for files smaller than 'deduplicationMinSize', even if
they have more than one hard link. */
if (st.st_nlink != 1 && st.st_size >= deduplicationMinSize) {
actualSize += st.st_size; actualSize += st.st_size;
unsharedSize += (st.st_nlink - 1) * st.st_size; unsharedSize += (st.st_nlink - 1) * st.st_size;
continue; continue;

@ -292,4 +292,7 @@ void canonicaliseTimestampAndPermissions(const Path & path);
MakeError(PathInUse, Error); MakeError(PathInUse, Error);
/* Size below which a file is not considered for deduplication. */
extern const size_t deduplicationMinSize;
} }

@ -15,6 +15,9 @@
namespace nix { namespace nix {
/* Any file smaller than this is not considered for deduplication.
Keep in sync with (guix store deduplication). */
const size_t deduplicationMinSize = 8192;
static void makeWritable(const Path & path) static void makeWritable(const Path & path)
{ {
@ -105,12 +108,12 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa
return; return;
} }
/* We can hard link regular files and maybe symlinks. */ /* We hard link regular files only (never symlinks), and do that only
if (!S_ISREG(st.st_mode) for files larger than some threshold. This avoids adding too many
#if CAN_LINK_SYMLINK entries to '.links', which would slow down 'removeUnusedLinks' while
&& !S_ISLNK(st.st_mode) saving little space. */
#endif if (!S_ISREG(st.st_mode) || ((size_t) st.st_size) < deduplicationMinSize)
) return; return;
/* Sometimes SNAFUs can cause files in the store to be /* Sometimes SNAFUs can cause files in the store to be
modified, in particular when running programs as root under modified, in particular when running programs as root under

@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2012-2021 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -170,11 +170,15 @@
#f)))) #f))))
(test-assert "identical files are deduplicated" (test-assert "identical files are deduplicated"
(let* ((build1 (add-text-to-store %store "one.sh" ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE.
"echo hello, world > \"$out\"\n" (let* ((data (make-string 9000 #\a))
(build1 (add-text-to-store %store "one.sh"
(string-append "echo -n " data
" > \"$out\"\n")
'())) '()))
(build2 (add-text-to-store %store "two.sh" (build2 (add-text-to-store %store "two.sh"
"# Hey!\necho hello, world > \"$out\"\n" (string-append "# Hey!\necho -n "
data " > \"$out\"\n")
'())) '()))
(drv1 (derivation %store "foo" (drv1 (derivation %store "foo"
%bash `(,build1) %bash `(,build1)
@ -187,7 +191,7 @@
(file2 (derivation->output-path drv2))) (file2 (derivation->output-path drv2)))
(and (valid-path? %store file1) (valid-path? %store file2) (and (valid-path? %store file1) (valid-path? %store file2)
(string=? (call-with-input-file file1 get-string-all) (string=? (call-with-input-file file1 get-string-all)
"hello, world\n") data)
(= (stat:ino (lstat file1)) (= (stat:ino (lstat file1))
(stat:ino (lstat file2)))))))) (stat:ino (lstat file2))))))))

@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2012-2021 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -486,8 +486,9 @@
;; their mtime and permissions were not reset. Ensure that this bug is ;; their mtime and permissions were not reset. Ensure that this bug is
;; gone. ;; gone.
(with-store store (with-store store
(let* ((text1 (random-text)) ;; Note: TEXT1 and TEXT2 must be longer than %DEDUPLICATION-MINIMUM-SIZE.
(text2 (random-text)) (let* ((text1 (string-concatenate (make-list 200 (random-text))))
(text2 (string-concatenate (make-list 200 (random-text))))
(tree `("tree" directory (tree `("tree" directory
("a" regular (data ,text1)) ("a" regular (data ,text1))
("b" directory ("b" directory

@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2018, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2018, 2020-2021 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -30,13 +30,40 @@
(test-begin "store-deduplication") (test-begin "store-deduplication")
;; Check that identical files smaller than %DEDUPLICATION-MINIMUM-SIZE are
;; left alone by 'deduplicate'.
(test-equal "deduplicate, below %deduplication-minimum-size"
;; Expected: the five files all have distinct inodes, and each one has a
;; link count of 1.
(list #t (make-list 5 1))
(call-with-temporary-directory
(lambda (store)
;; Note: DATA must be shorter than %DEDUPLICATION-MINIMUM-SIZE.
(let ((data "Hello, world!")
(identical (map (lambda (n)
(string-append store "/" (number->string n)
"/a/b/c"))
(iota 5))))
;; Create five files with the same contents.
(for-each (lambda (file)
(mkdir-p (dirname file))
(call-with-output-file file
(lambda (port)
(put-bytevector port (string->utf8 data)))))
identical)
(deduplicate store (nar-sha256 store) #:store store)
;; (system (string-append "ls -lRia " store))
(list (= (length (delete-duplicates
(map (compose stat:ino stat) identical)))
(length identical))
(map (compose stat:nlink stat) identical))))))
(test-equal "deduplicate" (test-equal "deduplicate"
(cons* #t #f ;inode comparisons (cons* #t #f ;inode comparisons
2 (make-list 5 6)) ;'nlink' values 2 (make-list 5 6)) ;'nlink' values
(call-with-temporary-directory (call-with-temporary-directory
(lambda (store) (lambda (store)
(let ((data (string->utf8 "Hello, world!")) ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE.
(let ((data (string-concatenate (make-list 1000 "Hello, world!")))
(identical (map (lambda (n) (identical (map (lambda (n)
(string-append store "/" (number->string n) (string-append store "/" (number->string n)
"/a/b/c")) "/a/b/c"))
@ -46,7 +73,7 @@
(mkdir-p (dirname file)) (mkdir-p (dirname file))
(call-with-output-file file (call-with-output-file file
(lambda (port) (lambda (port)
(put-bytevector port data)))) (put-bytevector port (string->utf8 data)))))
identical) identical)
;; Make the parent of IDENTICAL read-only. This should not prevent ;; Make the parent of IDENTICAL read-only. This should not prevent
;; deduplication from inserting its hard link. ;; deduplication from inserting its hard link.
@ -54,7 +81,7 @@
(call-with-output-file unique (call-with-output-file unique
(lambda (port) (lambda (port)
(put-bytevector port (string->utf8 "This is unique.")))) (put-bytevector port (string->utf8 (string-reverse data)))))
(deduplicate store (nar-sha256 store) #:store store) (deduplicate store (nar-sha256 store) #:store store)
@ -77,8 +104,10 @@
(lambda (store) (lambda (store)
(let ((true-link link) (let ((true-link link)
(links 0) (links 0)
(data1 (string->utf8 "Hello, world!")) (data1 (string->utf8
(data2 (string->utf8 "Hi, world!")) (string-concatenate (make-list 1000 "Hello, world!"))))
(data2 (string->utf8
(string-concatenate (make-list 1000 "Hi, world!"))))
(identical (map (lambda (n) (identical (map (lambda (n)
(string-append store "/" (number->string n) (string-append store "/" (number->string n)
"/a/b/c")) "/a/b/c"))

@ -759,7 +759,9 @@
(test-assert "substitute, deduplication" (test-assert "substitute, deduplication"
(with-store s (with-store s
(let* ((c (random-text)) ; contents of the output ;; Note: C must be longer than %DEDUPLICATION-MINIMUM-SIZE.
(let* ((c (string-concatenate
(make-list 200 (random-text)))) ; contents of the output
(g (package-derivation s %bootstrap-guile)) (g (package-derivation s %bootstrap-guile))
(d1 (build-expression->derivation s "substitute-me" (d1 (build-expression->derivation s "substitute-me"
`(begin ,c (exit 1)) `(begin ,c (exit 1))