deduplication: Detect holes and create sparse files.

This reduces disk usage of sparse files that are substituted such as
Guile object files (ELF files).  As of Guile 3.0.9, .go files are sparse
due to ELF sections being aligned on 64 KiB boundaries.

This reduces disk usage reported by “du -sh” by 9% for the ‘guix’
package, by 23% for ‘guile’, and by 35% for ‘guile-git’.

* guix/store/deduplication.scm (hole-size, find-holes): New procedures.
(tee)[seekable?]: New variable.
[read!]: Add case when SEEKABLE? is true.
* tests/store-deduplication.scm (cartesian-product): New procedure.
("copy-file/deduplicate, sparse files (holes: ~a/~a/~a)"): New test set.

Change-Id: Iad2ab7830dcb1220e2026f4a127a6c718afa8964
This commit is contained in:
Ludovic Courtès 2024-04-19 22:00:44 +02:00
parent 73b3f941d7
commit 5a7cb59648
No known key found for this signature in database
GPG key ID: 090B11993D9AEBB5
2 changed files with 134 additions and 3 deletions

View file

@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org> ;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
;;; Copyright © 2018-2022 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2018-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -28,6 +28,7 @@ (define-module (guix store deduplication)
#:use-module (srfi srfi-11) #:use-module (srfi srfi-11)
#:use-module (srfi srfi-34) #:use-module (srfi srfi-34)
#:use-module (srfi srfi-35) #:use-module (srfi srfi-35)
#:use-module (rnrs bytevectors)
#:use-module (rnrs io ports) #:use-module (rnrs io ports)
#:use-module (ice-9 match) #:use-module (ice-9 match)
#:use-module (guix serialization) #:use-module (guix serialization)
@ -206,6 +207,48 @@ (define links-directory
#f) #f)
(else (apply throw args))))))))))) (else (apply throw args)))))))))))
(define (hole-size bv start size)
"Return a lower bound of the number of leading zeros in the first SIZE bytes
of BV, starting at offset START."
(let ((end (+ start size)))
(let loop ((offset start))
(if (> offset (- end 4))
(- offset start)
(if (zero? (bytevector-u32-native-ref bv offset))
(loop (+ offset 4))
(- offset start))))))
(define (find-holes bv start size)
"Return the list of offset/size pairs representing \"holes\" (sequences of
zeros) in the SIZE bytes starting at START in BV."
(define granularity
;; Disk block size is traditionally 512 bytes; focus on larger holes to
;; reduce the computational effort.
1024)
(define (align offset)
(match (modulo offset granularity)
(0 offset)
(mod (+ offset (- granularity mod)))))
(define end
(+ start size))
(let loop ((offset start)
(size size)
(holes '()))
(if (>= offset end)
(reverse! holes)
(let ((hole (hole-size bv offset size)))
(if (and hole (>= hole granularity))
(let ((next (align (+ offset hole))))
(loop next
(- size (- next offset))
(cons (cons offset hole) holes)))
(loop (+ offset granularity)
(- size granularity)
holes))))))
(define (tee input len output) (define (tee input len output)
"Return a port that reads up to LEN bytes from INPUT and writes them to "Return a port that reads up to LEN bytes from INPUT and writes them to
OUTPUT as it goes." OUTPUT as it goes."
@ -217,6 +260,10 @@ (define (fail)
(&nar-error (port input) (&nar-error (port input)
(file (port-filename output)))))) (file (port-filename output))))))
(define seekable?
;; Whether OUTPUT can be a sparse file.
(file-port? output))
(define (read! bv start count) (define (read! bv start count)
;; Read at most LEN bytes in total. ;; Read at most LEN bytes in total.
(let ((count (min count (- len bytes-read)))) (let ((count (min count (- len bytes-read))))
@ -229,7 +276,35 @@ (define (read! bv start count)
;; Do not return zero since zero means EOF, so try again. ;; Do not return zero since zero means EOF, so try again.
(loop (get-bytevector-n! input bv start count))) (loop (get-bytevector-n! input bv start count)))
(else (else
(put-bytevector output bv start ret) (if seekable?
;; Render long-enough sequences of zeros as "holes".
(match (find-holes bv start ret)
(()
(put-bytevector output bv start ret))
(holes
(let loop ((offset start)
(size ret)
(holes holes))
(match holes
(()
(if (> size 0)
(put-bytevector output bv offset size)
(when (= len (+ bytes-read ret))
;; We created a hole in OUTPUT by seeking
;; forward but that hole only comes into
;; existence if we write something after it.
;; Make the hole one byte smaller and write a
;; final zero.
(seek output -1 SEEK_CUR)
(put-u8 output 0))))
(((hole-start . hole-size) . rest)
(let ((prefix-len (- hole-start offset)))
(put-bytevector output bv offset prefix-len)
(seek output hole-size SEEK_CUR)
(loop (+ hole-start hole-size)
(- size prefix-len hole-size)
rest)))))))
(put-bytevector output bv start ret))
(set! bytes-read (+ bytes-read ret)) (set! bytes-read (+ bytes-read ret))
ret))))) ret)))))

View file

@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2018, 2020-2022 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2018, 2020-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -24,10 +24,27 @@ (define-module (test-store-deduplication)
#:use-module (guix build utils) #:use-module (guix build utils)
#:use-module (rnrs bytevectors) #:use-module (rnrs bytevectors)
#:use-module (ice-9 binary-ports) #:use-module (ice-9 binary-ports)
#:use-module (ice-9 match)
#:use-module (srfi srfi-1) #:use-module (srfi srfi-1)
#:use-module (srfi srfi-26) #:use-module (srfi srfi-26)
#:use-module (srfi srfi-64)) #:use-module (srfi srfi-64))
(define (cartesian-product . lst)
"Return the Cartesian product of all the given lists."
(match lst
((head)
(map list head))
((head . rest)
(let ((others (apply cartesian-product rest)))
(append-map (lambda (init)
(map (lambda (lst)
(cons init lst))
others))
head)))
(()
'())))
(test-begin "store-deduplication") (test-begin "store-deduplication")
(test-equal "deduplicate, below %deduplication-minimum-size" (test-equal "deduplicate, below %deduplication-minimum-size"
@ -166,4 +183,43 @@ (define-module (test-store-deduplication)
(cut string-append store <>)) (cut string-append store <>))
'("/a" "/b" "/c")))))))) '("/a" "/b" "/c"))))))))
(for-each (match-lambda
((initial-gap middle-gap final-gap)
(test-assert
(format #f "copy-file/deduplicate, sparse files (holes: ~a/~a/~a)"
initial-gap middle-gap final-gap)
(call-with-temporary-directory
(lambda (store)
(let ((source (string-append store "/source")))
(call-with-output-file source
(lambda (port)
(seek port initial-gap SEEK_CUR)
(display "hi!" port)
(seek port middle-gap SEEK_CUR)
(display "bye." port)
(when (> final-gap 0)
(seek port (- final-gap 1) SEEK_CUR)
(put-u8 port 0))))
(for-each (lambda (target)
(copy-file/deduplicate source
(string-append store target)
#:store store))
'("/a" "/b" "/c"))
(system* "du" "-h" source)
(system* "du" "-h" "--apparent-size" source)
(system* "du" "-h" (string-append store "/a"))
(system* "du" "-h" "--apparent-size" (string-append store "/a"))
(and (directory-exists? (string-append store "/.links"))
(file=? source (string-append store "/a"))
(apply = (map (compose stat:ino stat
(cut string-append store <>))
'("/a" "/b" "/c")))
(let ((st (pk 'S (stat (string-append store "/a")))))
(<= (* 512 (stat:blocks st))
(stat:size st))))))))))
(cartesian-product '(0 3333 8192)
'(8192 9999 16384 22222)
'(0 8192)))
(test-end "store-deduplication") (test-end "store-deduplication")