grafts: Make grafting faster.

* guix/build/graft.scm (replace-store-references): Reimplement for
faster grafting.  Use binary I/O instead of textual I/O.  Replace
'mapping' argument (an alist) with 'replacement-table' (a vhash).
(rewrite-directory): Adapt to mapping argument change in
'replace-store-references'.  Remove 'with-fluids' that previously set
'%default-port-encoding' to #f, since we now use binary I/O.
(define-inline, hash-length): New macros.
(nix-base32-char?): New variable.
This commit is contained in:
Mark H Weaver 2016-03-09 01:23:53 -05:00
parent ba6d25f3b9
commit 5a1add373a
No known key found for this signature in database
GPG key ID: 7CEF29847562C516

View file

@ -1,5 +1,6 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -20,8 +21,12 @@ (define-module (guix build graft)
#:use-module (guix build utils) #:use-module (guix build utils)
#:use-module (rnrs bytevectors) #:use-module (rnrs bytevectors)
#:use-module (rnrs io ports) #:use-module (rnrs io ports)
#:use-module (ice-9 vlist)
#:use-module (ice-9 match) #:use-module (ice-9 match)
#:use-module (ice-9 threads) #:use-module (ice-9 threads)
#:use-module (ice-9 binary-ports)
#:use-module (srfi srfi-1) ; list library
#:use-module (srfi srfi-26) ; cut and cute
#:export (replace-store-references #:export (replace-store-references
rewrite-directory)) rewrite-directory))
@ -38,50 +43,134 @@ (define-module (guix build graft)
;;; ;;;
;;; Code: ;;; Code:
(define* (replace-store-references input output mapping (define-syntax-rule (define-inline name val)
(define-syntax name (identifier-syntax val)))
(define-inline hash-length 32)
(define nix-base32-char?
(cute char-set-contains?
;; ASCII digits and lower case letters except e o t u
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
<>))
(define* (replace-store-references input output replacement-table
#:optional (store (%store-directory))) #:optional (store (%store-directory)))
"Read data from INPUT, replacing store references according to MAPPING, and "Read data from INPUT, replacing store references according to
writing the result to OUTPUT." REPLACEMENT-TABLE, and writing the result to OUTPUT. REPLACEMENT-TABLE is a
(define pattern vhash that maps strings (original hashes) to bytevectors (replacement hashes).
(let ((nix-base32-chars Note: We use string keys to work around the fact that guile-2.0 hashes all
'(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 bytevectors to the same value."
#\a #\b #\c #\d #\f #\g #\h #\i #\j #\k #\l #\m #\n
#\p #\q #\r #\s #\v #\w #\x #\y #\z)))
`(,@(map char-set (string->list store))
,(char-set #\/)
,@(make-list 32 (list->char-set nix-base32-chars))
,(char-set #\-))))
;; We cannot use `regexp-exec' here because it cannot deal with strings (define (lookup-replacement s)
;; containing NUL characters, hence 'fold-port-matches'. (match (vhash-assoc s replacement-table)
(with-fluids ((%default-port-encoding #f)) ((origin . replacement)
(when (file-port? input) replacement)
(setvbuf input _IOFBF 65536)) (#f #f)))
(when (file-port? output)
(setvbuf output _IOFBF 65536))
(let* ((len (+ 34 (string-length store))) (define (optimize-u8-predicate pred)
(mapping (map (match-lambda (cute vector-ref
((origin . replacement) (list->vector (map pred (iota 256)))
(unless (string=? (string-drop origin len) <>))
(string-drop replacement len))
(error "invalid replacement" origin replacement)) (define nix-base32-byte?
(cons (string-take origin len) (optimize-u8-predicate
(string-take replacement len)))) (compose nix-base32-char?
mapping))) integer->char)))
(fold-port-matches (lambda (string result)
(match (assoc-ref mapping string) (define (dash? byte) (= byte 45))
(#f
(put-bytevector output (string->utf8 string))) (define request-size (expt 2 20)) ; 1 MiB
((= string->utf8 replacement)
(put-bytevector output replacement))) ;; We scan the file for the following 33-byte pattern: 32 bytes of
#t) ;; nix-base32 characters followed by a dash. To accommodate large files,
#f ;; we do not read the entire file, but instead work on buffers of up to
pattern ;; 'request-size' bytes. To ensure that every 33-byte sequence appears
input ;; entirely within exactly one buffer, adjacent buffers must overlap,
(lambda (char result) ;unmatched ;; i.e. they must share 32 byte positions. We accomplish this by
(put-u8 output (char->integer char)) ;; "ungetting" the last 32 bytes of each buffer before reading the next
result))))) ;; buffer, unless we know that we've reached the end-of-file.
(let ((buffer (make-bytevector request-size)))
(let loop ()
;; Note: We avoid 'get-bytevector-n' to work around
;; <http://bugs.gnu.org/17466>.
(match (get-bytevector-n! input buffer 0 request-size)
((? eof-object?) 'done)
(end
;; We scan the buffer for dashes that might be preceded by a
;; nix-base32 hash. The key optimization here is that whenever we
;; find a NON-nix-base32 character at position 'i', we know that it
;; cannot be part of a hash, so the earliest position where the next
;; hash could start is i+1 with the following dash at position i+33.
;;
;; Since nix-base32 characters comprise only 1/8 of the 256 possible
;; byte values, and exclude some of the most common letters in
;; English text (e t o u), in practice we can advance by 33 positions
;; most of the time.
(let scan-from ((i hash-length) (written 0))
;; 'i' is the first position where we look for a dash. 'written'
;; is the number of bytes in the buffer that have already been
;; written.
(if (< i end)
(let ((byte (bytevector-u8-ref buffer i)))
(cond ((and (dash? byte)
;; We've found a dash. Note that we do not know
;; whether the preceeding 32 bytes are nix-base32
;; characters, but we do not need to know. If
;; they are not, the following lookup will fail.
(lookup-replacement
(string-tabulate (lambda (j)
(integer->char
(bytevector-u8-ref buffer
(+ j (- i hash-length)))))
hash-length)))
=> (lambda (replacement)
;; We've found a hash that needs to be replaced.
;; First, write out all bytes preceding the hash
;; that have not yet been written.
(put-bytevector output buffer written
(- i hash-length written))
;; Now write the replacement hash.
(put-bytevector output replacement)
;; Since the byte at position 'i' is a dash,
;; which is not a nix-base32 char, the earliest
;; position where the next hash might start is
;; i+1, and the earliest position where the
;; following dash might start is (+ i 1
;; hash-length). Also, we have now written up to
;; position 'i' in the buffer.
(scan-from (+ i 1 hash-length) i)))
;; If the byte at position 'i' is a nix-base32 char,
;; then the dash we're looking for might be as early as
;; the following byte, so we can only advance by 1.
((nix-base32-byte? byte)
(scan-from (+ i 1) written))
;; If the byte at position 'i' is NOT a nix-base32
;; char, then the earliest position where the next hash
;; might start is i+1, with the following dash at
;; position (+ i 1 hash-length).
(else
(scan-from (+ i 1 hash-length) written))))
;; We have finished scanning the buffer. Now we determine how
;; many bytes have not yet been written, and how many bytes to
;; "unget". If 'end' is less than 'request-size' then we read
;; less than we asked for, which indicates that we are at EOF,
;; so we needn't unget anything. Otherwise, we unget up to
;; 'hash-length' bytes (32 bytes). However, we must be careful
;; not to unget bytes that have already been written, because
;; that would cause them to be written again from the next
;; buffer. In practice, this case occurs when a replacement is
;; made near the end of the buffer.
(let* ((unwritten (- end written))
(unget-size (if (= end request-size)
(min hash-length unwritten)
0))
(write-size (- unwritten unget-size)))
(put-bytevector output buffer written write-size)
(unget-bytevector input buffer (+ written write-size)
unget-size)
(loop)))))))))
(define (rename-matching-files directory mapping) (define (rename-matching-files directory mapping)
"Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is "Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
@ -122,6 +211,35 @@ (define* (rewrite-directory directory output mapping
#:optional (store (%store-directory))) #:optional (store (%store-directory)))
"Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of "Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of
file name pairs." file name pairs."
(define hash-mapping
(let* ((prefix (string-append store "/"))
(start (string-length prefix))
(end (+ start hash-length)))
(define (valid-hash? h)
(every nix-base32-char? (string->list h)))
(define (valid-suffix? s)
(string-prefix? "-" s))
(define (hash+suffix s)
(and (< end (string-length s))
(let ((hash (substring s start end))
(suffix (substring s end)))
(and (string-prefix? prefix s)
(valid-hash? hash)
(valid-suffix? suffix)
(list hash suffix)))))
(map (match-lambda
(((= hash+suffix (origin-hash suffix))
.
(= hash+suffix (replacement-hash suffix)))
(cons origin-hash (string->utf8 replacement-hash)))
((origin . replacement)
(error "invalid replacement" origin replacement)))
mapping)))
(define replacement-table
(alist->vhash hash-mapping))
(define prefix-len (define prefix-len
(string-length directory)) (string-length directory))
@ -138,18 +256,17 @@ (define (rewrite-leaf file)
(symlink (call-with-output-string (symlink (call-with-output-string
(lambda (output) (lambda (output)
(replace-store-references (open-input-string target) (replace-store-references (open-input-string target)
output mapping output replacement-table
store))) store)))
dest))) dest)))
((regular) ((regular)
(with-fluids ((%default-port-encoding #f)) (call-with-input-file file
(call-with-input-file file (lambda (input)
(lambda (input) (call-with-output-file dest
(call-with-output-file dest (lambda (output)
(lambda (output) (replace-store-references input output replacement-table
(replace-store-references input output mapping store)
store) (chmod output (stat:perms stat)))))))
(chmod output (stat:perms stat))))))))
((directory) ((directory)
(mkdir-p dest)) (mkdir-p dest))
(else (else