grafts: Support rewriting UTF-16 and UTF-32 store references.

Partially fixes <https://bugs.gnu.org/33848>.

* guix/build/graft.scm (replace-store-references): Add support for
finding and rewriting UTF-16 and UTF-32 store references.
* tests/grafts.scm: Add tests.
This commit is contained in:
Mark H Weaver 2021-04-02 18:36:50 -04:00
parent abf032c131
commit 1bab9b9f17
No known key found for this signature in database
GPG key ID: 7CEF29847562C516
2 changed files with 272 additions and 90 deletions

View file

@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2016 Mark H Weaver <mhw@netris.org> ;;; Copyright © 2016, 2021 Mark H Weaver <mhw@netris.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -55,6 +55,52 @@ (define nix-base32-char?
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz") (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
<>)) <>))
(define (nix-base32-char-or-nul? c)
"Return true if C is a nix-base32 character or NUL, otherwise return false."
(or (nix-base32-char? c)
(char=? c #\nul)))
(define (possible-utf16-hash? buffer i w)
"Return true if (I - W) is large enough to hold a UTF-16 encoded
nix-base32 hash and if BUFFER contains NULs in all positions where NULs
are to be expected in a UTF-16 encoded hash+dash pattern whose dash is
found at position I. Otherwise, return false."
(and (<= (* 2 hash-length) (- i w))
(let loop ((j (+ 1 (- i (* 2 hash-length)))))
(or (>= j i)
(and (zero? (bytevector-u8-ref buffer j))
(loop (+ j 2)))))))
(define (possible-utf32-hash? buffer i w)
"Return true if (I - W) is large enough to hold a UTF-32 encoded
nix-base32 hash and if BUFFER contains NULs in all positions where NULs
are to be expected in a UTF-32 encoded hash+dash pattern whose dash is
found at position I. Otherwise, return false."
(and (<= (* 4 hash-length) (- i w))
(let loop ((j (+ 1 (- i (* 4 hash-length)))))
(or (>= j i)
(and (zero? (bytevector-u8-ref buffer j))
(zero? (bytevector-u8-ref buffer (+ j 1)))
(zero? (bytevector-u8-ref buffer (+ j 2)))
(loop (+ j 4)))))))
(define (insert-nuls char-size bv)
"Given a bytevector BV, return a bytevector containing the same bytes but
with (CHAR-SIZE - 1) NULs inserted between every two adjacent bytes from BV.
For example, (insert-nuls 4 #u8(1 2 3)) => #u8(1 0 0 0 2 0 0 0 3)."
(if (= char-size 1)
bv
(let* ((len (bytevector-length bv))
(bv* (make-bytevector (+ 1 (* char-size
(- len 1)))
0)))
(let loop ((i 0))
(when (< i len)
(bytevector-u8-set! bv* (* i char-size)
(bytevector-u8-ref bv i))
(loop (+ i 1))))
bv*)))
(define* (replace-store-references input output replacement-table (define* (replace-store-references input output replacement-table
#:optional (store (%store-directory))) #:optional (store (%store-directory)))
"Read data from INPUT, replacing store references according to "Read data from INPUT, replacing store references according to
@ -76,9 +122,9 @@ (define (optimize-u8-predicate pred)
(list->vector (map pred (iota 256))) (list->vector (map pred (iota 256)))
<>)) <>))
(define nix-base32-byte? (define nix-base32-byte-or-nul?
(optimize-u8-predicate (optimize-u8-predicate
(compose nix-base32-char? (compose nix-base32-char-or-nul?
integer->char))) integer->char)))
(define (dash? byte) (= byte 45)) (define (dash? byte) (= byte 45))
@ -86,100 +132,153 @@ (define (dash? byte) (= byte 45))
(define request-size (expt 2 20)) ; 1 MiB (define request-size (expt 2 20)) ; 1 MiB
;; We scan the file for the following 33-byte pattern: 32 bytes of ;; We scan the file for the following 33-byte pattern: 32 bytes of
;; nix-base32 characters followed by a dash. To accommodate large files, ;; nix-base32 characters followed by a dash. When we find such a pattern
;; we do not read the entire file, but instead work on buffers of up to ;; whose hash is in REPLACEMENT-TABLE, we perform the required rewrite and
;; 'request-size' bytes. To ensure that every 33-byte sequence appears ;; continue scanning.
;; entirely within exactly one buffer, adjacent buffers must overlap, ;;
;; i.e. they must share 32 byte positions. We accomplish this by ;; To support UTF-16 and UTF-32 store references, the 33 bytes comprising
;; "ungetting" the last 32 bytes of each buffer before reading the next ;; this hash+dash pattern may optionally be interspersed by extra NUL bytes.
;; buffer, unless we know that we've reached the end-of-file. ;; This simple approach works because the characters we are looking for are
;; restricted to ASCII. UTF-16 hashes are interspersed with single NUL
;; bytes ("\0"), and UTF-32 hashes are interspersed with triplets of NULs
;; ("\0\0\0"). Note that we require NULs to be present only *between* the
;; other bytes, and not at either end, in order to be insensitive to byte
;; order.
;;
;; To accommodate large files, we do not read the entire file at once, but
;; instead work on buffers of up to REQUEST-SIZE bytes. To ensure that
;; every hash+dash pattern appears in its entirety in at least one buffer,
;; adjacent buffers must overlap by one byte less than the maximum size of a
;; hash+dash pattern. We accomplish this by "ungetting" a suffix of each
;; buffer before reading the next buffer, unless we know that we've reached
;; the end-of-file.
(let ((buffer (make-bytevector request-size))) (let ((buffer (make-bytevector request-size)))
(let loop () (define-syntax-rule (byte-at i)
;; Note: We avoid 'get-bytevector-n' to work around (bytevector-u8-ref buffer i))
;; <http://bugs.gnu.org/17466>. (let outer-loop ()
(match (get-bytevector-n! input buffer 0 request-size) (match (get-bytevector-n! input buffer 0 request-size)
((? eof-object?) 'done) ((? eof-object?) 'done)
(end (end
;; We scan the buffer for dashes that might be preceded by a (define (scan-from i w)
;; nix-base32 hash. The key optimization here is that whenever we ;; Scan the buffer for dashes that might be preceded by nix hashes,
;; find a NON-nix-base32 character at position 'i', we know that it ;; where I is the minimum position where such a dash might be
;; cannot be part of a hash, so the earliest position where the next ;; found, and W is the number of bytes in the buffer that have been
;; hash could start is i+1 with the following dash at position i+33. ;; written so far. We assume that I - W >= HASH-LENGTH.
;; ;;
;; Since nix-base32 characters comprise only 1/8 of the 256 possible ;; The key optimization here is that whenever we find a byte at
;; byte values, and exclude some of the most common letters in ;; position I that cannot occur within a nix hash (because it's
;; English text (e t o u), in practice we can advance by 33 positions ;; neither a nix-base32 character nor NUL), we can infer that the
;; most of the time. ;; earliest position where the next hash could start is at I + 1,
(let scan-from ((i hash-length) (written 0)) ;; and therefore the earliest position for the following dash is
;; 'i' is the first position where we look for a dash. 'written' ;; (+ I 1 HASH-LENGTH), which is I + 33.
;; is the number of bytes in the buffer that have already been ;;
;; written. ;; Since nix-base32-or-nul characters comprise only about 1/8 of
;; the 256 possible byte values, and exclude some of the most
;; common letters in English text (e t o u), we can advance 33
;; positions much of the time.
(if (< i end) (if (< i end)
(let ((byte (bytevector-u8-ref buffer i))) (let ((byte (byte-at i)))
(cond ((and (dash? byte) (cond ((dash? byte)
;; We've found a dash. Note that we do not know (found-dash i w))
;; whether the preceeding 32 bytes are nix-base32 ((nix-base32-byte-or-nul? byte)
;; characters, but we do not need to know. If (scan-from (+ i 1) w))
;; they are not, the following lookup will fail.
(lookup-replacement
(string-tabulate (lambda (j)
(integer->char
(bytevector-u8-ref buffer
(+ j (- i hash-length)))))
hash-length)))
=> (lambda (replacement)
;; We've found a hash that needs to be replaced.
;; First, write out all bytes preceding the hash
;; that have not yet been written.
(put-bytevector output buffer written
(- i hash-length written))
;; Now write the replacement string.
(put-bytevector output replacement)
;; Since the byte at position 'i' is a dash,
;; which is not a nix-base32 char, the earliest
;; position where the next hash might start is
;; i+1, and the earliest position where the
;; following dash might start is (+ i 1
;; hash-length). Also, increase the write
;; position to account for REPLACEMENT.
(let ((len (bytevector-length replacement)))
(scan-from (+ i 1 len)
(+ i (- len hash-length))))))
;; If the byte at position 'i' is a nix-base32 char,
;; then the dash we're looking for might be as early as
;; the following byte, so we can only advance by 1.
((nix-base32-byte? byte)
(scan-from (+ i 1) written))
;; If the byte at position 'i' is NOT a nix-base32
;; char, then the earliest position where the next hash
;; might start is i+1, with the following dash at
;; position (+ i 1 hash-length).
(else (else
(scan-from (+ i 1 hash-length) written)))) (not-part-of-hash i w))))
(finish-buffer i w)))
;; We have finished scanning the buffer. Now we determine how (define (not-part-of-hash i w)
;; many bytes have not yet been written, and how many bytes to ;; Position I is known to not be within a nix hash that we must
;; "unget". If 'end' is less than 'request-size' then we read ;; rewrite. Therefore, the earliest position where the next hash
;; less than we asked for, which indicates that we are at EOF, ;; might start is I + 1, and therefore the earliest position of
;; so we needn't unget anything. Otherwise, we unget up to ;; the following dash is (+ I 1 HASH-LENGTH).
;; 'hash-length' bytes (32 bytes). However, we must be careful (scan-from (+ i 1 hash-length) w))
;; not to unget bytes that have already been written, because
;; that would cause them to be written again from the next (define (found-dash i w)
;; buffer. In practice, this case occurs when a replacement is ;; We know that there is a dash '-' at position I, and that
;; made near or beyond the end of the buffer. When REPLACEMENT ;; I - W >= HASH-LENGTH. The immediately preceding bytes *might*
;; went beyond END, we consume the extra bytes from INPUT. ;; contain a nix-base32 hash, but that is not yet known. Here,
(begin ;; we rule out all but one possible encoding (ASCII, UTF-16,
(if (> written end) ;; UTF-32) by counting how many NULs precede the dash.
(get-bytevector-n! input buffer 0 (- written end)) (cond ((not (zero? (byte-at (- i 1))))
(let* ((unwritten (- end written)) ;; The dash is *not* preceded by a NUL, therefore it
(unget-size (if (= end request-size) ;; cannot possibly be a UTF-16 or UTF-32 hash. Proceed
(min hash-length unwritten) ;; to check for an ASCII hash.
0)) (found-possible-hash 1 i w))
(write-size (- unwritten unget-size)))
(put-bytevector output buffer written write-size) ((not (zero? (byte-at (- i 2))))
(unget-bytevector input buffer (+ written write-size) ;; The dash is preceded by exactly one NUL, therefore it
unget-size))) ;; cannot be an ASCII or UTF-32 hash. Proceed to check
(loop))))))))) ;; for a UTF-16 hash.
(if (possible-utf16-hash? buffer i w)
(found-possible-hash 2 i w)
(not-part-of-hash i w)))
(else
;; The dash is preceded by at least two NULs, therefore
;; it cannot be an ASCII or UTF-16 hash. Proceed to
;; check for a UTF-32 hash.
(if (possible-utf32-hash? buffer i w)
(found-possible-hash 4 i w)
(not-part-of-hash i w)))))
(define (found-possible-hash char-size i w)
;; We know that there is a dash '-' at position I, that
;; I - W >= CHAR-SIZE * HASH-LENGTH, and that the only
;; possible encoding for the preceding hash is as indicated by
;; CHAR-SIZE. Here we check to see if the given hash is in
;; REPLACEMENT-TABLE, and if so, we perform the required
;; rewrite.
(let* ((hash (string-tabulate
(lambda (j)
(integer->char
(byte-at (- i (* char-size
(- hash-length j))))))
hash-length))
(replacement* (lookup-replacement hash))
(replacement (and replacement*
(insert-nuls char-size replacement*))))
(cond
((not replacement)
(not-part-of-hash i w))
(else
;; We've found a hash that needs to be replaced.
;; First, write out all bytes preceding the hash
;; that have not yet been written.
(put-bytevector output buffer w
(- i (* char-size hash-length) w))
;; Now write the replacement string.
(put-bytevector output replacement)
;; Now compute the new values of W and I and continue.
(let ((w (+ (- i (* char-size hash-length))
(bytevector-length replacement))))
(scan-from (+ w hash-length) w))))))
(define (finish-buffer i w)
;; We have finished scanning the buffer. Now we determine how many
;; bytes have not yet been written, and how many bytes to "unget".
;; If END is less than REQUEST-SIZE then we read less than we asked
;; for, which indicates that we are at EOF, so we needn't unget
;; anything. Otherwise, we unget up to (* 4 HASH-LENGTH) bytes.
;; However, we must be careful not to unget bytes that have already
;; been written, because that would cause them to be written again
;; from the next buffer. In practice, this case occurs when a
;; replacement is made near or beyond the end of the buffer. When
;; REPLACEMENT went beyond END, we consume the extra bytes from
;; INPUT.
(if (> w end)
(get-bytevector-n! input buffer 0 (- w end))
(let* ((unwritten (- end w))
(unget-size (if (= end request-size)
(min (* 4 hash-length)
unwritten)
0))
(write-size (- unwritten unget-size)))
(put-bytevector output buffer w write-size)
(unget-bytevector input buffer (+ w write-size)
unget-size)))
(outer-loop))
(scan-from hash-length 0))))))
(define (rename-matching-files directory mapping) (define (rename-matching-files directory mapping)
"Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is "Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is

View file

@ -1,5 +1,6 @@
;;; GNU Guix --- Functional package management for GNU ;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2021 Mark H Weaver <mhw@netris.org>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@ -468,4 +469,86 @@ (define buffer-size
replacement replacement
"/gnu/store"))))) "/gnu/store")))))
(define (insert-nuls char-size str)
(string-join (map string (string->list str))
(make-string (- char-size 1) #\nul)))
(define (nuls-to-underscores s)
(string-replace-substring s "\0" "_"))
(define (annotate-buffer-boundary s)
(string-append (string-take s buffer-size)
"|"
(string-drop s buffer-size)))
(define (abbreviate-leading-fill s)
(let ((s* (string-trim s #\=)))
(format #f "[~a =s]~a"
(- (string-length s)
(string-length s*))
s*)))
(define (prettify-for-display s)
(abbreviate-leading-fill
(annotate-buffer-boundary
(nuls-to-underscores s))))
(define (two-sample-refs-with-gap char-size1 char-size2 gap offset
char1 name1 char2 name2)
(string-append
(make-string (- buffer-size offset) #\=)
(insert-nuls char-size1
(string-append "/gnu/store/" (make-string 32 char1) name1))
gap
(insert-nuls char-size2
(string-append "/gnu/store/" (make-string 32 char2) name2))
(list->string (map integer->char (iota 77 33)))))
(define (sample-map-entry old-char new-char new-name)
(cons (make-string 32 old-char)
(string->utf8 (string-append (make-string 32 new-char)
new-name))))
(define (test-two-refs-with-gap char-size1 char-size2 gap offset)
(test-equal
(format #f "test-two-refs-with-gap, char-sizes ~a ~a, gap ~s, offset ~a"
char-size1 char-size2 gap offset)
(prettify-for-display
(two-sample-refs-with-gap char-size1 char-size2 gap offset
#\6 "-BlahBlaH"
#\8"-SoMeTHiNG"))
(prettify-for-display
(let* ((content (two-sample-refs-with-gap char-size1 char-size2 gap offset
#\5 "-blahblah"
#\7 "-something"))
(replacement (alist->vhash
(list (sample-map-entry #\5 #\6 "-BlahBlaH")
(sample-map-entry #\7 #\8 "-SoMeTHiNG")))))
(call-with-output-string
(lambda (output)
((@@ (guix build graft) replace-store-references)
(open-input-string content) output
replacement
"/gnu/store")))))))
(for-each (lambda (char-size1)
(for-each (lambda (char-size2)
(for-each (lambda (gap)
(for-each (lambda (offset)
(test-two-refs-with-gap char-size1
char-size2
gap
offset))
;; offsets to test
(map (lambda (i)
(+ i (* 40 char-size1)))
(iota 30))))
;; gaps
'("" "-" " " "a")))
;; char-size2 values to test
'(1 2)))
;; char-size1 values to test
'(1 2 4))
(test-end) (test-end)