utils: Allow text substitution even in the presence of NUL characters.

Fixes <https://issues.guix.gnu.org/30116>.

Before this change, the presence of a NUL character on a line meant that
the (glibc) regexp engine used by Guile would either 1. stop scanning the
string or 2. crash with the error "string contains #\\nul character",
depending on the locale used.

This change works around this limitation by first replacing the NUL character
by an unused Unicode code point, doing the substitution, then reverting the
replacement.

* guix/build/utils.scm (unused-private-use-code-point)
(replace-char): New procedures.
(substitute): Make use of the above procedures to work around the NUL
character regexp engine limitation.
* tests/build-utils.scm: Add tests.

Co-authored-by: Maxim Cournoyer <maxim.cournoyer@gmail.com>
Signed-off-by: Maxim Cournoyer <maxim.cournoyer@gmail.com>
This commit is contained in:
Mark H Weaver 2021-01-07 15:15:24 -05:00 committed by Maxim Cournoyer
parent 8027711ee8
commit 485ac28235
No known key found for this signature in database
GPG key ID: 1260E46482E63562
2 changed files with 69 additions and 12 deletions

View file

@ -2,7 +2,7 @@
;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2013 Andreas Enge <andreas@enge.fr>
;;; Copyright © 2013 Nikita Karetnikov <nikita@karetnikov.org>
;;; Copyright © 2015, 2018 Mark H Weaver <mhw@netris.org>
;;; Copyright © 2015, 2018, 2021 Mark H Weaver <mhw@netris.org>
;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2018, 2019 Ricardo Wurmus <rekado@elephly.net>
;;; Copyright © 2020 Efraim Flashner <efraim@flashner.co.il>
@ -784,6 +784,31 @@ (define (with-atomic-file-replacement file proc)
(lambda (key . args)
(false-if-exception (delete-file template))))))
(define (unused-private-use-code-point s)
"Find a code point within a Unicode Private Use Area that is not
present in S, and return the corresponding character object. If one
cannot be found, return false."
(define (scan lo hi)
(and (<= lo hi)
(let ((c (integer->char lo)))
(if (string-index s c)
(scan (+ lo 1) hi)
c))))
(or (scan #xE000 #xF8FF)
(scan #xF0000 #xFFFFD)
(scan #x100000 #x10FFFD)))
(define (replace-char c1 c2 s)
"Return a string which is equal to S except with all instances of C1
replaced by C2. If C1 and C2 are equal, return S."
(if (char=? c1 c2)
s
(string-map (lambda (c)
(if (char=? c c1)
c2
c))
s)))
(define (substitute file pattern+procs)
"PATTERN+PROCS is a list of regexp/two-argument-procedure pairs. For each
line of FILE, and for each PATTERN that it matches, call the corresponding
@ -802,16 +827,26 @@ (define (substitute file pattern+procs)
(let loop ((line (read-line in 'concat)))
(if (eof-object? line)
#t
(let ((line (fold (lambda (r+p line)
(match r+p
((regexp . proc)
(match (list-matches regexp line)
((and m+ (_ _ ...))
(proc line m+))
(_ line)))))
line
rx+proc)))
(display line out)
;; Work around the fact that Guile's regexp-exec does not handle
;; NUL characters (a limitation of the underlying GNU libc's
;; regexec) by temporarily replacing them by an unused private
;; Unicode code point.
;; TODO: Use SRFI-115 instead, once available in Guile.
(let* ((nul* (or (and (string-index line #\nul)
(unused-private-use-code-point line))
#\nul))
(line* (replace-char #\nul nul* line))
(line1* (fold (lambda (r+p line)
(match r+p
((regexp . proc)
(match (list-matches regexp line)
((and m+ (_ _ ...))
(proc line m+))
(_ line)))))
line*
rx+proc))
(line1 (replace-char nul* #\nul line1*)))
(display line1 out)
(loop (read-line in 'concat)))))))))

View file

@ -1,6 +1,7 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2012, 2015, 2016, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2019 Ricardo Wurmus <rekado@elephly.net>
;;; Copyright © 2021 Maxim Cournoyer <maxim.cournoyer@gmail.com>
;;;
;;; This file is part of GNU Guix.
;;;
@ -18,7 +19,7 @@
;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
(define-module (test-build-utils)
(define-module (test build-utils)
#:use-module (guix tests)
#:use-module (guix build utils)
#:use-module ((guix utils)
@ -241,4 +242,25 @@ (define-module (test-build-utils)
"/some/other/path")))
#f)))))
(test-equal "substitute*, text contains a NUL byte, UTF-8"
"c\0d"
(with-fluids ((%default-port-encoding "UTF-8")
(%default-port-conversion-strategy 'error))
;; The GNU libc is locale sensitive. Depending on the value of LANG, the
;; test could fail with "string contains #\\nul character: ~S" or "cannot
;; convert wide string to output locale".
(setlocale LC_ALL "en_US.UTF-8")
(call-with-temporary-output-file
(lambda (file port)
(format port "a\0b")
(flush-output-port port)
(substitute* file
(("a") "c")
(("b") "d"))
(with-input-from-file file
(lambda _
(get-string-all (current-input-port))))))))
(test-end)